Example No. 1
from pyspark.sql import Column
import pyspark.sql.functions as F

def fix_country(col: Column) -> Column:
    """
    Use built-in Spark functions to clean up the "Country" column,
    e.g. "   cAnAdA " -> "Canada".
    Don't forget about those annoying leading/trailing spaces.
    """
    fixed_country = F.initcap(F.trim(col))
    return fixed_country
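A quick usage sketch (the sample data and session setup below are assumed, not part of the original snippet):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
demo_df = spark.createDataFrame([("   cAnAdA ",), ("  uNiTeD sTaTeS  ",)], ["Country"])
demo_df.withColumn("Country", fix_country(F.col("Country"))).show()
# "   cAnAdA "        -> "Canada"
# "  uNiTeD sTaTeS  " -> "United States"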
Example No. 2
def test_initcap():
    # Because the CPU and GPU may not use the same Unicode version, we need to limit
    # the character set to something more reasonable.
    # upper and lower should cover the corner cases; this is mostly to
    # see if there are issues with spaces.
    gen = mk_str_gen('([aAbB]{0,5}[ \r\n\t]{1,2}){1,5}')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, gen).select(f.initcap(f.col('a'))))
Example No. 3
def load_police_districts(spark, path, output_path):
    """
        Get police districts from CSV and write to parquet files.
    """

    print("-- police districts --")
    df_sfpd_districts = spark.read.csv(path + "sfpd_districts.csv", header = 'true') \
        .withColumn("district_id", monotonically_increasing_id()) \
        .select("district_id", col("COMPANY").alias("company"), initcap(col("DISTRICT")).alias("district"))
    df_sfpd_districts.show()
    df_sfpd_districts.write.mode("overwrite").orc(output_path +
                                                  "police_districts")
Example No. 4
def get_cleaned_message(self, flat_message_df):
    cleaned_message_df = flat_message_df.select(
        F.col("id"),
        F.when(F.col("first_name").isNull(), 'NULL').otherwise(F.col("first_name")).alias("first_name"),
        F.col("last_name"),
        F.when(F.col("email").isNull(), 'NULL').otherwise(F.col("email")).alias("email"),
        F.col("gender"),
        F.col("ip_address"),
        F.date_format(F.col("date"), 'dd/MM/yyyy').alias("date"),
        F.initcap(F.col("country")).alias("country"),
        F.col("timestamp")
    ).withColumn(
        "ip_addr",
        F.when(F.col("ip_address").rlike(IP_REGEX), F.col("ip_address")).otherwise(DEFAULT_IP)
    ).drop(F.col("ip_address"))
    return cleaned_message_df
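IP_REGEX and DEFAULT_IP are module-level constants that the snippet does not show; plausible values, purely as an assumption, could be:

# Assumed values for the constants referenced above (not from the original source)
IP_REGEX = r"^(\d{1,3}\.){3}\d{1,3}$"  # naive IPv4 pattern
DEFAULT_IP = "0.0.0.0"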
Example No. 5
def run():
    # Build session
    sparkql = SparkSession.builder.master('local[1]').getOrCreate()

    # Load config information
    people_path = config['defaults']['ch3']['ep1']['passenger_input'].get(str)
    save_path = config['defaults']['ch3']['ep1']['passenger_output'].get(str)
    bq_table = config['defaults']['ch3']['ep1']['passenger_table'].get(str)
    logger.info(f"Loading passenger info from {people_path}")

    # read csv file into spark dataframe
    passengers_df = sparkql.read.csv(people_path, header=True)
    logger.info(f"There are {passengers_df.count()} rows")

    # Load the passenger data and make sure the names have initial capitalization
    logger.info("Cleaning names and creating full name")
    passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\
                                 .withColumn('middle_name', initcap(col('middle_name')))\
                                 .withColumn('last_name', initcap(col('last_name')))

    # Create full_name column
    passengers_df = passengers_df.withColumn(
        'full_name',
        concat_ws(" ", col('first_name'), col('middle_name'),
                  col('last_name')))
    logger.info("Creating sha2 uid from email")
    # Create a sha2 uid based on the email
    passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

    logger.info(f"Saving file to {save_path}")
    # Save dataframe as a parquet file
    passengers_df.write.parquet(save_path)

    logger.info("Uploading file to BigQuery")
    # Upload the file as an external table in BigQuery
    gbq_load(bq_table, save_path)
Example No. 6
    def load_passengers(self, passenger_filename, passenger_output):
        """
        Function to load the passenger data from csv in GCS, clean, add UID,
        and upload to BigQuery
        :param passenger_filename: str input file name
        :param passenger_output: str of project.dataset.table to save passenger data
        """
        self.passenger_filename = passenger_filename
        self.passenger_output = passenger_output
        people_path = 'gs://{}/{}'.format(self.bucket, passenger_filename)

        logger.info(f"Loading passenger info from {self.bucket}.{passenger_filename}")
        passengers_df = self.sparkql.read.csv(people_path, header=True)

        # Use withColumn and initcap to standardize the names
        passengers_df = passengers_df.withColumn('first_name',
                                                 initcap(col('first_name')))\
                                     .withColumn('middle_name',
                                                 initcap(col('middle_name')))\
                                     .withColumn('last_name',
                                                 initcap(col('last_name')))

        # Create full_name column
        passengers_df = passengers_df.withColumn('full_name',
                                                 concat_ws(" ",
                                                           col('first_name'),
                                                           col('middle_name'),
                                                           col('last_name')))
        passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

        # Write to BigQuery
        logger.info(f"Writing file to {passenger_output}")
        passengers_df.write.format('bigquery') \
          .option('table', passenger_output) \
          .save()
        self.passengers_df = passengers_df
Example No. 7
from pyspark.sql.functions import desc
df.orderBy(desc("salary")).show(5)
df.sort(desc("salary")).show(5)

"""操作Number"""
# 1. 取次方 透過pwd
from pyspark.sql.functions import expr, pow
expected_salary = df.select(expr("name"), pow(col("salary"),2)-10000)
# 2. 四捨五入 round()
df.describe().show()

"""操作String"""
# 1. 將首字母轉為大寫
from pyspark.sql.functions import initcap, lower,upper

df.select(initcap(col("name"))).show()
df.select(lower(col("name"))).show()
df.select(upper(col("name"))).show()

# 2. Removing whitespace:
from pyspark.sql.functions import lpad, ltrim, rtrim, rpad, trim

# 3. Fuzzy matching with like:
df.filter(col('name').like('%nd%')).show()

# 4. Deduplication:
df.select('name').dropDuplicates().show()

# 5. Splitting strings:
from pyspark.sql.functions import split
df.withColumn("splited_name", split(df["name"], ",")[1]).show()
Example No. 8
def test_initcap_space():
    # space delimiters are much more common in practice, so exercise them specifically
    gen = StringGen('([aAbB]{0,5}[ ]{1,2}){1,5}')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, gen).select(f.initcap(f.col('a'))))
Example No. 9
        Person("Sidhartha", "Ray", 32, None, "Programmer"),
        Person("Pratik", "Solanki", 22, 176.7, None),
        Person("Ashok ", "Pradhan", 62, None, None),
        Person(" ashok", "Pradhan", 42, 125.3, "Chemical Engineer"),
        Person("Pratik", "Solanki", 22, 222.2, "Teacher")
    ])

    people_df.show()
    people_df.groupBy("firstName").agg(first("weightInLbs")).show()
    people_df.groupBy(trim(lower(col('firstName')))).agg(first("weightInLbs")).show()
    people_df.groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show()
    people_df.sort(col("weightInLbs").desc()).groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show()
    people_df.sort(col("weightInLbs").asc_nulls_last()).groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show()

    corrected_people_df = people_df\
        .withColumn("firstName", initcap("firstName"))\
        .withColumn("firstName", ltrim(initcap("firstName")))\
        .withColumn("firstName", trim(initcap("firstName")))\

    corrected_people_df.groupBy("firstName").agg(first("weightInLbs")).show()

    corrected_people_df = corrected_people_df\
        .withColumn("fullName", format_string("%s %s", "firstName", "lastName"))\

    corrected_people_df.show()

    corrected_people_df = corrected_people_df\
        .withColumn("weightInLbs", coalesce("weightInLbs", lit(0)))\

    corrected_people_df.show()
Example No. 10
df.stat.approxQuantile("UnitPrice", quantileProbs, relError)
df.stat.crosstab("StockCode", "Quantity").show()
df.stat.freqItems(["StockCode", "Quantity"]).show()

##creating a unique sequence id

from pyspark.sql.functions import monotonically_increasing_id

df.select(monotonically_increasing_id()).show(10)

#working with strings

#perform case conversions

from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad
df.select(initcap(col("Description"))).show(5)
df.select(col("Description"), initcap(col("Description")),
          lower(col("Description")), upper(col("Description"))).show(5)

string_with_space = "     hello     "

df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

#regular expressions

#working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
Example No. 11
# checking the structure of the WorldCupMatches dataframe
df_wcm.printSchema()
print('The df_wcm dataframe has ' + str(df_wcm.count()) + ' records.')
print('The df_wcm dataframe has ' + str(df_wcm.distinct().count()) +
      ' distinct records.')

# COMMAND ----------

from pyspark.sql.functions import desc, asc, col, column, expr, instr, length, substring, regexp_replace, trim, lit, initcap, sum, concat
#
# Transformations on the WorldCupPlayers dataframe
# The Event column holds values like "G43' G87'", meaning the player scored two goals, so we need to count the number of "G" occurrences
# In the same way we can get the number of penalties and cards
df_wcp1=df_wcp.withColumn('POSICION_JUGADOR',expr("case when position='C' THEN  'Captain' WHEN position='GK' THEN 'Goalkeeper' ELSE 'Other' end "))\
             .withColumn('NOMBRE_JUGADOR', initcap(regexp_replace('Player Name','�','u')))\
             .withColumn('NUMERO_GOLES',length('Event')-length(trim(regexp_replace('Event','G',''))))\
             .withColumn('NUMERO_PENALES',length('Event')-length(trim(regexp_replace('Event','P',''))))\
             .withColumn('NUMERO_PENALES_FALLADOS',length('Event')-length(trim(regexp_replace('Event','MP',''))))\
             .withColumn('NUMERO_TARJETAS_ROJAS',length('Event')-length(trim(regexp_replace('Event','R',''))))
# Replace null values with 0
df_wcp1=df_wcp1.withColumn('NUMERO_GOLES',expr("case when NUMERO_GOLES is null then 0 else NUMERO_GOLES end "))\
              .withColumn('NUMERO_PENALES',expr("case when NUMERO_PENALES is null then 0 else NUMERO_PENALES end "))\
              .withColumn('NUMERO_PENALES_FALLADOS',expr("case when NUMERO_PENALES_FALLADOS is null then 0 else NUMERO_PENALES_FALLADOS end "))\
              .withColumn('NUMERO_TARJETAS_ROJAS',expr("case when NUMERO_TARJETAS_ROJAS is null then 0 else NUMERO_TARJETAS_ROJAS end "))\
              .withColumnRenamed('Team Initials','INICIALES_PAIS')\
              .drop('Player Name','Position','Shirt Number')

# Aggregate by player name, position and country initials
df_wcp_rep=df_wcp1.select('NOMBRE_JUGADOR','POSICION_JUGADOR','INICIALES_PAIS','NUMERO_GOLES','NUMERO_PENALES','NUMERO_PENALES_FALLADOS','NUMERO_TARJETAS_ROJAS')\
.groupby('NOMBRE_JUGADOR','POSICION_JUGADOR','INICIALES_PAIS')\
Example No. 12
    df = spark.read.csv(path="transactions.csv",
                        header=True,
                        inferSchema=False,
                        sep=";")

    df1 = lower_header(data_frame=df)

    df2 = df1.select(
        df1.id,
        df1.created_at,
        df1.merchant_id,
        df1.valor,
        df1.n_parcelas,
        functions.round(df1.valor / df1.n_parcelas, 2),
        functions.initcap(df1.nome_no_cartao),
        functions.when(
            functions.lower(df1.status) == "refunded", "refused"
        ).when(
            functions.lower(df1.status) == "in process", "processing"
        ).otherwise(functions.lower(df1.status)),
        df1.card_id,
        df1.iso_id,
        df1.card_brand,
        functions.regexp_replace(functions.lower(df1.documento), pattern=r"(\D+)", replacement="")
    )

    df3 = casting_fields(data_frame=df2, json_schema=get_json_schema_mapping("types_mapping.json"))

    df3.write.csv(path="output/sanitize_transactions", header=True, sep=";")
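The helpers lower_header, casting_fields and get_json_schema_mapping are defined elsewhere in that project. A minimal sketch of lower_header, assuming all it does is lower-case the column names:

# Hypothetical sketch, not the project's actual implementation
from pyspark.sql import DataFrame

def lower_header(data_frame: DataFrame) -> DataFrame:
    # Rename every column to its lower-case form
    return data_frame.toDF(*[c.lower() for c in data_frame.columns])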
Example No. 13
        .format("jdbc") \
        .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \
        .option("dbtable", "airbnb") \
        .option("user", "test") \
        .option("password", "test") \
        .option("driver", "org.postgresql.Driver") \
        .load()

df = df.withColumn('bedrooms', F.round(df['bedrooms'], 0))
df = df.filter(~F.col('city').contains("/"))
df = df.filter(~F.col('city').contains(","))
df = df.filter(~F.col('city').contains("-"))
df = df.filter(~F.col('city').contains("^[0-9]*$"))
df = df.filter(~F.col('city').contains("*"))
df = df.withColumn('city', F.ltrim(df.city))
df = df.withColumn("city", F.initcap(F.col("city")))
df = df.filter(~df.city.rlike("[ ,;{}()\n\t=]"))
df = df.filter(~df.city.rlike("[^0-9A-Za-z]"))
df = df.filter(~F.col('city').contains("("))
df = df.groupBy('city', 'bedrooms').agg(
    F.avg('average').alias('average'), F.first('state'))
df = df.withColumnRenamed('first(state)', 'state')
df = df.sort('city')
df1 = df.withColumn('average', F.round(df['average'], 0))

df1.write \
        .format("jdbc") \
        .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \
        .option("dbtable", "bnbclean") \
        .option("user", "test") \
        .option("password", "test") \
Example No. 14
def init_cap_and_trim_all(column_name: str) -> Column:
    return initcap(trim_all(column_name))
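trim_all is a helper defined elsewhere; a minimal sketch, assuming it strips leading/trailing whitespace and collapses internal runs of whitespace to a single space:

# Hypothetical sketch of trim_all, not the original implementation
from pyspark.sql import Column
from pyspark.sql.functions import col, initcap, regexp_replace, trim

def trim_all(column_name: str) -> Column:
    return trim(regexp_replace(col(column_name), r"\s+", " "))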
Example No. 15
from pyspark.sql import SparkSession
from pyspark.sql.functions import initcap, concat_ws, col, sha2


# Build session
sparkql = SparkSession.builder.master('yarn').getOrCreate()

# Load passenger data
bucket = '<your bucket>'  # replace with your bucket name
sparkql.conf.set('temporaryGcsBucket', bucket)  # this gives our job a temporary bucket to use when writing

bucket_path = 'gs://{}/'.format(bucket)
people_path = bucket_path + 'passengers_1k.csv'
passengers_df = sparkql.read.csv(people_path, header=True)

# Use withColumn and initcap to standardize the names
passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\
                             .withColumn('middle_name', initcap(col('middle_name')))\
                             .withColumn('last_name', initcap(col('last_name')))

# Create full_name column
passengers_df = passengers_df.withColumn('full_name',
                                         concat_ws(" ",
                                                   col('first_name'),
                                                   col('middle_name'),
                                                   col('last_name')))
passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

bq_dataset = '<your dataset>'  # replace with your dataset name
bq_table = 'passengers'
passengers_df.write.format('bigquery') \
  .option('table', '{}.{}'.format(bq_dataset, bq_table)) \
  .save()
Example No. 16
def test_initcap_special_chars():
    gen = mk_str_gen('ʼn([aAbB13ȺéŸ]{0,5}){1,5}')
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark: unary_op_df(spark, gen).select(
                f.initcap(f.col('a'))))
Example No. 17
df.selectExpr('round(2.5)', 'bround(2.5)').show(2)


# Unique ID
# We can add a unique ID to each row by using the function monotonically_increasing_id
# The function generates a unique value for each row, starting with 0

from pyspark.sql.functions import monotonically_increasing_id

df.select(monotonically_increasing_id()).show(2)


# Working with String

from pyspark.sql.functions import initcap, lower, upper
df.select(initcap(col('Description'))).show(2)
# select initcap(Description) from dfTable

df.select(col('Description'),
        lower(col('Description')),
        upper(col('Description'))).show(2)

df.selectExpr(
        'Description',
        'lower(Description)',
        'upper(lower(Description))').show(2)

# select description, lower(Description), upper(lower(Description)) from dfTable


from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim
Example No. 18
    def process_biomarkers(
        self,
        biomarkers_df: DataFrame,
        source_df: DataFrame,
        disease_df: DataFrame,
        drugs_df: DataFrame
    ) -> DataFrame:
        """The diverse steps to prepare and enrich the input table"""

        biomarkers_enriched = (
            biomarkers_df
            .select(
                'Biomarker', 'IndividualMutation',
                array_distinct(split(col('Alteration'), ';')).alias('alterations'),
                array_distinct(split(col('Gene'), ';')).alias('gene'),
                split(col('AlterationType'), ';').alias('alteration_types'),
                array_distinct(split(col("PrimaryTumorTypeFullName"), ";")).alias('tumor_type_full_name'),
                array_distinct(split(col('Drug'), ';|,')).alias('drug'),
                'DrugFullName', 'Association', 'gDNA',
                array_distinct(split(col('EvidenceLevel'), ',')).alias('confidence'),
                array_distinct(split(col('Source'), ';')).alias('source')
            )
            .withColumn('confidence', explode(col('confidence')))
            .withColumn('tumor_type_full_name', explode(col('tumor_type_full_name')))
            .withColumn('tumor_type', translate(col('tumor_type_full_name'), ' -', ''))
            .withColumn('drug', explode(col('drug')))
            .withColumn('drug', translate(col('drug'), '[]', ''))
            .withColumn('gene', explode(col('gene')))
            .replace(to_replace=GENENAMESOVERRIDE, subset=['gene'])
            .withColumn('gene', upper(col('gene')))
            # At this stage alterations and alteration_types are both arrays
            # Disambiguation when the biomarker consists of multiple alterations is needed
            # This is solved by:
            # 1. Zipping both fields - tmp consists of a list of alteration/type tuples
            # 2. tmp is exploded - tmp consists of the alteration/type tuple
            # 3. alteration & alteration_type columns are overwritten with the elements in the tuple
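            #    (a hypothetical sketch of such a zip UDF appears right after this function)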
            .withColumn(
                'tmp',
                self.zip_alterations_with_type_udf(col('alterations'), col('alteration_types')))
            .withColumn('tmp', explode(col('tmp')))
            .withColumn('alteration_type', element_at(col('tmp'), 2))
            .withColumn(
                'alteration',
                when(
                    ~col('IndividualMutation').isNull(),
                    col('IndividualMutation')
                )
                .otherwise(element_at(col('tmp'), 1))
            )
            .drop('tmp')
            # Clean special cases on the alteration string
            .withColumn(
                'alteration',
                when(
                    col('alteration') == 'NRAS:.12.,.13.,.59.,.61.,.117.,.146.',
                    col('Biomarker')  # 'NRAS (12,13,59,61,117,146)'
                )
                .when(
                    # Cleans strings like 'ARAF:.'
                    col('alteration').contains(':.'),
                    translate(col('alteration'), ':.', '')
                )
                .when(
                    # Fusion genes are described with '__'
                    # biomarker is a cleaner representation when there's one alteration
                    (col('alteration').contains('__')) & (~col('Biomarker').contains('+')),
                    col('Biomarker')
                )
                .otherwise(col('alteration'))
            )
            # Split source into literature and urls
            # literature contains PMIDs
            # urls are enriched from the source table if not a CT
            .withColumn('source', explode(col('source')))
            .withColumn('source', trim(regexp_extract(col('source'), r'(PMID:\d+)|([\w ]+)', 0).alias('source')))
            .join(source_df, on='source', how='left')
            .withColumn(
                'literature',
                when(col('source').startswith('PMID'), regexp_extract(col('source'), r'(PMID:)(\d+)', 2))
            )
            .withColumn(
                'urls',
                when(
                    col('source').startswith('NCT'),
                    struct(
                        lit('Clinical Trials').alias('niceName'),
                        concat(lit('https://clinicaltrials.gov/ct2/show/'), col('source')).alias('url')
                    )
                )
                .when(
                    (~col('source').startswith('PMID')) | (~col('source').startswith('NCIT')),
                    struct(col('niceName'), col('url'))
                )
            )
            # The previous conditional clause creates a struct regardless of
            # whether any condition is met. The empty struct is replaced with null
            .withColumn('urls', when(~col('urls.niceName').isNull(), col('urls')))
            # Enrich data
            .withColumn('functionalConsequenceId', col('alteration_type'))
            .replace(to_replace=ALTERATIONTYPE2FUNCTIONCSQ, subset=['functionalConsequenceId'])
            .replace(to_replace=DRUGRESPONSE2EFO, subset=['Association'])
            .join(disease_df, on='tumor_type', how='left')
            .withColumn('drug', upper(col('drug')))
            .withColumn(
                # drug class is coalesced when the precise name of the medicine is not provided
                'drug',
                when(col('drug') == '', col('DrugFullName')).otherwise(col('drug')))
            .join(drugs_df, on='drug', how='left')
            .withColumn('drug', initcap(col('drug')))
            # Translate variantId
            .withColumn(
                'variantId',
                when(~col('gDNA').isNull(), self.get_variantId_udf(col('gDNA')))
            )
            # Assign a GO ID when a gene expression data is reported
            .withColumn(
                'geneExpressionId',
                when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('over')),
                    'GO_0010628'
                )
                .when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('under')),
                    'GO_0010629'
                )
                .when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('norm')),
                    'GO_0010467'
                )
            )
            # Create variant struct
            .withColumn(
                'variant',
                when(
                    col('alteration_type') != 'EXPR',
                    struct(
                        col('alteration').alias('name'),
                        col('variantId').alias('id'),
                        col('functionalConsequenceId')
                    )
                )
            )
            # Create geneExpression struct
            .withColumn(
                'geneExpression',
                when(
                    col('alteration_type') == 'EXPR',
                    struct(
                        col('alteration').alias('name'),
                        col('geneExpressionId').alias('id'))
                )
            )
        )

        pre_evidence = (
            biomarkers_enriched
            .withColumn('datasourceId', lit('cancer_biomarkers'))
            .withColumn('datatypeId', lit('affected_pathway'))
            .withColumnRenamed('tumor_type_full_name', 'diseaseFromSource')
            .withColumnRenamed('drug', 'drugFromSource')
            # diseaseFromSourceMappedId, drugId populated above
            .withColumnRenamed('Association', 'drugResponse')
            # confidence, literature and urls populated above
            .withColumnRenamed('gene', 'targetFromSourceId')
            .withColumnRenamed('Biomarker', 'biomarkerName')
            # variant, geneExpression populated above
            .drop(
                'tumor_type', 'source', 'alteration', 'alteration_type', 'IndividualMutation', 'geneExpressionId',
                'gDNA', 'functionalConsequenceId', 'variantId', 'DrugFullName', 'niceName', 'url')
        )

        # Group evidence
        self.evidence = (
            pre_evidence
            .groupBy('datasourceId', 'datatypeId', 'drugFromSource', 'drugId',
                     'drugResponse', 'targetFromSourceId', 'diseaseFromSource',
                     'diseaseFromSourceMappedId', 'confidence', 'biomarkerName')
            .agg(
                collect_set('literature').alias('literature'),
                collect_set('urls').alias('urls'),
                collect_set('variant').alias('variant'),
                collect_set('geneExpression').alias('geneExpression'),
            )
            # Replace empty lists with null values
            .withColumn('literature', when(size(col('literature')) == 0, lit(None)).otherwise(col('literature')))
            .withColumn('urls', when(size(col('urls')) == 0, lit(None)).otherwise(col('urls')))
            .withColumn('variant', when(size(col('variant')) == 0, lit(None)).otherwise(col('variant')))
            .withColumn(
                'geneExpression',
                when(size(col('geneExpression')) == 0, lit(None))
                .otherwise(col('geneExpression')))
            # Collect variant info into biomarkers struct
            .withColumn(
                'biomarkers',
                struct(
                    'variant',
                    'geneExpression'
                ))
            .drop('variant', 'geneExpression')
            .distinct()
        )

        return self.evidence
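The chain above calls self.zip_alterations_with_type_udf, which is defined elsewhere in the module. A minimal sketch of what such a zip UDF could look like, assuming it pairs each alteration with its type and reuses the last type when there are more alterations than types:

# Hypothetical sketch, not the original implementation
from itertools import zip_longest

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

@udf(ArrayType(ArrayType(StringType())))
def zip_alterations_with_type(alterations, alteration_types):
    # Pair each alteration with an alteration type; pad with the last type if the lists differ in length
    fill = alteration_types[-1] if alteration_types else None
    return [list(pair) for pair in zip_longest(alterations, alteration_types, fillvalue=fill)]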
Example No. 19
# Databricks notebook source
## String data types
from pyspark.sql.functions import initcap, col, lower, upper
#data load
df = spark.read.format("csv")\
  .option("header","true")\
  .option("inferSchema","true")\
  .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv')
# check the data schema
df.printSchema()

# initcap: splits the given string on whitespace and returns it with the first letter of each word capitalized
df.select(initcap(col("Description"))).show(2, False)
#lower // upper
df.select(lower(col("StockCode"))).show(2)
# adding and removing whitespace (lit, ltrim, rtrim, rpad, lpad, trim)
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("   HELLO   ")).alias("ltrim"),
    rtrim(lit("   HELLO   ")).alias("rtrim"),
    trim(lit("   HELLO   ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2)

## Regular expressions
# replace the color names in the Description column with "COLOR"
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string,
                   "COLOR").alias("color_clean"), col("Description")).show(2)
Example No. 20
def etl_world_temperature(spark, input_dir, output_dir):
    """Clean the temperature data"""

    # load data
    data_input_full_file_path = f'{input_dir}/GlobalLandTemperaturesByCity.csv'
    world_temperature_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(data_input_full_file_path)

    # just take temperature data after 2003-01-01 and only keep the US data
    world_temperature_spark_df = world_temperature_spark_df \
        .filter(F.col('dt') >= datetime(2003, 1, 1)) \
        .filter(F.col('Country') == 'United States')

    # parse month and day
    us_temperature_spark_df = world_temperature_spark_df \
        .withColumn('month', F.month(F.col('dt'))) \
        .withColumn('day', F.dayofmonth(F.col('dt'))) \
        .drop(F.col('dt'))

    # groupby columns and get the new avg temperature
    avg_us_temperature_spark_df = us_temperature_spark_df \
        .groupBy(['month', 'day', 'City', 'Country', 'Latitude', 'Longitude']) \
        .agg(F.mean('AverageTemperature')) \
        .withColumnRenamed('avg(AverageTemperature)', 'AverageTemperature') \
        .withColumn('month', F.col('month').cast('integer')) \
        .withColumn('day', F.col('day').cast('integer'))

    # convert the DMS Lat and Lon strings to signed numeric values (N/E positive, S/W negative)
    # so that a UDF can look up the state
    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('Latitude', F.when(F.col('Latitude').rlike('N'), F.regexp_replace('Latitude', 'N', '').cast('double'))
                    .otherwise(-1 * F.regexp_replace('Latitude', 'S', '').cast('double'))) \
        .withColumn('Longitude', F.when(F.col('Longitude').rlike('W'), -1 * F.regexp_replace('Longitude', 'W', '').cast('double'))
                    .otherwise(F.regexp_replace('Longitude', 'E', '').cast('double')))

    # define a udf function to get state based on lat and lon by using reverse_geocoder library
    # https://github.com/thampiman/reverse-geocoder
    def _helper_get_state_(lat, lon):

        coor = (lat, lon)
        result = rg.search(coor)

        return result[0].get('admin1')

    _helper_get_state_udf = F.udf(lambda x, y: _helper_get_state_(x, y),
                                  StringType())

    avg_us_temperature_spark_df = avg_us_temperature_spark_df\
        .withColumn('state', _helper_get_state_udf(F.col('Latitude'), F.col('Longitude')))

    # load i94addr dictionary - map the i94addr values
    i94addr_dictionary_input_full_file_path = f'{input_dir}/dictionary_data/i94addr_dictionary.csv'
    i94addr_dictionary_spark_df = spark \
        .read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(i94addr_dictionary_input_full_file_path)

    i94addr_dictionary_spark_df = i94addr_dictionary_spark_df \
        .withColumn('init_cap_value', F.initcap(F.col('value')))

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .join(i94addr_dictionary_spark_df, avg_us_temperature_spark_df.state == i94addr_dictionary_spark_df.init_cap_value, 'left') \
        .drop('init_cap_value') \
        .drop('value') \
        .withColumnRenamed('key', 'state_code')

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumnRenamed("Country", "country") \
        .withColumnRenamed("City", "city") \
        .withColumnRenamed("Latitude", "latitude") \
        .withColumnRenamed("Longitude", "longitude") \
        .withColumnRenamed("AverageTemperature", "avg_temperature")

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('city_state_code', F.concat_ws(', ', F.upper(F.col('city')), F.upper(F.col('state_code'))))

    avg_us_temperature_spark_df = avg_us_temperature_spark_df.select(
        'month', 'day', 'city', 'state', 'state_code', 'city_state_code',
        'avg_temperature').distinct()

    # output clean data
    data_output_full_file_path = f'{output_dir}/USCitiesTemperaturesByMonth.parquet'
    avg_us_temperature_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
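The function above relies on a few imports that sit outside the snippet; under that assumption they would look roughly like this:

# Assumed imports for the snippet above (not shown in the original)
from datetime import datetime

import reverse_geocoder as rg  # https://github.com/thampiman/reverse-geocoder
import pyspark.sql.functions as F
from pyspark.sql.types import StringType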
Example No. 21
def process_immigration(spark, input_bucket, output_bucket):
    '''
    Processing Immigration Data 

    Arguments:
        spark         = Spark Cluster
        input_bucket  = S3 Bucket name - Input 
        output_bucket = S3 Bucket name - Output 
    '''

    ## Loading Data

    df = spark.read.parquet('{}data/*.parquet'.format(input_bucket))

    ## Loading Dicts from Bucket

    # Load Port Dict
    df_port = spark.read.json(
        '{}data/i94prtl.json'.format(input_bucket)).toPandas()
    df_port=spark.createDataFrame(df_port.transpose().reset_index()\
        .rename(columns={'index':'i94port',0:'Port'}))

    # Load Origin Dict
    df_country = spark.read.json(
        '{}data/i94cntyl.json'.format(input_bucket)).toPandas()
    df_country=spark.createDataFrame(df_country.transpose().reset_index().\
        rename(columns={'index':'i94cit',0:'Origin_Country'}))

    # Load Travelmode Dict
    df_mode = spark.read.json(
        '{}data/i94model.json'.format(input_bucket)).toPandas()
    df_mode=spark.createDataFrame(df_mode.transpose().reset_index().\
        rename(columns={'index':'i94mode',0:'Travelmode'}))

    # Load Destination Dict
    df_dest = spark.read.json(
        '{}data/i94addrl.json'.format(input_bucket)).toPandas()
    df_dest=spark.createDataFrame(df_dest.transpose().reset_index().\
        rename(columns={'index':'i94addr',0:'Destination'}))

    # Load Visa Dict
    df_visa = spark.read.json(
        '{}data/i94visa.json'.format(input_bucket)).toPandas()
    df_visa=spark.createDataFrame(df_visa.transpose().reset_index().\
        rename(columns={'index':'i94visa',0:'Visa'}))

    ## Joining Data

    # Join Port
    df = df.join(df_port, df.i94port == df_port.i94port,
                 how='left').drop(df_port.i94port)
    # Join Country
    df = df.join(df_country, df.i94cit == df_country.i94cit,
                 how='left').drop(df_country.i94cit)
    # Join Travelmode
    df = df.join(df_mode, df.i94mode == df_mode.i94mode,
                 how='left').drop(df_mode.i94mode)
    # Join Destination
    df = df.join(df_dest, df.i94addr == df_dest.i94addr,
                 how='left').drop(df_dest.i94addr)
    # Join Visa Data
    df = df.join(df_visa, df.i94visa == df_visa.i94visa,
                 how='left').drop(df_visa.i94visa)

    ## Cleaning Data

    # Dropping Column
    col_to_drop = [
        'visapost', 'occup', 'entdepu', 'count', 'admnum', 'matflag',
        'dtaddto', 'insnum', 'fltno', 'i94cit', 'i94mode', 'i94addr', 'i94visa'
    ]
    df = df.drop(*col_to_drop)

    # Strip Whitespaces
    df_clean = df.withColumn('Port', trim(col('Port')))

    # Clean Invalid Ports
    df_port_clean=df_clean.filter(~col('Port').contains('No PORT Code'))\
        .filter(~col('Port').contains('Collapsed'))\
            .filter(col('Port').isNotNull())

    # Invalid Country Codes
    df_country_clean=df_port_clean.filter(~col('Origin_Country').contains('INVALID'))\
        .filter(~col('Origin_Country').contains('Collapsed'))\
            .filter(col('Origin_Country').isNotNull())

    # Remove null values in Travelmode
    df_travelmode_clean = df_country_clean.filter(
        col('Travelmode').isNotNull())

    # Remove Null-Values in Destination
    df_destination_clean = df_travelmode_clean.filter(
        col('Destination').isNotNull())

    # Get City Names #

    df_city = df_destination_clean.withColumn(
        "City", initcap(split(col("Port"), ',')[0])).drop('Port')

    # Quality Check

    if df_city.count() < 1:
        raise AssertionError('Error in ETL; Immigration row count is too low!')
    else:
        logging.info('Quality check succeeded')

    ## Writing Data

    df_city.write.parquet(
        '{}processed/cleaned_immigration'.format(output_bucket),
        mode='overwrite')

    print('Data cleaned and saved')
Example No. 22
# COMMAND ----------

df.stat.freqItems(["StockCode", "Quantity"]).show()


# COMMAND ----------

from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


# COMMAND ----------

from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()


# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
"frequent items"
print(retail_df.stat.freqItems(['Quantity', 'UnitPrice']).show())

"approx quantile"
colName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05
print(retail_df.stat.approxQuantile("UnitPrice", quantileProbs, relError))
#%%
" string manipulations "

"changing column case"
from pyspark.sql.functions import initcap, upper, lower, col

print(
    retail_df.select(initcap(col("Description")), lower(col('Description')),
                     upper(col('Description'))).show(5, False))
#%%
"regex replace"
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
print(
    retail_df.select(
        regexp_replace(col("Description"),
                       regex_string, "COLOR").alias("color_clean"),
        col("Description")).show(2))
#%%
"regex translate"
from pyspark.sql.functions import translate, lit
print(
    retail_df.select(translate(lit("Descreeption"), "leet", "1627"),
Example No. 24
def compile_capitalize(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.initcap(src_column)
Example No. 25
def process_i94_cit_res_data(spark: SparkSession,
                             df_right: DataFrame) -> DataFrame:
    """ Processes mapping file between countries_of_the_world.csv and i94_data.parquet """

    # Filename of input file
    filename = 's3://data-eng-capstone-cf/staging/i94_cit_res_data.csv'

    # Read into a spark dataframe
    df = spark.read.csv(filename, header=True)

    # Cast country_id as an IntegerType()
    df = df.withColumn('country_id', df.country_id.cast(IntegerType()))

    # Make country unique, by appending '(<country_id>)' to string name
    # Only do when country equals INVALID: STATELESS or INVALID: UNITED STATES
    df = df.withColumn(
        'country',
        F.when(df.country.isin('INVALID: STATELESS', 'INVALID: UNITED STATES'),
               F.concat(df.country, F.lit(' ('), df.country_id,
                        F.lit(')'))).otherwise(df.country))

    # Add foreign key column so can join to df_cow
    df = df.withColumn('country_join', F.initcap('country'))

    # Manual adjustments
    df = df.withColumn(
        'country_join',
        F.when(df.country == 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)', 'Mexico')
        .when(df.country == 'ANTIGUA-BARBUDA', 'Antigua & Barbuda')
        .when(df.country == 'BAHAMAS', 'Bahamas, The')
        .when(df.country == 'BOSNIA-HERZEGOVINA', 'Bosnia & Herzegovina')
        .when(df.country == 'BRITISH VIRGIN ISLANDS', 'British Virgin Is.')
        .when(df.country == 'CENTRAL AFRICAN REPUBLIC', 'Central African Rep.')
        .when(df.country == 'GAMBIA', 'Gambia, The')
        .when(df.country == 'GUINEA-BISSAU', 'Guinea-Bissau')
        .when(df.country == 'MAYOTTE (AFRICA - FRENCH)', 'Mayotte')
        .when(df.country == 'MICRONESIA, FED. STATES OF', 'Micronesia, Fed. St.')
        .when(df.country == 'NORTH KOREA', 'Korea, North')
        .when(df.country == 'SOUTH KOREA', 'Korea, South')
        .when(df.country == 'ST. HELENA', 'Saint Helena')
        .when(df.country == 'ST. KITTS-NEVIS', 'Saint Kitts & Nevis')
        .when(df.country == 'ST. LUCIA', 'Saint Lucia')
        .when(df.country == 'ST. PIERRE AND MIQUELON', 'St Pierre & Miquelon')
        .when(df.country == 'ST. VINCENT-GRENADINES', 'Saint Vincent and the Grenadines')
        .when(df.country == 'TRINIDAD AND TOBAGO', 'Trinidad & Tobago')
        .when(df.country == 'TURKS AND CAICOS ISLANDS', 'Turks & Caicos Is')
        .when(df.country == 'WALLIS AND FUTUNA ISLANDS', 'Wallis and Futuna')
        .when(df.country == 'CHINA, PRC', 'China')
        .otherwise(df.country_join))

    # Define country_fk via left outer join
    df = df.join(df_right, df.country_join == df_right.country, how='left') \
           .select('country_id', df.country, df_right.country.alias('country_fk'))

    # Check schema and count
    df.printSchema()
    df.count()

    # Return transformed dataframe
    return df