def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces."""
    no_punct = regexp_replace(column, r"\p{Punct}", '')
    lowered = lower(no_punct)
    cleaned = trim(lowered)
    return cleaned
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should
        be eliminated (e.g. it's becomes its).  Leading and trailing spaces should be
        removed after punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    return trim(regexp_replace(lower(column), '[^a-zA-Z0-9 ]', '')).alias('sentence')
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should
        be eliminated (e.g. it's becomes its).  Leading and trailing spaces should be
        removed after punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    # column_val = regexp_replace(column, "\p{Punct}", "")
    # return trim(lower(column_val))
    word = lower(trim(regexp_replace(regexp_replace(column, r'[^\w\s]', ''), '_', ''))).alias("word")
    return word
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should
        be eliminated (e.g. it's becomes its).  Leading and trailing spaces should be
        removed after punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    # assert(isinstance(column, pyspark.sql.column.Column))
    assert str(type(column)) == "<class 'pyspark.sql.column.Column'>"
    columnNoPunct = regexp_replace(column, "[^a-zA-Z0-9 ]", "")
    # columnNoPunct = regexp_replace(column, string.punctuation, "")
    columnLowerCase = lower(columnNoPunct)
    columnTrimmed = trim(columnLowerCase)
    return columnTrimmed
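# Hypothetical usage sketch (not from the original notebooks): assumes an active
# SparkSession `spark` and one of the removePunctuation variants above in scope.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
sentences = spark.createDataFrame([(" Hi, you!",), (" No under_score!",)], ["sentence"])
sentences.select(removePunctuation(col("sentence"))).show(truncate=False)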
def colTrim(columns):
    exprs = [
        trim(col(c)).alias(c) if (c in columns) and (c in validCols) else c
        for (c, t) in self.__df.dtypes
    ]
    self.__df = self.__df.select(*exprs)
]))
id_pdv = id_pdv.dropDuplicates()

# Export
id_pdv.write.parquet("./id_pdv_dam")

# Transactions ====
df = spark.table('x_compensation.transactions')
df = (df
      .filter(fn.col("b21_code_pays_du_systeme_dacceptation") == 250)
      .filter(fn.col("s04_code_operation") == 100)
      .select(
          fn.col("b14_siret").alias("SRT"),
          fn.col("b08_environnement_reglementaire__technique_de_la_transaction").alias("ERT"),
          fn.col("b15_code_activite_de_laccepteur___code_mcc").alias("MCC"),
          fn.trim(fn.col("b17_libelle_enseigne_commerciale")).alias("RSN"),
          fn.trim(fn.col("b16_numero_de_contrat_accepteur")).alias("ID_PDV_BQE")))
          # fn.col("s06_identifiant_etablissement_donneur_dordre").alias("REF_ACQ")))

# Merged
dfMerged = df.join(ert, df["ERT"] == ert["ert"], "left").drop(ert["ert"])

# Cleaning
# Contract must have length == 7
dfMerged = (dfMerged.withColumn(
    "ID_PDV_BQE",
    fn.when(
        fn.length(fn.col("ID_PDV_BQE")) == 10,
        fn.substring(fn.col("ID_PDV_BQE"), 3, 7)).otherwise(fn.col("ID_PDV_BQE"))))
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])

names = spark.read.schema(schema).option("sep", " ").csv(f"{SPARK_DATA_PATH}/Marvel-names.txt")
lines = spark.read.text(f"{SPARK_DATA_PATH}/Marvel-graph.txt")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = (lines
               .withColumn("id", func.split(func.trim(func.col("value")), " ")[0])
               .withColumn("connections", func.size(func.split(func.trim(func.col("value")), " ")) - 1)
               .groupBy("id")
               .agg(func.sum("connections").alias("connections")))

# Show the minimum number of connections
minConnections = connections.agg(func.min('connections').alias('min_connections'))
print(f'Minimum number of connections is {minConnections.first().min_connections}')

# Show all superheroes with 1 connection
connections = connections.filter(func.col('connections') == 1)
connections = connections.join(names, 'id').select('name', 'connections')
connections.show()
def transform(self, sources: dict) -> DataFrame: ri = self.invoice_dataframe(sources['rptt_invoice']) rst = self.read_source(source=sources['rptm_sbu_subset_txt']) cmf = self.read_source(source=sources['customer_mapping']) cmf = cmf.withColumnRenamed('sales_rep', 'sales_rep_override') cmf = cmf.withColumnRenamed('sales_rep_id', 'sales_rep_id_override') cmf = cmf.withColumnRenamed('end_market', 'cmf_end_market') mmf = self.read_source(source=sources['material_mapping']) srtr = self.read_source(source=sources['sales_rep_to_region']) rsrt = self.read_source(source=sources['rptm_sales_rep_txt']) rsrt = rsrt.withColumnRenamed('med_desc', 'sales_rep_original') edataA = self.read_source(source=sources['exclusion_dataA']) edataA = edataA.withColumnRenamed('sold_customer_id', 'edataA_sold_customer_id') # Source contains system_id/material_id pairs that need excluded excmat = self.read_source(source=sources['exclude_mat']) cerd = self.read_source(source=sources['currency_exchange_rates']) cerd = fixCurExchangeToAvg(self, cerd) cers = cerd.select('currency_code_from', 'cur_year', 'cur_month', 'conversion_rate_multiplier') cers = cers.withColumnRenamed('currency_code_from', 'std_currency_code_from') cers = cers.withColumnRenamed('cur_year', 'std_cur_year') cers = cers.withColumnRenamed('cur_month', 'std_cur_month') cers = cers.withColumnRenamed('conversion_rate_multiplier', 'std_conversion_rate_multiplier') dcust_sold = self.read_source(source=sources['dim_customer']) dcust_sold = dcust_sold.withColumnRenamed('dim_customer_id', 'sold_dim_customer_id') dcust_ship = self.read_source(source=sources['dim_customer']) dcust_ship = dcust_ship.withColumnRenamed('dim_customer_id', 'ship_dim_customer_id') dcust_brand = self.read_source(source=sources['dim_customer']) dcust_brand = dcust_brand.withColumnRenamed('dim_customer_id', 'brand_dim_customer_id') dloc_ship = self.read_source(source=sources['dim_location']) dloc_ship = dloc_ship.withColumnRenamed('dim_location_id', 'ship_from_dim_location_id') dloc_inv = self.read_source(source=sources['dim_location']) dloc_inv = dloc_inv.withColumnRenamed('dim_location_id', 'invoice_dim_location_id') dmat = self.read_source(source=sources['dim_material']) dmat = dmat.withColumnRenamed('dim_material_id', 'ship1_dim_material_id') df = (ri.join(excmat, [ excmat.material_id == ri.ship1_material_id_int, excmat.system == ri.system_id ], 'left_anti').join( rst, [rst.sbu_subset_id == ri.sbu_subset_id], 'left_outer').join( mmf, [mmf.material == ri.mmf_material], 'left_outer').join(dmat, [ dmat.billing_system == ri.system_id, dmat.material_id == ri.ship_mat1_id, dmat.end_market_or_prime == F.when( ri.prime_flag == 1, 'Prime').otherwise('Non-Prime') ], 'left_outer').join(cmf, [ F.upper(F.trim(cmf.sold_to_ship_to)) == ri.commercial_print_customer_key, F.upper(F.trim(cmf.cmf_end_market)) == F.upper( dmat.end_market) ], 'left_outer').join(srtr, [ srtr.sales_rep_id == cmf.sales_rep_id_override ], 'left_outer').join(cerd, [ cerd.currency_code_from == ri.currency_id, cerd.cur_year == ri.inv_year, cerd.cur_month == ri.inv_month ], 'left_outer').join(cers, [ cers.std_currency_code_from == ri.std_cost_currency_id, cers.std_cur_year == ri.inv_year, cers.std_cur_month == ri.inv_month ], 'left_outer').join(dcust_sold, [ dcust_sold.billing_system == ri.system_id, dcust_sold.customer_id == ri.sold_customer_id ], 'left_outer').join(dcust_ship, [ dcust_ship.billing_system == ri.system_id, dcust_ship.customer_id == ri.ship_customer_id ], 'left_outer').join(dcust_brand, [ dcust_brand.billing_system 
== ri.system_id, dcust_brand.customer_id == ri.brand_owner ], 'left_outer').join(dloc_ship, [ dloc_ship.location_id == ri.ship_location_id ], 'left_outer').join( dloc_inv, [dloc_inv.location_id == ri.mfg_location_id], 'left_outer').join(edataA, [ edataA.edataA_sold_customer_id == ri.sold_customer_id_lstrip_0, ri.system_id == 'S3', ri.rev_acct_id == 'R6000' ], 'left_anti').join( rsrt, [rsrt.sales_rep_id == ri.ri_sales_rep_id], 'left_outer').select( ri.system_id, ri.invoice_id, ri.line_number, ri.month, ri.source_type, ri.rev_acct_id, ri.weight_qty, ri.currency_id, ri.std_cost_currency_id, ri.inv_date, ri.quality_class, ri.sale_type, ri.invoice_line_value, ri.line_qty, ri.invoice_uom_id, ri.inv_line_std_cost, ri.period, ri.year, ri.sales_order, ri.ri_sales_rep_id, ri.line_desc1, rst.med_desc, mmf.cp_subset, cmf.channel, cmf.drop_ship_into_stock, cmf.sales_rep_override, cmf.cmf_end_market, cmf.sales_rep_id_override, cerd.conversion_rate_multiplier, cers.std_conversion_rate_multiplier, dmat.ship1_dim_material_id, dmat.product_code, dmat.force_product_code, dmat.nominal_basis_weight, dmat.material_id, dmat.end_market, dloc_ship.ship_from_dim_location_id, dloc_inv.invoice_dim_location_id, dcust_ship.ship_dim_customer_id, dcust_sold.sold_dim_customer_id, dcust_brand.brand_dim_customer_id, rsrt.sales_rep_original, srtr.region, ri.invoice_volume)) df = df.where( "case when system_id = 'S3' then product_code else '~' end not in ('SC', 'CR')" ) df = df.withColumn('iptmeta_source_system', F.lit('dataA')) df = df.withColumn('bol_number', F.lit(MISSING_NUMBER)) df = df.withColumn( 'product_sold_flag', F.when((df.weight_qty.isNull()) | (df.weight_qty == 0), F.lit('N')).otherwise(F.lit('Y'))) df = df.withColumn( 'fx_conversion_to_usd', F.coalesce( F.when(df.currency_id == 'USD', 1).otherwise( df.conversion_rate_multiplier.cast(T.DoubleType())), F.lit(MISSING_NUMBER))) df = df.withColumn( 'std_fx_conversion_to_usd', F.coalesce( F.when(df.std_cost_currency_id == 'USD', 1).otherwise( df.std_conversion_rate_multiplier.cast(T.DoubleType())), F.lit(MISSING_NUMBER))) df = df.withColumn('grade', df.product_code) df = df.withColumn('invoice_date', F.to_date(df.inv_date)) df = prime_enrich(df) df = df.withColumn('sales_order_number', F.coalesce(df.sales_order, F.lit('0'))) df = df.withColumn( 'sale_type', F.when(df.sale_type == 'I', F.lit('Internal')).when( df.sale_type == 'E', F.lit('External')).otherwise(df.sale_type)) df = df.withColumn( 'subset', F.coalesce(df.cp_subset, df.med_desc, F.lit(NOT_APPLICABLE_DESC))) df = ( df.withColumn( 'claims', F.when(df.rev_acct_id.isin('R4900', 'R4350'), df.invoice_line_value * df.fx_conversion_to_usd). otherwise(MISSING_NUMBER)).withColumn( 'discounts', F.when(df.rev_acct_id.isin('R4500'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)). withColumn("freight_invoice_calc", F.lit('actual')).withColumn( 'freight_invoice', F.when(df.rev_acct_id.isin('R8200'), df.invoice_line_value * df.fx_conversion_to_usd). otherwise(MISSING_NUMBER)).withColumn( 'freight_upcharge', F.when(df.rev_acct_id.isin('R0300'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)). withColumn( 'gross_price', F.when(df.rev_acct_id.isin('R0100', 'R0500', 'R0700', 'R0105'), df.invoice_line_value * df.fx_conversion_to_usd). otherwise(MISSING_NUMBER)).withColumn( 'other_deductions', F.when(df.rev_acct_id.isin('R5300'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)). 
withColumn( 'standard_cost', F.coalesce( df.inv_line_std_cost * df.std_fx_conversion_to_usd, F.lit(MISSING_NUMBER))).withColumn( 'rebates', F.when( df.rev_acct_id.isin( 'R4110', 'R4130'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)) # TODO Confirm exclusions and/or data predicate should be here .withColumn( 'service_allowances', F.when(df.rev_acct_id.isin('R6000'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER))) df = df.withColumn( 'msf', F.when(df.invoice_uom_id == 'MSF', df.line_qty).when(df.invoice_uom_id == 'M2', df.line_qty * .0107639).otherwise(0)) df = df.withColumn('nominal_tons', df.nominal_basis_weight * df.msf / 2000) df = df.withColumn( 'net_price', df.gross_price + df.discounts + df.rebates + df.claims + df.freight_upcharge + df.other_deductions + df.service_allowances) df = df.withColumn( 'standard_gross_margin', df.net_price - (df.standard_cost + df.freight_invoice)) df = dataA_sales_rep_override(df) df = df.withColumn( 'sales_rep_id', F.coalesce(df.sales_rep_id_override, df.ri_sales_rep_id, F.lit(MISSING_NUMBER))) df = ( df.withColumn( 'ship_from_dim_location_id', F.coalesce( df.ship_from_dim_location_id, F.lit(MISSING_STRING_ID))).withColumn( 'invoice_dim_location_id', F.coalesce( df.invoice_dim_location_id, F.lit(MISSING_STRING_ID))).withColumn( 'ship1_dim_material_id', F.coalesce( df.ship1_dim_material_id, F.lit(MISSING_STRING_ID))).withColumn( 'channel', F.coalesce( df.channel, F.lit(MISSING_DESC))).withColumn( 'drop_ship_into_stock', F.coalesce( df.drop_ship_into_stock, F.lit(MISSING_DESC))). withColumn('region', F.coalesce( df.region, F.lit(MISSING_DESC))).withColumn( 'ship_dim_customer_id', F.coalesce( df.ship_dim_customer_id, F.lit(MISSING_STRING_ID))).withColumn( 'sold_dim_customer_id', F.coalesce( df.sold_dim_customer_id, F.lit(MISSING_STRING_ID))).withColumn( 'brand_dim_customer_id', F.coalesce( df.brand_dim_customer_id, F.lit(MISSING_STRING_ID))).withColumn( 'invoice_period', F.lpad(df.month, 6, '0'))) df = (df.withColumnRenamed( 'system_id', 'billing_system').withColumnRenamed( 'rev_acct_id', 'invoice_line_code').withColumnRenamed( 'invoice_id', 'invoice_number').withColumnRenamed( 'line_number', 'invoice_line_number').withColumnRenamed( 'source_type', 'invoice_source_type').withColumnRenamed( 'channel', 'commercial_print_channel').withColumnRenamed( 'drop_ship_into_stock', 'commercial_print_mode').withColumnRenamed( 'region', 'commercial_print_region'). 
withColumnRenamed('currency_id', 'invoiced_currency').withColumnRenamed( 'weight_qty', 'actual_tons').withColumnRenamed( 'period', 'report_month').withColumnRenamed( 'year', 'report_year').withColumnRenamed( 'line_desc1', 'invoice_line_desc_1')) df = df.select( df.billing_system, df.invoice_number, df.invoice_line_number, df.invoice_period, df.invoice_source_type, df.invoice_line_code, df.iptmeta_source_system, df.product_sold_flag, df.commercial_print_channel, df.commercial_print_mode, df.fx_conversion_to_usd, df.grade, df.invoice_date, df.ship_from_dim_location_id, df.invoiced_currency, df.ship1_dim_material_id, df.prime, df.sales_order_number, df.sale_type, df.sales_representative, df.ship_dim_customer_id, df.sold_dim_customer_id, df.brand_dim_customer_id, df.subset, df.actual_tons, df.claims, df.discounts, df.freight_invoice, df.freight_invoice_calc, df.freight_upcharge, df.gross_price, df.msf, df.net_price, df.nominal_tons, df.other_deductions, df.rebates, df.service_allowances, df.standard_cost, df.standard_gross_margin, df.invoice_dim_location_id, df.commercial_print_region, df.invoice_volume, df.invoice_uom_id, df.bol_number, df.report_month, df.report_year, df.sales_rep_id, df.invoice_line_desc_1) return df
def extract_client_df(self): client_dictionary = { 'card_dim': self.client.customers().data.select( col('card_code').alias(self.config_dict['identity_type_code']), 'card_id', 'card_birth_date', 'card_termination_date', 'card_address_valid_flag', 'card_address_country_code', 'card_analyse_now_suppress_flag', 'card_suppress_flag'), 'desc_dt': self.sqlContext. table('market_x_datalake.7_market_smartclub_members_c').select( F.concat(F.lit('0'), F.col('member_id')).alias("mem_id"), 'deceased_date').filter((F.col('deceased_date') == '00000000') | (F.col('deceased_date').isNull())), 'prod_dim': self.client.products().data.select('prod_code', 'prod_hier_l20_code', 'prod_desc'), 'store_dim': self.client.stores().data.select('banner_name', 'store_code'), 'dur_period': self.client.items(fisWeekId=self.config_dict['event_end'], weeks=self.config_dict['event_weeks']).data. select( col('card_code').alias(self.config_dict['identity_type_code']), 'prod_code', col('net_spend_amt').cast(IntegerType()).alias('spend'), 'prod_id', col('transaction_code').alias('transaction_fid'), 'transaction_dttm', 'fis_week_id').filter( col(self.config_dict['identity_type_code']).isNotNull() & (trim(col(self.config_dict['identity_type_code'])) != '') & (trim(col('prod_code')) != '') & col('fis_week_id').between( str(self.config_dict['event_start']), str(self.config_dict['event_end']))), 'pre_period': self.client.items(fisWeekId=self.config_dict['pre_end'], weeks=self.config_dict['pre_weeks']).data. select( col('card_code').alias(self.config_dict['identity_type_code']), 'prod_code', col('transaction_code').alias('transaction_fid'), 'prod_id', col('net_spend_amt').cast(IntegerType()).alias('spend'), 'fis_week_id').filter( col(self.config_dict['identity_type_code']).isNotNull() & (trim(col(self.config_dict['identity_type_code'])) != '') & (trim(col('prod_code')) != '') & col('fis_week_id').between( str(self.config_dict['pre_start']), str(self.config_dict['pre_end']))), 'post_period': self.client.items(fisWeekId=self.config_dict['post_end'], weeks=self.config_dict['post_weeks']).data. select( col('card_code').alias(self.config_dict['identity_type_code']), 'prod_code', col('transaction_code').alias('transaction_fid'), 'prod_id', col('net_spend_amt').cast(IntegerType()).alias('spend'), 'fis_week_id').filter( col(self.config_dict['identity_type_code']).isNotNull() & (trim(col(self.config_dict['identity_type_code'])) != '') & (trim(col('prod_code')) != '') & col('fis_week_id').between( str(self.config_dict['post_start']), str(self.config_dict['post_end']))), 'dur_period_basket': self.client.baskets(fisWeekId=self.config_dict['event_end'], weeks=self.config_dict['event_weeks']).data. select( col('card_code').alias(self.config_dict['identity_type_code']), col('basket_spend_amt').cast(IntegerType()).alias('spend'), col('basket_item_qty').cast( IntegerType()).alias('basket_item_qty'), col('basket_item_qty').alias('item'), col('transaction_code').alias('transaction_fid'), 'transaction_dttm', 'fis_week_id').filter( col(self.config_dict['identity_type_code']).isNotNull() & (trim(col(self.config_dict['identity_type_code'])) != '') & col('fis_week_id').between( str(self.config_dict['event_start']), str(self.config_dict['event_end']))), 'pre_period_basket': self.client.baskets(fisWeekId=self.config_dict['pre_end'], weeks=self.config_dict['pre_weeks']).data. 
select( col('card_code').alias(self.config_dict['identity_type_code']), col('transaction_code').alias('transaction_fid'), col('basket_spend_amt').cast(IntegerType()).alias('spend'), col('basket_item_qty').cast( IntegerType()).alias('basket_item_qty'), col('basket_item_qty').alias('item'), 'fis_week_id').filter( col(self.config_dict['identity_type_code']).isNotNull() & (trim(col(self.config_dict['identity_type_code'])) != '') & col('fis_week_id').between( str(self.config_dict['pre_start']), str(self.config_dict['pre_end']))), 'post_period_basket': self.client.baskets(fisWeekId=self.config_dict['post_end'], weeks=self.config_dict['post_weeks']).data. select( col('card_code').alias(self.config_dict['identity_type_code']), col('transaction_code').alias('transaction_fid'), col('basket_spend_amt').cast(IntegerType()).alias('spend'), col('basket_item_qty').cast( IntegerType()).alias('basket_item_qty'), 'fis_week_id', col('basket_item_qty').alias('item')).filter( col(self.config_dict['identity_type_code']).isNotNull() & (trim(col(self.config_dict['identity_type_code'])) != '') & col('fis_week_id').between( str(self.config_dict['post_start']), str(self.config_dict['post_end']))), 'date_dim': self.client.calendar().data } BaseModule._dict.update(client_dictionary)
    .option("inferSchema", "false")
    .schema(tasteprofile_schema)
    .load("hdfs:///data/msd/tasteprofile/triplets.tsv")
)
tasteprofile.show(20, False)

# Load in the mismatched data.
mismatches_text = (
    spark.read.format("text")
    .load('hdfs:///data/msd/tasteprofile/mismatches/sid_mismatches.txt')
)

# Parse the fixed width text data to format it.
mismatches = mismatches_text.select(
    F.trim(F.col('value').substr(9, 18)).alias('Song_ID').cast(StringType()),
    F.trim(F.col('value').substr(28, 18)).alias('Track_ID').cast(StringType())
)
mismatches.show(20, False)

# Load in the accepted mismatched data.
mismatches_accepted_text = (
    spark.read.format("text")
    .load('hdfs:///data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt')
)

# Parse the fixed width text data to format it.
mismatches_accepted = mismatches_accepted_text.select(
    F.trim(F.col('value').substr(11, 18)).alias('Song_ID').cast(StringType()),
    F.trim(F.col('value').substr(30, 18)).alias('Track_ID').cast(StringType())
def remove_multiple_spaces(col: Column):
    """Replace multiple spaces with single spaces"""
    return F.trim(F.regexp_replace(col, " +", " "))
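# Hypothetical usage sketch: assumes a DataFrame `df` with a string column "raw_text"
# and `from pyspark.sql import functions as F`; collapses runs of spaces and strips the ends.
df = df.withColumn("raw_text", remove_multiple_spaces(F.col("raw_text")))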
    upper(col('Description'))).show(2)
df.selectExpr(
    'Description',
    'lower(Description)',
    'upper(lower(Description))').show(2)
# select description, lower(Description), upper(lower(Description)) from dfTable

from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(' HELLO ')).alias('ltrim'),
    rtrim(lit(' HELLO ')).alias('rtrim'),
    trim(lit(' HELLO ')).alias('trim'),
    lpad(lit('HELLO'), 3, ' ').alias('lp'),
    rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)
df.selectExpr(
    'ltrim( "HELLO" ) as ltrim',
    'rtrim( "HELLO" ) as rtrim',
    'trim( "HELLO" ) as trim',
    'lpad("HELLO", 3, " ") as lp',
    'rpad("HELLO", 3, " ") as rp').show(2)
# select
#   ltrim(' HELLO '),
#   rtrim(' HELLO '),
#   trim(' HELLO '),
#   lpad('HELLO', 3, ' '),
from pyspark.sql.functions import upper, col, regexp_extract, regexp_replace

# -----------------------------------
# DOWNTOWN
# -----------------------------------
dt = '/Users/valerieangulo/Downtown/fuzzymatching/songtrust_match.csv'
dtdf = sqlContext.read.format("com.databricks.spark.csv").option(
    "header", "true").option("inferSchema", "true").load(dt)
mydt = dtdf
mydt = dtdf[['Custom ID', 'Title', 'COMPOSER']]
# mydt = mydt.withColumn('ratio', F.lit(0))  # don't need if we join DFs with levenshtein
sortdt = mydt

# trim whitespace
sortdt = sortdt.withColumn('Title', F.trim(sortdt.Title))
sortdt = sortdt.withColumn('COMPOSER', F.trim(sortdt.COMPOSER))

# DT tabs
sortdt = sortdt.withColumn("Title", regexp_replace(col("Title"), '[\t]+', ''))
sortdt = sortdt.withColumn("COMPOSER", regexp_replace(col("COMPOSER"), '[\t]+', ''))

# DT new lines
sortdt = sortdt.withColumn("Title", regexp_replace(col("Title"), '[\n]+', ''))
sortdt = sortdt.withColumn("COMPOSER", regexp_replace(col("COMPOSER"), '[\n]+', ''))

# make caps
sortdt = sortdt.withColumn('Title', F.upper(col('Title')))  # test(F.upper(['Title', 'COMPOSER']))
sortdt = sortdt.withColumn('COMPOSER', F.upper(col('COMPOSER')))

# remove quotes, unknowns, |, -
sortdt = sortdt.withColumn("Title", regexp_replace(col("Title"), '"', ''))
# Databricks notebook source
# BINARY CLASSIFICATION (0: flight on time, 1: flight delayed)

# COMMAND ----------

# PART 1: DEFINE THE DATA TYPES OF THE TRAIN FILE
# Read the csv file with headers, importing everything as string first
df = sqlContext.read.format("csv").option("header", "true").load("dbfs:/dataset/datos_preprocesados.csv")

from pyspark.sql.types import *
from pyspark.sql.functions import trim, col

df = df.select(
    trim(col("MONTH")).cast(IntegerType()).alias("MONTH"),
    trim(col("HOLIDAYS")).cast(IntegerType()).alias("HOLIDAYS"),
    trim(col("DAY_OF_MONTH")).cast(IntegerType()).alias("DAY_OF_MONTH"),
    trim(col("DAY_OF_WEEK")).cast(IntegerType()).alias("DAY_OF_WEEK"),
    trim(col("UNIQUE_CARRIER")).cast(IntegerType()).alias("UNIQUE_CARRIER"),
    trim(col("TAIL_NUM")).cast(IntegerType()).alias("TAIL_NUM"),
    trim(col("FL_NUM")).cast(IntegerType()).alias("FL_NUM"),
    trim(col("ORIGIN_AIRPORT_ID")).cast(IntegerType()).alias("ORIGIN_AIRPORT_ID"),
    trim(col("ORIGIN_CITY_MARKET_ID")).cast(IntegerType()).alias("ORIGIN_CITY_MARKET_ID"),
    trim(col("ORIGIN_STATE_NM")).cast(IntegerType()).alias("ORIGIN_STATE_NM"),
    trim(col("DEST_AIRPORT_ID")).cast(IntegerType()).alias("DEST_AIRPORT_ID"),
    trim(col("DEST_CITY_MARKET_ID")).cast(IntegerType()).alias("DEST_CITY_MARKET_ID"),
    trim(col("DEST_STATE_NM")).cast(IntegerType()).alias("DEST_STATE_NM"),
    trim(col("CRS_DEP_TIME")).cast(IntegerType()).alias("CRS_DEP_TIME"),
    trim(col("DEP_TIME")).cast(IntegerType()).alias("DEP_TIME"),
    trim(col("DEP_DELAY")).cast(IntegerType()).alias("DEP_DELAY"),
    trim(col("DEP_DELAY_NEW")).cast(IntegerType()).alias("DEP_DELAY_NEW"),
    trim(col("DEP_DEL15")).cast(IntegerType()).alias("DEP_DEL15"),
    trim(col("DEP_DELAY_GROUP")).cast(IntegerType()).alias("DEP_DELAY_GROUP"),
def to_null(c):
    # Return the column value only when it is not null, not NaN, and not blank/whitespace;
    # everything else falls through to null.
    return when(~(col(c).isNull() | isnan(col(c)) | (trim(col(c)) == "")), col(c))
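# Hypothetical usage sketch: assumes a DataFrame `df`; applies to_null to every column so
# that empty/blank strings and NaNs become proper nulls.
df = df.select([to_null(c).alias(c) for c in df.columns])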
def _trim(col_name, args):
    return F.trim(F.col(col_name))
import os

from pyspark.sql.functions import udf, trim, lower
from pyspark.sql.types import StringType

s3 = "s3a://shwes3udacapstone/"
DEMOGRAPHICS_DATA_PATH = "data/raw/demographics/us-cities-demographics.csv"
input_log_data_file = os.path.join(s3, DEMOGRAPHICS_DATA_PATH)

udf_capitalize_lower = udf(lambda x: str(x).lower().capitalize(), StringType())

df_demo = spark.read.format("csv").option("delimiter", ";").option(
    "header", "true").option("encoding", "UTF-8").load(input_log_data_file)

df_demo = df_demo.withColumnRenamed("State Code", "state_code") \
    .withColumnRenamed("Median Age", "median_age") \
    .withColumnRenamed("City", "city") \
    .withColumnRenamed("Total Population", "population")

df_demo = df_demo.select("city", "state_code", "median_age", "population")

df_state = spark.read.parquet(s3 + "data/processed/codes/us_state")
df_demo = df_demo.join(df_state, ["state_code"])
df_demo = df_demo.withColumn("city", lower(trim(df_demo.city)))

df_demo.write.mode("overwrite").parquet(s3 + 'data/processed/city/')
from pyspark.sql.functions import trim
import pandas as pd
import cdsw

# initialize Spark Session
spark = SparkSession.builder \
    .appName("Telco Customer Churn SVM") \
    .config('spark.shuffle.service.enabled', "True") \
    .getOrCreate()

# Define Dataframe Schema
schemaData = StructType([
    StructField("state", StringType(), True),
    StructField("account_length", DoubleType(), True),
    StructField("area_code", StringType(), True),
    StructField("phone_number", StringType(), True),
    StructField("intl_plan", StringType(), True),
    StructField("voice_mail_plan", StringType(), True),
    StructField("number_vmail_messages", DoubleType(), True),
    StructField("total_day_minutes", DoubleType(), True),
    StructField("total_day_calls", DoubleType(), True),
    StructField("total_day_charge", DoubleType(), True),
    StructField("total_eve_minutes", DoubleType(), True),
    StructField("total_eve_calls", DoubleType(), True),
    StructField("total_eve_charge", DoubleType(), True),
    StructField("total_night_minutes", DoubleType(), True),
    StructField("total_night_calls", DoubleType(), True),
    StructField("total_night_charge", DoubleType(), True),
    StructField("total_intl_minutes", DoubleType(), True),
    StructField("total_intl_calls", DoubleType(), True),
    StructField("total_intl_charge", DoubleType(), True),
    StructField("number_customer_service_calls", DoubleType(), True),
    StructField("churned", StringType(), True)])

# Build Dataframe from File
raw_data = spark.read.schema(schemaData).csv('/tmp/churn.all')
churn_data = raw_data.withColumn("intl_plan", trim(raw_data.intl_plan))

reduced_numeric_cols = ["account_length", "number_vmail_messages", "total_day_charge",
                        "total_eve_charge", "total_night_charge", "total_intl_calls",
                        "total_intl_charge", "number_customer_service_calls"]

reduced_numeric_cols1 = ["account_length", "number_vmail_messages", "total_day_calls",
                         "total_day_charge", "total_eve_calls", "total_eve_charge",
                         "total_night_calls", "total_night_charge", "total_intl_calls",
                         "total_intl_charge", "number_customer_service_calls"]

# Review DataSet Balance
churn_data.registerTempTable("ChurnData")
sqlResult = spark.sql("SELECT churned, COUNT(churned) as Churned FROM ChurnData group by churned")
sqlResult.show()
def trim(col: Column):
    """Trim a string column"""
    return F.trim(col)
def format_name_column(column_name) -> Column:
    return trim(col(column_name))
def trim_leading_zeros(col: Column):
    """Trim the leading zeros from a string column"""
    return F.trim(F.regexp_replace(col, "^0*", " "))
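# Hypothetical usage sketch: assumes a DataFrame `df` with a zero-padded string column
# "account_id"; the leading zeros are replaced with a space which trim then removes,
# so "000123" becomes "123".
df = df.withColumn("account_id", trim_leading_zeros(F.col("account_id")))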
def get_transformed_edges(graph_specification, spark_config, input_edge_path, input_source_col, input_target_col, output_source_col, output_target_col, output_tag_col, data_format='parquet', array_delimiter=';', max_result_size=1e9): """ A generator that returns a Panda data frame of each processed edge in the graph specification :param graph_specification: Graph specification. :type graph_specification: fncore.utils.graph_specification.GraphSpec :param spark_config: Spark config. :type spark_config: fncore.utils.spark_tools.SparkConfFactory :param input_edge_path: Path to input edge files for this graph. :type input_edge_path: str :param output_source_col: Column name to use for source id. :type output_source_col: str :param output_target_col: Column name to use for target id. :type output_target_col: str :param output_tag_col: Column name to use for node tag. :type output_tag_col: str :param data_format: Format to read and write files for this graph. :type data_format: str :param array_delimiter: Delimiter used to separate items in array :type array_delimiter: str :param max_result_size: Maximum result size that spark driver accept :type max_result_size: int """ for edge_kind in graph_specification.edge_lists: with get_spark_context(spark_config.create()) as spark_context: sql_context = SQLContext(spark_context) data = (sql_context.read.format(data_format).option( 'header', 'true').option('inferschema', 'true').load( os.path.join(input_edge_path, edge_kind.safe_name))) edge_kind_columns = ( edge_kind.metadata_columns + [edge_kind.source_column] + [edge_kind.target_column] + ([edge_kind.index_column] if edge_kind.index_column else []) + ([edge_kind.weight_column] if edge_kind.weight_column else [])) transformed = data # Drops duplicates (if index column does not exist) # TODO: Support multi field index in the future if not edge_kind.index_column: dedup_columns = ([edge_kind.source_column.safe_name] + [edge_kind.target_column.safe_name]) transformed = transformed.dropDuplicates(subset=dedup_columns) for column in edge_kind_columns: transformed = transformed.withColumnRenamed( column.safe_name, column.friendly_name or column.name) edge_tags = array_delimiter.join(edge_kind.tags) transformed = (transformed.withColumn( output_source_col, trim(transformed[input_source_col])).withColumn( output_target_col, trim(transformed[input_target_col])).withColumn( output_tag_col, lit(edge_tags))) transformed = (transformed.dropna( how='any', subset=[output_source_col, output_target_col ]).filter(transformed[output_source_col] != '').filter( transformed[output_target_col] != '')) for dataframe in to_pandas_iterator( transformed, max_result_size=max_result_size): yield dataframe
df.stat.crosstab("StockCode", "Quantity").show()

# View frequent items
df.stat.freqItems(["StockCode", "Quantity"]).show(2, False)

# Generate a unique ID for each row
df.select(monotonically_increasing_id().alias("id")).show(2)

# Working with string types
# initcap capitalizes the first letter of each space-separated word
df.select(initcap(col("Description"))).show()

# String case conversion
df.select(col("Description"), lower(col("Description")),
          upper(lower(col("Description")))).show(2)

# Remove spaces from a string or pad spaces around it; lpad/rpad compare the requested
# length with the input string length to decide whether to truncate or pad
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)

# Spark filters strings using regular expressions
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2, False)

# translate substitutes characters one for one against its arguments, e.g. below L->1, E->3, T->7
df.select(translate(col("Description"), "LEET", "1337"), col("Description")) \
    .show(2, False)

# regexp_extract pulls out the matched group; below, whichever word in extract_str
# appears is captured as group 1
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
    col("Description")).show(2)
# getting document counts, where asin is considered the document # doc_counts = reviews.groupBy('asin').count() # print('doc counts') # doc_counts.show() # now to split up our reviewText into lines # doc_and_lines = reviews.select('asin', fnc.split('reviewText', '[\W_]+').alias('a_line')) # print('doc and lines') # doc_and_lines.show() # and now further split it into words doc_and_words = reviews \ .select('asin', fnc.explode(fnc.split('reviewText', '[\W_]+')).alias('each_word')) \ .filter(fnc.length('each_word') >0) \ .select('asin', fnc.trim(fnc.lower(fnc.col('each_word'))).alias('each_word'))\ # print('doc and words') # doc_and_words.show() # get counts of each word # word_counts = doc_and_words.groupBy('each_word') \ # .count() # print('word counts') # word_counts.show() # now to get term frequency using the formula of (term in a doc)/(total num of words in that doc) wind = Window.partitionBy(doc_and_words['asin']) tf = doc_and_words.groupBy('asin', 'each_word')\
def single_space(col):
    return F.trim(F.regexp_replace(col, " +", " "))
# In[3]:

from pyspark.sql.functions import col, expr, udf, trim
from pyspark.sql.types import IntegerType
import re

remove_punctuation = udf(lambda line: re.sub(r'[^A-Za-z\s]', '', line))
make_binary = udf(lambda rating: 0 if rating in [1, 2] else 1, IntegerType())

reviews = (all_reviews
           .na.fill({'reviewerName': 'Unknown'})
           .filter(col('overall').isin([1, 2, 5]))
           .withColumn('label', make_binary(col('overall')))
           .select(col('label').cast('int'), remove_punctuation('summary').alias('summary'))
           .filter(trim(col('summary')) != ''))

# ## Splitting data and balancing skewness

# In[4]:

train, test = reviews.randomSplit([.8, .2], seed=5436)

# In[5]:

def multiply_dataset(dataset, n):
    return dataset if n <= 1 else dataset.union(multiply_dataset(dataset, n - 1))
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName("MostPopularSuperhero").getOrCreate()

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)])

names = spark.read.schema(schema).option("sep", " ").csv("Marvel+Names")

lines = spark.read.text("Marvel+graph")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn("id", func.split(func.trim(func.col("value")), " ")[0]) \
    .withColumn("connections", func.size(func.split(func.trim(func.col("value")), " ")) - 1) \
    .groupBy("id").agg(func.sum("connections").alias("connections"))

minConnectionCount = connections.agg(func.min("connections")).first()[0]

minConnections = connections.filter(func.col("connections") == minConnectionCount)

minConnectionsWithNames = minConnections.join(names, "id")

print("The following characters have only " + str(minConnectionCount) + " connection(s):")

minConnectionsWithNames.select("name").show()

spark.stop()
StructField("OBSERVATION_TIME", StringType(), True) ]) daily_all = ( spark.read.format("com.databricks.spark.csv") .option("header", "false") .option("inferSchema", "false") .schema(schema_daily) .load("hdfs:///data/ghcnd/daily/") ) daily_all.show(5, False) # Extract YEAR from DATE daily_all = ( daily_all .withColumn('YEAR', F.trim(F.substring(F.col('DATE'), 1, 4)).cast(StringType())) ) daily_all.show(5, False) # Get a subset of daily with other elements core_element = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN'] daily_other = ( daily_all .filter(~F.col('ELEMENT').isin(core_element)) ) daily_other.show(5, False) # check the count of daily obersevation by element daily_by_element = ( daily_other
daily = (spark.read.format("com.databricks.spark.csv").option( "header", "false").option("inferSchema", "false").schema( schema_Daily).load("hdfs:///data/ghcnd/daily/2020.csv.gz").limit(1000)) daily.cache() daily.show(5) #----Q2-C----load metadata--------------- # load text countries_text = ( spark.read.format("text").load("hdfs:///data/ghcnd/countries")) countries_text.show(5) #parse countries = countries_text.select( F.trim(F.substring(F.col('value'), 1, 2)).alias('CODE').cast( schema_Countries['CODE'].dataType), #1-2 F.trim(F.substring(F.col('value'), 4, 47)).alias('NAME').cast( schema_Countries['NAME'].dataType) #4-50 3 space ) countries.show(5) countries.count() inventory_text = ( spark.read.format("text").load("hdfs:///data/ghcnd/inventory")) inventory_text.show(5) inventory = inventory_text.select( F.trim(F.substring(F.col('value'), 1, 11)).alias('ID').cast(schema_Inventory['ID'].dataType), F.trim(F.substring(F.col('value'), 13, 8)).alias('LATITUDE').cast( schema_Inventory['LATITUDE'].dataType), F.trim(F.substring(F.col('value'), 22, 9)).alias('LONGITUDE').cast(
def transform_airport_dataset(self):
    df_airport = spark.read.format("csv").option("header", True).load(
        '../workspace/immigration_files/airport/airport-codes_csv.csv')

    not_null_iata_in_us_df = df_airport.where("iso_country = 'US' and iata_code is not null")

    not_null_iata_in_us_df = not_null_iata_in_us_df.withColumn("ident", trim(not_null_iata_in_us_df.ident)) \
        .withColumn("type", trim(not_null_iata_in_us_df.type)) \
        .withColumn("name", trim(not_null_iata_in_us_df.name)) \
        .withColumn("elevation_ft", trim(not_null_iata_in_us_df.elevation_ft)) \
        .withColumn("continent", trim(not_null_iata_in_us_df.continent)) \
        .withColumn("iso_country", trim(not_null_iata_in_us_df.iso_country)) \
        .withColumn("iso_region", trim(not_null_iata_in_us_df.iso_region)) \
        .withColumn("municipality", trim(not_null_iata_in_us_df.municipality)) \
        .withColumn("gps_code", trim(not_null_iata_in_us_df.gps_code)) \
        .withColumn("iata_code", trim(not_null_iata_in_us_df.iata_code)) \
        .withColumn("local_code", trim(not_null_iata_in_us_df.local_code)) \
        .withColumn("coordinates", trim(not_null_iata_in_us_df.coordinates))

    # Strip backslashes from the name column ("\\\\" is the Python form of a regex
    # matching a literal backslash)
    not_null_iata_in_us_df = not_null_iata_in_us_df.withColumn("name", regexp_replace('name', "\\\\", ""))

    not_null_iata_in_us_df.write \
        .csv(path=save_path + '/airport/', mode='overwrite', header=True)
#initiate glue context glueContext = GlueContext(SparkContext.getOrCreate()) ##Loading the source files from s3 bucket and converting the corresponding dynamic dataframes to apache spark data frames Jas_Labour_Paid_dyf=glueContext.create_dynamic_frame_from_options(connection_type="s3", connection_options = {"paths":["s3://smart-ingest-bucket/Quantum-source-file/jas_labor_paid_export.csv"]}, format="csv",format_options={'withHeader' : True}) Jas_Labour_Paid_df = Jas_Labour_Paid_dyf.toDF() Jas_Labour_Production_dyf=glueContext.create_dynamic_frame_from_options(connection_type="s3", connection_options = {"paths":["s3://smart-ingest-bucket/Quantum-source-file/jas_labor_production_export.csv"]}, format="csv",format_options={'withHeader' : True}) Jas_Labour_Production_df = Jas_Labour_Production_dyf.toDF() ##Performing Transformations on Jas_Labour_Paid_df ##Jas_Labour_Paid_df.printSchema() Jas_Labour_Paid_df=Jas_Labour_Paid_df.select(col('EXTERNAL_ID').alias('EMPLOYEE_NUMBER'),\ col('USER_NAME').alias('EMPLOYEE_NAME'),'ATTENTION',\ trim(split(col('DEPT_NAME'),'-')[0]).alias('PROGRAM_DESC'),\ to_date(substring(col('TIME_START'),1,9),'dd-MMM-yy').alias('TRANSACTION_DATE'),\ col('HOURS_OVER_TIME').cast(DoubleType()).alias('AVAIL_OT_HRS'),\ col('HOURS_TOTAL').cast(DoubleType()).alias('HOURS_TOTAL'),'STATUS',\ col('HOURS_INDIRECT').cast(DoubleType()).alias('HOURS_INDIRECT'),\ col('HOURS_TIMED').cast(DoubleType()).alias('HOURS_TIMED'), 'TAC_CODE')\ .withColumn('DIRECT_INDIRECT', when(col('HOURS_INDIRECT')>0, 'INDIRECT').otherwise('DIRECT'))\ .withColumn('KEY_COL', concat(col('TRANSACTION_DATE'), lit('-'),col('EMPLOYEE_NUMBER')))\ .withColumn('WEEK_NUMBER', weekofyear(col('TRANSACTION_DATE')))\ .distinct() Jas_Labour_Paid_df=Jas_Labour_Paid_df.filter(Jas_Labour_Paid_df['TRANSACTION_DATE'] >= lit("2019-01-01"))\ .filter(Jas_Labour_Paid_df['TRANSACTION_DATE'] <= lit("2019-02-21")) ##Jas_Labour_Paid_df.printSchema() ##Performing Transformations on Jas_Labour_Production_df
# Append and select data
# import pandas as pd
appended_data = add_category_fake.union(add_category_true) \
    .select(['category', 'text']) \
    .dropna(subset=('text'))
# appended_data.show()

from pyspark.sql.functions import length, trim

# Create a length column to be used as a future feature
review_data = appended_data.withColumn('length', length(appended_data['text'])) \
    .where("length>=100") \
    .orderBy('length') \
    .withColumn("text", trim(appended_data.text))
# review_data.show()

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='category', outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
def main_cast(): cast_csv = 'clean_dataset/Dataset_cast_recent.csv' # cast_csv = 'clean_dataset/Dataset_cast_past.csv' cast_df = spark.read.option("encoding", "UTF-8").load(cast_csv, format="csv", sep=",", header="true", escape='"').cache() cast_df = cast_df.select("actor_1", "actor_2", "actor_3", "actor_4", "actor_5", "director", "id") print(cast_df.show(5)) # replace 'Unknown' string with 'null' string in casts and directors cast_df = cast_df.withColumn( 'actor_1', functions.when(cast_df['actor_1'] == 'Unknown', 'null').otherwise(cast_df['actor_1'])) cast_df = cast_df.withColumn( 'actor_2', functions.when(cast_df['actor_2'] == 'Unknown', 'null').otherwise(cast_df['actor_2'])) cast_df = cast_df.withColumn( 'actor_3', functions.when(cast_df['actor_3'] == 'Unknown', 'null').otherwise(cast_df['actor_3'])) cast_df = cast_df.withColumn( 'actor_4', functions.when(cast_df['actor_4'] == 'Unknown', 'null').otherwise(cast_df['actor_4'])) cast_df = cast_df.withColumn( 'actor_5', functions.when(cast_df['actor_5'] == 'Unknown', 'null').otherwise(cast_df['actor_5'])) cast_df = cast_df.withColumn( 'actor_5', functions.when(cast_df['actor_5'] == 'Unknown', 'null').otherwise(cast_df['actor_5'])) cast_df = cast_df.withColumn( 'director', functions.when(cast_df['director'] == 'Unknown', 'null').otherwise(cast_df['director'])) # drop all empty cells (null) from directors cast_df = cast_df.filter(cast_df['director'].isNotNull()) # drop all empty cells (null) from casts that have 5 empty cell in each actor cast_df = cast_df.filter((cast_df['actor_1'].isNotNull()) & (cast_df['actor_2'].isNotNull()) & (cast_df['actor_3'].isNotNull()) & (cast_df['actor_4'].isNotNull()) & (cast_df['actor_5'].isNotNull())) # drop all cell with 'null' or 'Unknown' string from director cast_df = cast_df.filter(cast_df['director'] != 'null') # drop all cell with 'null' or 'Unknown' string from all 5 casts cast_df = cast_df.filter((cast_df['actor_1'] != 'null') & (cast_df['actor_2'] != 'null') & (cast_df['actor_3'] != 'null') & (cast_df['actor_4'] != 'null') & (cast_df['actor_5'] != 'null')) ########### To get unique director for 'director' column ################### unique_director_df = get_unique_col('director', cast_df) list_top_unique_director = get_top_features('director', cast_df) ''' ########### To get unique cast for each individual Column ################## unique_cast1_df = get_unique_col('actor_1',cast_df) list_top_unique_cast1= get_top_features('actor_1',unique_cast1_df) print(unique_cast1_df.show()) print(list_top_unique_cast1) unique_cast2_df = get_unique_col('actor_2',cast_df) list_top_unique_cast2= get_top_features('actor_2',unique_cast2_df) print(unique_cast2_df.show()) print(list_top_unique_cast2) unique_cast3_df = get_unique_col('actor_3',cast_df) list_top_unique_cast3= get_top_features('actor_3',unique_cast3_df) print(unique_cast3_df.show()) print(list_top_unique_cast3) unique_cast4_df = get_unique_col('actor_4',cast_df) list_top_unique_cast4= get_top_features('actor_4',unique_cast4_df) print(unique_cast4_df.show()) print(list_top_unique_cast4) unique_cast5_df = get_unique_col('actor_5',cast_df) list_top_unique_cast5= get_top_features('actor_5',unique_cast5_df) print(unique_cast5_df.show()) print(list_top_unique_cast5) ''' ########### To get unique cast for combined cast Columns ################## cast_df = cast_df.withColumn( 'joined_column', functions.concat(functions.col('actor_1'), functions.lit(','), functions.col('actor_2'), functions.lit(','), functions.col('actor_3'), functions.lit(','), 
functions.col('actor_4'), functions.lit(','), functions.col('actor_5'))) cast_df = cast_df.withColumn( 'joined_col_temp', functions.split(functions.col('joined_column'), ',')) unique_cast_df = cast_df.select( functions.explode('joined_col_temp').alias('unique_cast')).groupby( 'unique_cast').count() unique_cast_df = unique_cast_df.withColumn( 'unique_cast', functions.trim(functions.col('unique_cast'))) unique_cast_df = unique_cast_df.groupby('unique_cast').agg({ 'count': 'sum' }).sort(functions.desc("sum(count)")) list_top_unique_casts = get_top_features('unique_cast', unique_cast_df) cast_df = cast_df.drop(cast_df.joined_column) cast_df = cast_df.drop(cast_df.joined_col_temp) cast_rdd = cast_df.rdd.map(list) ''' ######################################################################''' ''' #### 1st Function to random casts and 2nd function to pick top casts ###''' new_cast_rdd = cast_rdd.map(get_random_one_cast) # new_cast_rdd = cast_rdd.map(lambda j: get_top_cast(j, list_top_unique_casts)) ''' ######################################################################### ''' new_schema_cast_csv = StructType([ StructField('cast', StringType(), True), StructField('director', StringType(), True), StructField('id', StringType(), True) ]) new_cast_df = rdd_to_df(new_cast_rdd, new_schema_cast_csv) # outputs= 'new_clean_cast_dataset/single_cast_recent_top1000.csv' outputs = 'new_clean_cast_dataset/single_cast_recent_random.csv'
def transform_us_demo_dataset(self):
    df_us = spark.read.options(header='True', inferSchema='True', delimiter=';') \
        .csv("../workspace/immigration_files/demo/us-cities-demographics.csv")

    df_us = df_us.withColumnRenamed("Median Age", "Median_Age") \
        .withColumnRenamed("Male Population", "Male_Population") \
        .withColumnRenamed("Female Population", "Female_Population") \
        .withColumnRenamed("Total Population", "Total_Population") \
        .withColumnRenamed("Number of Veterans", "Number_of_Veterans") \
        .withColumnRenamed("Foreign-born", "Foreign_born") \
        .withColumnRenamed("Average Household Size", "Average_Household_Size") \
        .withColumnRenamed("State Code", "State_Code")

    df_us = df_us.withColumn("City", trim(df_us.City)) \
        .withColumn("State", trim(df_us.State)) \
        .withColumn("Median_Age", trim(df_us.Median_Age)) \
        .withColumn("Male_Population", trim(df_us.Male_Population)) \
        .withColumn("Female_Population", trim(df_us.Female_Population)) \
        .withColumn("Total_Population", trim(df_us.Total_Population)) \
        .withColumn("Number_of_Veterans", trim(df_us.Number_of_Veterans)) \
        .withColumn("Foreign_born", trim(df_us.Foreign_born)) \
        .withColumn("Average_Household_Size", trim(df_us.Average_Household_Size)) \
        .withColumn("State_Code", trim(df_us.State_Code)) \
        .withColumn("Race", trim(df_us.Race)) \
        .withColumn("Count", trim(df_us.Count))

    df_us.write \
        .csv(path=save_path + '/demo/', mode='overwrite', header=True)
# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
          lower(col("Description")),
          upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2)


# COMMAND ----------
def main_movie_with_rating(): movie_csv = 'clean_dataset/Dataset_movie_with_rating_csv.csv' movie_df = spark.read.load(movie_csv, format="csv", sep=",", header="true", escape='"').cache() print(movie_df.show()) # Find mean of revenue in non-zero row movie_df_1 = movie_df.filter(movie_df['revenue'] != 0) mean_revenue = movie_df_1.groupBy().agg( functions.avg(movie_df_1['revenue'])).collect()[0][0] # Find mean of budget in non-zero row movie_df_1 = movie_df.filter(movie_df['budget'] != 0) mean_budget = movie_df_1.groupBy().agg(functions.avg( movie_df_1['budget'])).collect()[0][0] # Find mean of avg_rating in specified conditions as below id_rating_df = movie_df.select('id', 'avg_rating') movie_df_1 = id_rating_df.filter(id_rating_df['avg_rating'] != 'null') movie_df_1 = movie_df_1.withColumn( "avg_rating", movie_df_1["avg_rating"].cast(DoubleType())) movie_df_1 = movie_df_1.filter(movie_df_1['avg_rating'] < 6) mean_rating = movie_df_1.groupBy().agg( functions.avg(movie_df_1['avg_rating'])).collect()[0][0] # replace null with the mean of avg_rating movie_df = movie_df.withColumn( 'avg_rating', functions.when(movie_df['avg_rating'] != "", movie_df['avg_rating']).otherwise(mean_rating)) # replace 0 with the mean of revenue movie_df = movie_df.withColumn( 'revenue', functions.when(movie_df['revenue'] == 0, mean_revenue).otherwise(movie_df['revenue'])) # replace 0 with the mean of budget movie_df = movie_df.withColumn( 'budget', functions.when(movie_df['budget'] == 0, mean_budget).otherwise(movie_df['budget'])) # drop all empty cells (null) from release_date,production_companies,production_countries,genres movie_df = movie_df.filter(movie_df['release_date'].isNotNull()) movie_df = movie_df.filter(movie_df['production_companies'].isNotNull()) movie_df = movie_df.filter(movie_df['production_countries'].isNotNull()) movie_df = movie_df.filter(movie_df['genres'].isNotNull()) # drop all cell with 'null' string from release_date,production_companies,production_countries,genres movie_df = movie_df.filter((movie_df['release_date'] != 'null') & (movie_df['release_date'] != '[null]')) movie_df = movie_df.filter((movie_df['production_companies'] != 'null') & (movie_df['production_companies'] != '[null]')) movie_df = movie_df.filter((movie_df['production_countries'] != 'null') & (movie_df['production_countries'] != '[null]')) movie_df = movie_df.filter((movie_df['genres'] != 'null') & (movie_df['genres'] != '[null]')) movie_df = movie_df.withColumn( 'genres_temp', functions.split(functions.regexp_extract('genres', '\[(.*)\]', 1), ',')) unique_genre = movie_df.select( functions.explode('genres_temp').alias('unique_genre')).groupby( 'unique_genre').count() unique_genre = unique_genre.withColumn( 'unique_genre', functions.trim(functions.col('unique_genre'))) unique_genre = unique_genre.withColumn("unique_genre", blank_as_str_null("unique_genre")) unique_genre = unique_genre.groupby('unique_genre').agg({ 'count': 'sum' }).sort(functions.desc("sum(count)")).cache() list_top_unique_genre = get_top_features('unique_genre', unique_genre) movie_df = movie_df.withColumn( 'production_companies_temp', functions.split( functions.regexp_extract('production_companies', '\[(.*)\]', 1), ',')) unique_production_companies = movie_df.select( functions.explode('production_companies_temp').alias( 'unique_production_companies')).groupby( 'unique_production_companies').count() unique_production_companies = unique_production_companies.withColumn( 'unique_production_companies', 
functions.trim(functions.col('unique_production_companies'))) unique_production_companies = unique_production_companies.withColumn( "unique_production_companies", blank_as_str_null("unique_production_companies")) unique_production_companies = unique_production_companies.groupby( 'unique_production_companies').agg({ 'count': 'sum' }).sort(functions.desc("sum(count)")).cache() list_top_unique_production_companies = get_top_features( 'unique_production_companies', unique_production_companies) movie_df = movie_df.withColumn( 'production_countries_temp', functions.split( functions.regexp_extract('production_countries', '\[(.*)\]', 1), ',')) unique_production_countries = movie_df.select( functions.explode('production_countries_temp').alias( 'unique_production_countries')).groupby( 'unique_production_countries').count() unique_production_countries = unique_production_countries.withColumn( 'unique_production_countries', functions.trim(functions.col('unique_production_countries'))) unique_production_countries = unique_production_countries.withColumn( "unique_production_countries", blank_as_str_null("unique_production_countries")) unique_production_countries = unique_production_countries.groupby( 'unique_production_countries').agg({ 'count': 'sum' }).sort(functions.desc("sum(count)")).cache() list_top_unique_production_countries = get_top_features( 'unique_production_countries', unique_production_countries) movie_df = movie_df.drop(movie_df.production_companies_temp).drop( movie_df.production_countries_temp).drop(movie_df.genres_temp) movie_rdd = movie_df.rdd.map(list) list_top_movie_features = [ list_top_unique_production_companies, list_top_unique_production_countries, list_top_unique_genre ] '''#######################################################################################################''' '''#### 1st Function to random production_companies and 2nd function to pick top production_companies ####''' # new_movie_rdd = movie_rdd.map(lambda j: get_random_movie_features(j)) new_movie_rdd = movie_rdd.map( lambda j: get_top_movie_features(j, list_top_movie_features)) '''####################################################################################################### ''' new_schema_movie_csv = StructType([ StructField('id', StringType(), True), StructField('title', StringType(), True), StructField('production_companies', StringType(), True), StructField('production_countries', StringType(), True), StructField('genres', StringType(), True), StructField('release_date', StringType(), True), StructField('revenue', StringType(), True), StructField('budget', StringType(), True), StructField('avg_rating', StringType(), True) ]) new_movie_df = rdd_to_df(new_movie_rdd, new_schema_movie_csv) print(new_movie_df.show()) meta_movie_csv = 'dataset/movies_metadata.csv' meta_movie_df = spark.read.load(meta_movie_csv, format="csv", sep=",", header="true", escape='"').cache() meta_movie_df = meta_movie_df.select("id", "runtime").withColumnRenamed( 'id', 'id_meta') meta_movie_df = meta_movie_df.withColumn( 'runtime_temp', meta_movie_df.runtime.cast( DoubleType())).drop('runtime').withColumnRenamed( 'runtime_temp', 'runtime').cache() joined_movie_df = new_movie_df.join( meta_movie_df, new_movie_df.id == meta_movie_df.id_meta).drop( meta_movie_df.id_meta).cache() temp_joined_movie_df = joined_movie_df.filter( joined_movie_df['runtime'].isNotNull()) joined_movie_df = joined_movie_df.withColumn( "temp", functions.when(functions.col('runtime').isNotNull(), col('runtime')).otherwise(None)).drop('temp').cache() 
joined_movie_df = joined_movie_df.fillna(1000, subset=['runtime']) mean_runtime = temp_joined_movie_df.groupBy().agg( functions.avg(temp_joined_movie_df['runtime'])).collect()[0][0] joined_movie_df = joined_movie_df.withColumn( 'runtime', functions.when(joined_movie_df['runtime'] == 1000, mean_runtime).otherwise(joined_movie_df['runtime'])) print(joined_movie_df.show()) outputs = 'new_clean_dataset/clean_top_movie_with_rating_csv.csv' outputs = 'new_clean_dataset/clean_random_movie_with_rating_csv.csv'