Example No. 1
from pyspark.sql.functions import lower, regexp_replace, trim


def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces."""
    no_punct = regexp_replace(column, r"\p{Punct}", "")
    lowered = lower(no_punct)
    cleaned = trim(lowered)
    return cleaned

def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.
    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.
    Args:
        column (Column): A Column containing a sentence.
    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    return (trim(regexp_replace(lower(column),'[^a-zA-Z0-9 ]','')).alias('sentence'))
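
# A minimal usage sketch of removePunctuation, assuming a local SparkSession and
# made-up sample sentences.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("removePunctuationDemo").getOrCreate()
demo = spark.createDataFrame([("  Hi, you!",), ("No under_score!",)], ["sentence"])
demo.select(removePunctuation(demo.sentence)).show(truncate=False)
# -> 'hi you' and 'no underscore'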
Example No. 3
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    #column_val = regexp_replace(column, "\p{Punct}", "")
    #return trim(lower(column_val))
    word = lower(trim(regexp_replace(regexp_replace(column, r'[^\w\s]', ''), '_', ''))).alias("word")
    return word
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    
#     assert(isinstance(column, pyspark.sql.column.Column))
    assert(str(type(column)) == "<class 'pyspark.sql.column.Column'>")    
    
    columnNoPunct = regexp_replace(column, "[^a-zA-Z0-9 ]", "")
#     columnNoPunct = regexp_replace(column, string.punctuation, "")    
    columnLowerCase = lower(columnNoPunct)
    columnTrimmed = trim(columnLowerCase)
    
    return columnTrimmed
Example No. 5
 def colTrim(self, columns):
     # validCols is assumed to be defined elsewhere in the class as the set of
     # string columns that are safe to trim.
     exprs = [trim(col(c)).alias(c)
              if (c in columns) and (c in validCols)
              else c
              for (c, t) in self.__df.dtypes]
     self.__df = self.__df.select(*exprs)
Example No. 6
]))
id_pdv = id_pdv.dropDuplicates()

# Export
id_pdv.write.parquet("./id_pdv_dam")

# Transactions ====
df = spark.table('x_compensation.transactions')

df = (df.filter(fn.col("b21_code_pays_du_systeme_dacceptation") == 250).filter(
    fn.col("s04_code_operation") == 100).select(
        fn.col("b14_siret").alias("SRT"),
        fn.col("b08_environnement_reglementaire__technique_de_la_transaction").
        alias("ERT"),
        fn.col("b15_code_activite_de_laccepteur___code_mcc").alias("MCC"),
        fn.trim(fn.col("b17_libelle_enseigne_commerciale")).alias("RSN"),
        fn.trim(
            fn.col("b16_numero_de_contrat_accepteur")).alias("ID_PDV_BQE")))
# fn.col("s06_identifiant_etablissement_donneur_dordre").alias("REF_ACQ")))

# Merged
dfMerged = df.join(ert, df["ERT"] == ert["ert"], "left").drop(ert["ert"])

# Cleaning
# Contract number must have length == 7
dfMerged = (dfMerged.withColumn(
    "ID_PDV_BQE",
    fn.when(
        fn.length(fn.col("ID_PDV_BQE")) == 10,
        fn.substring(fn.col("ID_PDV_BQE"), 3,
                     7)).otherwise(fn.col("ID_PDV_BQE"))))
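
# For a hypothetical 10-character contract id such as 'XX1234567Z', substring(col, 3, 7)
# keeps characters 3 through 9 ('1234567'), dropping the 2-character prefix and the
# trailing character so that the cleaned id has length 7.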
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])

names = spark.read.schema(schema).option(
    "sep", " ").csv(f"{SPARK_DATA_PATH}/Marvel-names.txt")

lines = spark.read.text(f"{SPARK_DATA_PATH}/Marvel-graph.txt")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn(
    "id",
    func.split(func.trim(func.col("value")), " ")[0]).withColumn(
        "connections",
        func.size(func.split(func.trim(func.col("value")), " ")) -
        1).groupBy("id").agg(func.sum("connections").alias("connections"))
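
# Without the trim, a line such as ' 5306 123 456' would split into ['', '5306', '123', '456'],
# yielding an empty id and an inflated connection count.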

# Show the minimum number of connections
minConnections = connections.agg(
    func.min('connections').alias('min_connections'))
print(
    f'Minimum number of connections is {minConnections.first().min_connections}'
)

# Show all superheroes with 1 connection
connections = connections.filter(func.col('connections') == 1)
connections = connections.join(names, 'id').select('name', 'connections')
connections.show()
Example No. 8
    def transform(self, sources: dict) -> DataFrame:
        ri = self.invoice_dataframe(sources['rptt_invoice'])

        rst = self.read_source(source=sources['rptm_sbu_subset_txt'])

        cmf = self.read_source(source=sources['customer_mapping'])
        cmf = cmf.withColumnRenamed('sales_rep', 'sales_rep_override')
        cmf = cmf.withColumnRenamed('sales_rep_id', 'sales_rep_id_override')
        cmf = cmf.withColumnRenamed('end_market', 'cmf_end_market')

        mmf = self.read_source(source=sources['material_mapping'])
        srtr = self.read_source(source=sources['sales_rep_to_region'])

        rsrt = self.read_source(source=sources['rptm_sales_rep_txt'])
        rsrt = rsrt.withColumnRenamed('med_desc', 'sales_rep_original')

        edataA = self.read_source(source=sources['exclusion_dataA'])
        edataA = edataA.withColumnRenamed('sold_customer_id',
                                          'edataA_sold_customer_id')

        # Source contains system_id/material_id pairs that need to be excluded
        excmat = self.read_source(source=sources['exclude_mat'])

        cerd = self.read_source(source=sources['currency_exchange_rates'])
        cerd = fixCurExchangeToAvg(self, cerd)
        cers = cerd.select('currency_code_from', 'cur_year', 'cur_month',
                           'conversion_rate_multiplier')
        cers = cers.withColumnRenamed('currency_code_from',
                                      'std_currency_code_from')
        cers = cers.withColumnRenamed('cur_year', 'std_cur_year')
        cers = cers.withColumnRenamed('cur_month', 'std_cur_month')
        cers = cers.withColumnRenamed('conversion_rate_multiplier',
                                      'std_conversion_rate_multiplier')

        dcust_sold = self.read_source(source=sources['dim_customer'])
        dcust_sold = dcust_sold.withColumnRenamed('dim_customer_id',
                                                  'sold_dim_customer_id')

        dcust_ship = self.read_source(source=sources['dim_customer'])
        dcust_ship = dcust_ship.withColumnRenamed('dim_customer_id',
                                                  'ship_dim_customer_id')

        dcust_brand = self.read_source(source=sources['dim_customer'])
        dcust_brand = dcust_brand.withColumnRenamed('dim_customer_id',
                                                    'brand_dim_customer_id')

        dloc_ship = self.read_source(source=sources['dim_location'])
        dloc_ship = dloc_ship.withColumnRenamed('dim_location_id',
                                                'ship_from_dim_location_id')

        dloc_inv = self.read_source(source=sources['dim_location'])
        dloc_inv = dloc_inv.withColumnRenamed('dim_location_id',
                                              'invoice_dim_location_id')

        dmat = self.read_source(source=sources['dim_material'])
        dmat = dmat.withColumnRenamed('dim_material_id',
                                      'ship1_dim_material_id')

        df = (ri.join(excmat, [
            excmat.material_id == ri.ship1_material_id_int, excmat.system
            == ri.system_id
        ], 'left_anti').join(
            rst, [rst.sbu_subset_id == ri.sbu_subset_id], 'left_outer').join(
                mmf, [mmf.material == ri.mmf_material],
                'left_outer').join(dmat, [
                    dmat.billing_system == ri.system_id, dmat.material_id
                    == ri.ship_mat1_id, dmat.end_market_or_prime == F.when(
                        ri.prime_flag == 1, 'Prime').otherwise('Non-Prime')
                ], 'left_outer').join(cmf, [
                    F.upper(F.trim(cmf.sold_to_ship_to))
                    == ri.commercial_print_customer_key,
                    F.upper(F.trim(cmf.cmf_end_market)) == F.upper(
                        dmat.end_market)
                ], 'left_outer').join(srtr, [
                    srtr.sales_rep_id == cmf.sales_rep_id_override
                ], 'left_outer').join(cerd, [
                    cerd.currency_code_from == ri.currency_id, cerd.cur_year
                    == ri.inv_year, cerd.cur_month == ri.inv_month
                ], 'left_outer').join(cers, [
                    cers.std_currency_code_from == ri.std_cost_currency_id,
                    cers.std_cur_year == ri.inv_year, cers.std_cur_month
                    == ri.inv_month
                ], 'left_outer').join(dcust_sold, [
                    dcust_sold.billing_system == ri.system_id,
                    dcust_sold.customer_id == ri.sold_customer_id
                ], 'left_outer').join(dcust_ship, [
                    dcust_ship.billing_system == ri.system_id,
                    dcust_ship.customer_id == ri.ship_customer_id
                ], 'left_outer').join(dcust_brand, [
                    dcust_brand.billing_system == ri.system_id,
                    dcust_brand.customer_id == ri.brand_owner
                ], 'left_outer').join(dloc_ship, [
                    dloc_ship.location_id == ri.ship_location_id
                ], 'left_outer').join(
                    dloc_inv, [dloc_inv.location_id == ri.mfg_location_id],
                    'left_outer').join(edataA, [
                        edataA.edataA_sold_customer_id
                        == ri.sold_customer_id_lstrip_0, ri.system_id == 'S3',
                        ri.rev_acct_id == 'R6000'
                    ], 'left_anti').join(
                        rsrt, [rsrt.sales_rep_id == ri.ri_sales_rep_id],
                        'left_outer').select(
                            ri.system_id, ri.invoice_id, ri.line_number,
                            ri.month, ri.source_type, ri.rev_acct_id,
                            ri.weight_qty, ri.currency_id,
                            ri.std_cost_currency_id, ri.inv_date,
                            ri.quality_class, ri.sale_type,
                            ri.invoice_line_value, ri.line_qty,
                            ri.invoice_uom_id, ri.inv_line_std_cost, ri.period,
                            ri.year, ri.sales_order, ri.ri_sales_rep_id,
                            ri.line_desc1, rst.med_desc, mmf.cp_subset,
                            cmf.channel, cmf.drop_ship_into_stock,
                            cmf.sales_rep_override, cmf.cmf_end_market,
                            cmf.sales_rep_id_override,
                            cerd.conversion_rate_multiplier,
                            cers.std_conversion_rate_multiplier,
                            dmat.ship1_dim_material_id, dmat.product_code,
                            dmat.force_product_code, dmat.nominal_basis_weight,
                            dmat.material_id, dmat.end_market,
                            dloc_ship.ship_from_dim_location_id,
                            dloc_inv.invoice_dim_location_id,
                            dcust_ship.ship_dim_customer_id,
                            dcust_sold.sold_dim_customer_id,
                            dcust_brand.brand_dim_customer_id,
                            rsrt.sales_rep_original, srtr.region,
                            ri.invoice_volume))

        df = df.where(
            "case when system_id = 'S3' then product_code else '~' end not in ('SC', 'CR')"
        )

        df = df.withColumn('iptmeta_source_system', F.lit('dataA'))
        df = df.withColumn('bol_number', F.lit(MISSING_NUMBER))

        df = df.withColumn(
            'product_sold_flag',
            F.when((df.weight_qty.isNull()) | (df.weight_qty == 0),
                   F.lit('N')).otherwise(F.lit('Y')))

        df = df.withColumn(
            'fx_conversion_to_usd',
            F.coalesce(
                F.when(df.currency_id == 'USD', 1).otherwise(
                    df.conversion_rate_multiplier.cast(T.DoubleType())),
                F.lit(MISSING_NUMBER)))

        df = df.withColumn(
            'std_fx_conversion_to_usd',
            F.coalesce(
                F.when(df.std_cost_currency_id == 'USD', 1).otherwise(
                    df.std_conversion_rate_multiplier.cast(T.DoubleType())),
                F.lit(MISSING_NUMBER)))

        df = df.withColumn('grade', df.product_code)

        df = df.withColumn('invoice_date', F.to_date(df.inv_date))

        df = prime_enrich(df)

        df = df.withColumn('sales_order_number',
                           F.coalesce(df.sales_order, F.lit('0')))

        df = df.withColumn(
            'sale_type',
            F.when(df.sale_type == 'I', F.lit('Internal')).when(
                df.sale_type == 'E',
                F.lit('External')).otherwise(df.sale_type))

        df = df.withColumn(
            'subset',
            F.coalesce(df.cp_subset, df.med_desc, F.lit(NOT_APPLICABLE_DESC)))

        df = (
            df.withColumn(
                'claims',
                F.when(df.rev_acct_id.isin('R4900', 'R4350'),
                       df.invoice_line_value * df.fx_conversion_to_usd).
                otherwise(MISSING_NUMBER)).withColumn(
                    'discounts',
                    F.when(df.rev_acct_id.isin('R4500'),
                           df.invoice_line_value *
                           df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)).
            withColumn("freight_invoice_calc", F.lit('actual')).withColumn(
                'freight_invoice',
                F.when(df.rev_acct_id.isin('R8200'),
                       df.invoice_line_value * df.fx_conversion_to_usd).
                otherwise(MISSING_NUMBER)).withColumn(
                    'freight_upcharge',
                    F.when(df.rev_acct_id.isin('R0300'),
                           df.invoice_line_value *
                           df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)).
            withColumn(
                'gross_price',
                F.when(df.rev_acct_id.isin('R0100', 'R0500', 'R0700', 'R0105'),
                       df.invoice_line_value * df.fx_conversion_to_usd).
                otherwise(MISSING_NUMBER)).withColumn(
                    'other_deductions',
                    F.when(df.rev_acct_id.isin('R5300'),
                           df.invoice_line_value *
                           df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)).
            withColumn(
                'standard_cost',
                F.coalesce(
                    df.inv_line_std_cost *
                    df.std_fx_conversion_to_usd,
                    F.lit(MISSING_NUMBER))).withColumn(
                        'rebates',
                        F.when(
                            df.rev_acct_id.isin(
                                'R4110', 'R4130'), df.invoice_line_value *
                            df.fx_conversion_to_usd).otherwise(MISSING_NUMBER))
            # TODO Confirm exclusions and/or data predicate should be here
            .withColumn(
                'service_allowances',
                F.when(df.rev_acct_id.isin('R6000'), df.invoice_line_value *
                       df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)))

        df = df.withColumn(
            'msf',
            F.when(df.invoice_uom_id == 'MSF',
                   df.line_qty).when(df.invoice_uom_id == 'M2',
                                     df.line_qty * .0107639).otherwise(0))
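        # The 0.0107639 factor converts square metres to thousand square feet:
        # 1 m2 = 10.7639 ft2, divided by 1,000 to express the result in MSF.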

        df = df.withColumn('nominal_tons',
                           df.nominal_basis_weight * df.msf / 2000)

        df = df.withColumn(
            'net_price',
            df.gross_price + df.discounts + df.rebates + df.claims +
            df.freight_upcharge + df.other_deductions + df.service_allowances)

        df = df.withColumn(
            'standard_gross_margin',
            df.net_price - (df.standard_cost + df.freight_invoice))

        df = dataA_sales_rep_override(df)
        df = df.withColumn(
            'sales_rep_id',
            F.coalesce(df.sales_rep_id_override, df.ri_sales_rep_id,
                       F.lit(MISSING_NUMBER)))

        df = (
            df.withColumn(
                'ship_from_dim_location_id',
                F.coalesce(
                    df.ship_from_dim_location_id,
                    F.lit(MISSING_STRING_ID))).withColumn(
                        'invoice_dim_location_id',
                        F.coalesce(
                            df.invoice_dim_location_id,
                            F.lit(MISSING_STRING_ID))).withColumn(
                                'ship1_dim_material_id',
                                F.coalesce(
                                    df.ship1_dim_material_id,
                                    F.lit(MISSING_STRING_ID))).withColumn(
                                        'channel',
                                        F.coalesce(
                                            df.channel,
                                            F.lit(MISSING_DESC))).withColumn(
                                                'drop_ship_into_stock',
                                                F.coalesce(
                                                    df.drop_ship_into_stock,
                                                    F.lit(MISSING_DESC))).
            withColumn('region', F.coalesce(
                df.region, F.lit(MISSING_DESC))).withColumn(
                    'ship_dim_customer_id',
                    F.coalesce(
                        df.ship_dim_customer_id,
                        F.lit(MISSING_STRING_ID))).withColumn(
                            'sold_dim_customer_id',
                            F.coalesce(
                                df.sold_dim_customer_id,
                                F.lit(MISSING_STRING_ID))).withColumn(
                                    'brand_dim_customer_id',
                                    F.coalesce(
                                        df.brand_dim_customer_id,
                                        F.lit(MISSING_STRING_ID))).withColumn(
                                            'invoice_period',
                                            F.lpad(df.month, 6, '0')))

        df = (df.withColumnRenamed(
            'system_id', 'billing_system').withColumnRenamed(
                'rev_acct_id', 'invoice_line_code').withColumnRenamed(
                    'invoice_id', 'invoice_number').withColumnRenamed(
                        'line_number',
                        'invoice_line_number').withColumnRenamed(
                            'source_type',
                            'invoice_source_type').withColumnRenamed(
                                'channel',
                                'commercial_print_channel').withColumnRenamed(
                                    'drop_ship_into_stock',
                                    'commercial_print_mode').withColumnRenamed(
                                        'region', 'commercial_print_region').
              withColumnRenamed('currency_id',
                                'invoiced_currency').withColumnRenamed(
                                    'weight_qty',
                                    'actual_tons').withColumnRenamed(
                                        'period',
                                        'report_month').withColumnRenamed(
                                            'year',
                                            'report_year').withColumnRenamed(
                                                'line_desc1',
                                                'invoice_line_desc_1'))

        df = df.select(
            df.billing_system, df.invoice_number, df.invoice_line_number,
            df.invoice_period, df.invoice_source_type, df.invoice_line_code,
            df.iptmeta_source_system, df.product_sold_flag,
            df.commercial_print_channel, df.commercial_print_mode,
            df.fx_conversion_to_usd, df.grade, df.invoice_date,
            df.ship_from_dim_location_id, df.invoiced_currency,
            df.ship1_dim_material_id, df.prime, df.sales_order_number,
            df.sale_type, df.sales_representative, df.ship_dim_customer_id,
            df.sold_dim_customer_id, df.brand_dim_customer_id, df.subset,
            df.actual_tons, df.claims, df.discounts, df.freight_invoice,
            df.freight_invoice_calc, df.freight_upcharge, df.gross_price,
            df.msf, df.net_price, df.nominal_tons, df.other_deductions,
            df.rebates, df.service_allowances, df.standard_cost,
            df.standard_gross_margin, df.invoice_dim_location_id,
            df.commercial_print_region, df.invoice_volume, df.invoice_uom_id,
            df.bol_number, df.report_month, df.report_year, df.sales_rep_id,
            df.invoice_line_desc_1)

        return df
Example No. 9
 def extract_client_df(self):
     client_dictionary = {
         'card_dim':
         self.client.customers().data.select(
             col('card_code').alias(self.config_dict['identity_type_code']),
             'card_id', 'card_birth_date', 'card_termination_date',
             'card_address_valid_flag', 'card_address_country_code',
             'card_analyse_now_suppress_flag', 'card_suppress_flag'),
         'desc_dt':
         self.sqlContext.
         table('market_x_datalake.7_market_smartclub_members_c').select(
             F.concat(F.lit('0'), F.col('member_id')).alias("mem_id"),
             'deceased_date').filter((F.col('deceased_date') == '00000000')
                                     | (F.col('deceased_date').isNull())),
         'prod_dim':
         self.client.products().data.select('prod_code',
                                            'prod_hier_l20_code',
                                            'prod_desc'),
         'store_dim':
         self.client.stores().data.select('banner_name', 'store_code'),
         'dur_period':
         self.client.items(fisWeekId=self.config_dict['event_end'],
                           weeks=self.config_dict['event_weeks']).data.
         select(
             col('card_code').alias(self.config_dict['identity_type_code']),
             'prod_code',
             col('net_spend_amt').cast(IntegerType()).alias('spend'),
             'prod_id',
             col('transaction_code').alias('transaction_fid'),
             'transaction_dttm', 'fis_week_id').filter(
                 col(self.config_dict['identity_type_code']).isNotNull()
                 & (trim(col(self.config_dict['identity_type_code'])) != '')
                 & (trim(col('prod_code')) != '')
                 & col('fis_week_id').between(
                     str(self.config_dict['event_start']),
                     str(self.config_dict['event_end']))),
         'pre_period':
         self.client.items(fisWeekId=self.config_dict['pre_end'],
                           weeks=self.config_dict['pre_weeks']).data.
         select(
             col('card_code').alias(self.config_dict['identity_type_code']),
             'prod_code',
             col('transaction_code').alias('transaction_fid'), 'prod_id',
             col('net_spend_amt').cast(IntegerType()).alias('spend'),
             'fis_week_id').filter(
                 col(self.config_dict['identity_type_code']).isNotNull()
                 & (trim(col(self.config_dict['identity_type_code'])) != '')
                 & (trim(col('prod_code')) != '')
                 & col('fis_week_id').between(
                     str(self.config_dict['pre_start']),
                     str(self.config_dict['pre_end']))),
         'post_period':
         self.client.items(fisWeekId=self.config_dict['post_end'],
                           weeks=self.config_dict['post_weeks']).data.
         select(
             col('card_code').alias(self.config_dict['identity_type_code']),
             'prod_code',
             col('transaction_code').alias('transaction_fid'), 'prod_id',
             col('net_spend_amt').cast(IntegerType()).alias('spend'),
             'fis_week_id').filter(
                 col(self.config_dict['identity_type_code']).isNotNull()
                 & (trim(col(self.config_dict['identity_type_code'])) != '')
                 & (trim(col('prod_code')) != '')
                 & col('fis_week_id').between(
                     str(self.config_dict['post_start']),
                     str(self.config_dict['post_end']))),
         'dur_period_basket':
         self.client.baskets(fisWeekId=self.config_dict['event_end'],
                             weeks=self.config_dict['event_weeks']).data.
         select(
             col('card_code').alias(self.config_dict['identity_type_code']),
             col('basket_spend_amt').cast(IntegerType()).alias('spend'),
             col('basket_item_qty').cast(
                 IntegerType()).alias('basket_item_qty'),
             col('basket_item_qty').alias('item'),
             col('transaction_code').alias('transaction_fid'),
             'transaction_dttm', 'fis_week_id').filter(
                 col(self.config_dict['identity_type_code']).isNotNull()
                 & (trim(col(self.config_dict['identity_type_code'])) != '')
                 & col('fis_week_id').between(
                     str(self.config_dict['event_start']),
                     str(self.config_dict['event_end']))),
         'pre_period_basket':
         self.client.baskets(fisWeekId=self.config_dict['pre_end'],
                             weeks=self.config_dict['pre_weeks']).data.
         select(
             col('card_code').alias(self.config_dict['identity_type_code']),
             col('transaction_code').alias('transaction_fid'),
             col('basket_spend_amt').cast(IntegerType()).alias('spend'),
             col('basket_item_qty').cast(
                 IntegerType()).alias('basket_item_qty'),
             col('basket_item_qty').alias('item'), 'fis_week_id').filter(
                 col(self.config_dict['identity_type_code']).isNotNull()
                 & (trim(col(self.config_dict['identity_type_code'])) != '')
                 & col('fis_week_id').between(
                     str(self.config_dict['pre_start']),
                     str(self.config_dict['pre_end']))),
         'post_period_basket':
         self.client.baskets(fisWeekId=self.config_dict['post_end'],
                             weeks=self.config_dict['post_weeks']).data.
         select(
             col('card_code').alias(self.config_dict['identity_type_code']),
             col('transaction_code').alias('transaction_fid'),
             col('basket_spend_amt').cast(IntegerType()).alias('spend'),
             col('basket_item_qty').cast(
                 IntegerType()).alias('basket_item_qty'), 'fis_week_id',
             col('basket_item_qty').alias('item')).filter(
                 col(self.config_dict['identity_type_code']).isNotNull()
                 & (trim(col(self.config_dict['identity_type_code'])) != '')
                 & col('fis_week_id').between(
                     str(self.config_dict['post_start']),
                     str(self.config_dict['post_end']))),
         'date_dim':
         self.client.calendar().data
     }
     BaseModule._dict.update(client_dictionary)
    .option("inferSchema", "false")
    .schema(tasteprofile_schema)
    .load("hdfs:///data/msd/tasteprofile/triplets.tsv")
)

tasteprofile.show(20, False)

# Load in the mismatched data.
mismatches_text = (
    spark.read.format("text")
    .load('hdfs:///data/msd/tasteprofile/mismatches/sid_mismatches.txt')
)

# Parse the fixed width text data to format it.
mismatches = mismatches_text.select(
    F.trim(F.col('value').substr(9, 18)).alias('Song_ID').cast(StringType()),
    F.trim(F.col('value').substr(28, 18)).alias('Track_ID').cast(StringType())
)

mismatches.show(20, False)

# Load in the accepted mismatched data.
mismatches_accepted_text = (
    spark.read.format("text")
    .load('hdfs:///data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt')
)

# Parse the fixed width text data to format it.
mismatches_accepted = mismatches_accepted_text.select(
    F.trim(F.col('value').substr(11, 18)).alias('Song_ID').cast(StringType()),
    F.trim(F.col('value').substr(30, 18)).alias('Track_ID').cast(StringType())
)
Example No. 11
def remove_multiple_spaces(col: Column):
    """Replace multiple spaces with single spaces"""
    return F.trim(F.regexp_replace(col, " +", " "))
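
# A minimal usage sketch of remove_multiple_spaces, assuming a local SparkSession
# and a made-up sample value.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("singleSpaceDemo").getOrCreate()
df = spark.createDataFrame([("too   many    spaces",)], ["txt"])
df.select(remove_multiple_spaces(F.col("txt")).alias("txt")).show(truncate=False)
# -> 'too many spaces'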
Example No. 12
        upper(col('Description'))).show(2)

df.selectExpr(
        'Description',
        'lower(Description)',
        'upper(lower(Description))').show(2)

# select description, lower(Description), upper(lower(Description)) from dfTable


from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim

df.select(
        ltrim(lit('         HELLO           ')).alias('ltrim'),
        rtrim(lit('         HELLO           ')).alias('rtrim'),
        trim(lit('         HELLO           ')).alias('trim'),
        lpad(lit('HELLO'), 3, ' ').alias('lp'),
        rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
        'ltrim(         "HELLO"           ) as ltrim',
        'rtrim(         "HELLO"           ) as rtrim',
        'trim(         "HELLO"           ) as trim',
        'lpad("HELLO", 3, " ") as lp',
        'rpad("HELLO", 3, " ")as rp').show(2)

# select 
#   ltrim('     HELLO       '),
#   rtrim('     HELLO       '),
#   trim('      HELLO       '),
#   lpad('HELLO', 3, ' '),
Example No. 13
from pyspark.sql.functions import upper, col, regexp_extract, regexp_replace

#-----------------------------------
#DOWNTOWN
#-----------------------------------

dt = '/Users/valerieangulo/Downtown/fuzzymatching/songtrust_match.csv'
dtdf = sqlContext.read.format("com.databricks.spark.csv").option(
    "header", "true").option("inferSchema", "true").load(dt)
mydt = dtdf
mydt = dtdf[['Custom ID', 'Title', 'COMPOSER']]
#mydt = mydt.withColumn('ratio', F.lit(0)) #don't need if we join DFs with levenshtein

sortdt = mydt
#trim whitespaces
sortdt = sortdt.withColumn('Title', F.trim(sortdt.Title))
sortdt = sortdt.withColumn('COMPOSER', F.trim(sortdt.COMPOSER))
#DT tabs
sortdt = sortdt.withColumn("Title", regexp_replace(col("Title"), '[\t]+', ''))
sortdt = sortdt.withColumn("COMPOSER",
                           regexp_replace(col("COMPOSER"), '[\t]+', ''))
#DT new lines
sortdt = sortdt.withColumn("Title", regexp_replace(col("Title"), '[\n]+', ''))
sortdt = sortdt.withColumn("COMPOSER",
                           regexp_replace(col("COMPOSER"), '[\n]+', ''))
#make caps
sortdt = sortdt.withColumn('Title', F.upper(
    col('Title')))  #test(F.upper(['Title', 'COMPOSER']))
sortdt = sortdt.withColumn('COMPOSER', F.upper(col('COMPOSER')))
#remove quotes, unknowns, |, -
sortdt = sortdt.withColumn("Title", regexp_replace(col("Title"), '"', ''))
Example No. 14
# Databricks notebook source
# BINARY CLASSIFICATION (0: flight on time, 1: flight delayed)

# COMMAND ----------

#PART 1: DEFINE THE DATA TYPES OF THE TRAIN FILE
#Read the csv file with headers, importing everything as string first
df = sqlContext.read.format("csv").option("header", "true").load("dbfs:/dataset/datos_preprocesados.csv")

from pyspark.sql.types import *
from pyspark.sql.functions import trim, col

df=df.select(trim(col("MONTH")).cast(IntegerType()).alias("MONTH"),
                trim(col("HOLIDAYS")).cast(IntegerType()).alias("HOLIDAYS"),
                trim(col("DAY_OF_MONTH")).cast(IntegerType()).alias("DAY_OF_MONTH"),
                trim(col("DAY_OF_WEEK")).cast(IntegerType()).alias("DAY_OF_WEEK"),
                trim(col("UNIQUE_CARRIER")).cast(IntegerType()).alias("UNIQUE_CARRIER"),
                trim(col("TAIL_NUM")).cast(IntegerType()).alias("TAIL_NUM"),
                trim(col("FL_NUM")).cast(IntegerType()).alias("FL_NUM"),
                trim(col("ORIGIN_AIRPORT_ID")).cast(IntegerType()).alias("ORIGIN_AIRPORT_ID"),
                trim(col("ORIGIN_CITY_MARKET_ID")).cast(IntegerType()).alias("ORIGIN_CITY_MARKET_ID"),
                trim(col("ORIGIN_STATE_NM")).cast(IntegerType()).alias("ORIGIN_STATE_NM"),
                trim(col("DEST_AIRPORT_ID")).cast(IntegerType()).alias("DEST_AIRPORT_ID"),
                trim(col("DEST_CITY_MARKET_ID")).cast(IntegerType()).alias("DEST_CITY_MARKET_ID"),
                trim(col("DEST_STATE_NM")).cast(IntegerType()).alias("DEST_STATE_NM"),
                trim(col("CRS_DEP_TIME")).cast(IntegerType()).alias("CRS_DEP_TIME"),
                trim(col("DEP_TIME")).cast(IntegerType()).alias("DEP_TIME"),
                trim(col("DEP_DELAY")).cast(IntegerType()).alias("DEP_DELAY"),
                trim(col("DEP_DELAY_NEW")).cast(IntegerType()).alias("DEP_DELAY_NEW"),
                trim(col("DEP_DEL15")).cast(IntegerType()).alias("DEP_DEL15"),
                trim(col("DEP_DELAY_GROUP")).cast(IntegerType()).alias("DEP_DELAY_GROUP"),
 def to_null(c):
     return when(~(col(c).isNull() | isnan(col(c)) | (trim(col(c)) == "")),
                 col(c))
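
# A minimal usage sketch of to_null, assuming a local SparkSession and sample data;
# it normalises blank (or NaN) string cells to real nulls in every column.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, trim, when

spark = SparkSession.builder.appName("toNullDemo").getOrCreate()
df = spark.createDataFrame([("a", " "), ("", "b")], ["x", "y"])
df.select([to_null(c).alias(c) for c in df.columns]).show()
# -> row 1: ('a', null), row 2: (null, 'b')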
Example No. 16
 def _trim(col_name, args):
     return F.trim(F.col(col_name))
Example No. 17
import os
from pyspark.sql.functions import udf, trim, lower
from pyspark.sql.types import StringType

s3 = "s3a://shwes3udacapstone/"
DEMOGRAPHICS_DATA_PATH = "data/raw/demographics/us-cities-demographics.csv"
input_log_data_file = os.path.join(s3, DEMOGRAPHICS_DATA_PATH)
udf_capitalize_lower = udf(lambda x: str(x).lower().capitalize(), StringType())

df_demo = spark.read.format("csv").option("delimiter", ";").option(
    "header", "true").option("encoding", "UTF-8").load(input_log_data_file)
df_demo = df_demo.withColumnRenamed("State Code",
                                    "state_code").withColumnRenamed(
                                        "Median Age",
                                        "median_age").withColumnRenamed(
                                            "City", "city").withColumnRenamed(
                                                "Total Population",
                                                "population")
df_demo = df_demo.select("city", "state_code", "median_age", "population")
df_state = spark.read.parquet(s3 + "data/processed/codes/us_state")
df_demo = df_demo.join(df_state, ["state_code"])
df_demo = df_demo.withColumn("city", lower(trim(df_demo.city)))
df_demo.write.mode("overwrite").parquet(s3 + 'data/processed/city/')
Example No. 18
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import pandas as pd
import cdsw

#initialize Spark Session
spark = SparkSession.builder \
      .appName("Telco Customer Churn SVM") \
      .config('spark.shuffle.service.enabled',"True") \
      .getOrCreate()

#Define Dataframe Schema     
schemaData = StructType([
    StructField("state", StringType(), True),
    StructField("account_length", DoubleType(), True),
    StructField("area_code", StringType(), True),
    StructField("phone_number", StringType(), True),
    StructField("intl_plan", StringType(), True),
    StructField("voice_mail_plan", StringType(), True),
    StructField("number_vmail_messages", DoubleType(), True),
    StructField("total_day_minutes", DoubleType(), True),
    StructField("total_day_calls", DoubleType(), True),
    StructField("total_day_charge", DoubleType(), True),
    StructField("total_eve_minutes", DoubleType(), True),
    StructField("total_eve_calls", DoubleType(), True),
    StructField("total_eve_charge", DoubleType(), True),
    StructField("total_night_minutes", DoubleType(), True),
    StructField("total_night_calls", DoubleType(), True),
    StructField("total_night_charge", DoubleType(), True),
    StructField("total_intl_minutes", DoubleType(), True),
    StructField("total_intl_calls", DoubleType(), True),
    StructField("total_intl_charge", DoubleType(), True),
    StructField("number_customer_service_calls", DoubleType(), True),
    StructField("churned", StringType(), True)
])

#Build Dataframe from File
raw_data = spark.read.schema(schemaData).csv('/tmp/churn.all')
churn_data=raw_data.withColumn("intl_plan",trim(raw_data.intl_plan))

reduced_numeric_cols = ["account_length", "number_vmail_messages",
                        "total_day_charge", "total_eve_charge",
                        "total_night_charge", "total_intl_calls", 
                        "total_intl_charge","number_customer_service_calls"]

reduced_numeric_cols1 = ["account_length", "number_vmail_messages", "total_day_calls",
                        "total_day_charge", "total_eve_calls", "total_eve_charge",
                        "total_night_calls", "total_night_charge", "total_intl_calls", 
                        "total_intl_charge","number_customer_service_calls"]

#Review DataSet Balance 
churn_data.registerTempTable("ChurnData")
sqlResult = spark.sql("SELECT churned, COUNT(churned) as Churned FROM ChurnData group by churned")
sqlResult.show()
Example No. 19
def trim(col: Column):
    """Trim a string column"""
    return F.trim(col)
Example No. 20
def format_name_column(column_name) -> Column:
    return trim(col(column_name))
Example No. 21
def trim_leading_zeros(col: Column):
    """Trim the leading zeros from a string column"""
    return F.trim(F.regexp_replace(col, "^0*", " "))
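
# A minimal usage sketch of trim_leading_zeros, assuming a local SparkSession and
# made-up sample codes.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("trimZerosDemo").getOrCreate()
df = spark.createDataFrame([("000123",), ("0",)], ["code"])
df.select(trim_leading_zeros(F.col("code")).alias("code")).show()
# -> '123', and an all-zero value collapses to an empty string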
Example No. 22
def get_transformed_edges(graph_specification,
                          spark_config,
                          input_edge_path,
                          input_source_col,
                          input_target_col,
                          output_source_col,
                          output_target_col,
                          output_tag_col,
                          data_format='parquet',
                          array_delimiter=';',
                          max_result_size=1e9):
    """
    A generator that returns a Panda data frame of each processed edge
    in the graph specification

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param input_edge_path: Path to input edge files for this graph.
    :type input_edge_path: str
    :param input_source_col: Name of the source id column in the input edge data.
    :type input_source_col: str
    :param input_target_col: Name of the target id column in the input edge data.
    :type input_target_col: str
    :param output_source_col: Column name to use for source id.
    :type output_source_col: str
    :param output_target_col: Column name to use for target id.
    :type output_target_col: str
    :param output_tag_col: Column name to use for node tag.
    :type output_tag_col: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    :param array_delimiter: Delimiter used to separate items in array
    :type array_delimiter: str
    :param max_result_size: Maximum result size that spark driver accept
    :type max_result_size: int
    """

    for edge_kind in graph_specification.edge_lists:
        with get_spark_context(spark_config.create()) as spark_context:
            sql_context = SQLContext(spark_context)

            data = (sql_context.read.format(data_format).option(
                'header', 'true').option('inferschema', 'true').load(
                    os.path.join(input_edge_path, edge_kind.safe_name)))

            edge_kind_columns = (
                edge_kind.metadata_columns + [edge_kind.source_column] +
                [edge_kind.target_column] +
                ([edge_kind.index_column] if edge_kind.index_column else []) +
                ([edge_kind.weight_column] if edge_kind.weight_column else []))

            transformed = data

            # Drops duplicates (if index column does not exist)
            # TODO: Support multi field index in the future
            if not edge_kind.index_column:
                dedup_columns = ([edge_kind.source_column.safe_name] +
                                 [edge_kind.target_column.safe_name])
                transformed = transformed.dropDuplicates(subset=dedup_columns)

            for column in edge_kind_columns:
                transformed = transformed.withColumnRenamed(
                    column.safe_name, column.friendly_name or column.name)

            edge_tags = array_delimiter.join(edge_kind.tags)

            transformed = (transformed.withColumn(
                output_source_col,
                trim(transformed[input_source_col])).withColumn(
                    output_target_col,
                    trim(transformed[input_target_col])).withColumn(
                        output_tag_col, lit(edge_tags)))

            transformed = (transformed.dropna(
                how='any',
                subset=[output_source_col, output_target_col
                        ]).filter(transformed[output_source_col] != '').filter(
                            transformed[output_target_col] != ''))

            for dataframe in to_pandas_iterator(
                    transformed, max_result_size=max_result_size):
                yield dataframe
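
# A standalone sketch of the cleaning pattern used above (trim the endpoint columns,
# then drop edges whose source or target is null or empty); the column names and
# sample rows are assumptions for illustration only.
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim

spark = SparkSession.builder.appName("edgeCleanDemo").getOrCreate()
edges = spark.createDataFrame([(" a ", "b"), ("", "c"), ("d", None)], ["src", "dst"])
cleaned = (edges.withColumn("src", trim(edges["src"]))
           .withColumn("dst", trim(edges["dst"]))
           .dropna(how="any", subset=["src", "dst"]))
cleaned = cleaned.filter(cleaned["src"] != "").filter(cleaned["dst"] != "")
cleaned.show()
# Only the ('a', 'b') edge survives.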
Example No. 23
df.stat.crosstab("StockCode", "Quantity").show()
# Look at frequent items
df.stat.freqItems(["StockCode", "Quantity"]).show(2, False)
# Generate a unique ID for each row
df.select(monotonically_increasing_id().alias("id")).show(2)
# Working with string types
# initcap capitalizes the first letter of every space-separated word
df.select(initcap(col("Description"))).show()
# Convert strings between upper and lower case
df.select(col("Description"), lower(col("Description")),
          upper(lower(col("Description")))).show(2)
# Strip whitespace from a string or pad it; lpad/rpad compare the requested length
# with the input string length to decide whether to pad or truncate
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)
# Spark uses regular expressions to match and replace strings
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2, False)
# translate replaces characters one-for-one against the given mapping, e.g. below L->1, E->3, T->7
df.select(translate(col("Description"), "LEET", "1337"), col("Description")) \
    .show(2, False)
# regexp_extract pulls out the text matched by a capture group; below, whichever word of
# extract_str appears is returned as group 1
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
    col("Description")).show(2)
Example No. 24
# getting document counts, where asin is considered the document
# doc_counts = reviews.groupBy('asin').count()
# print('doc counts')
# doc_counts.show()

# now to split up our reviewText into lines
# doc_and_lines = reviews.select('asin', fnc.split('reviewText', '[\W_]+').alias('a_line'))
# print('doc and lines')
# doc_and_lines.show()

# and now further split it into words
doc_and_words = reviews \
                .select('asin', fnc.explode(fnc.split('reviewText', r'[\W_]+')).alias('each_word')) \
                .filter(fnc.length('each_word') > 0) \
                .select('asin', fnc.trim(fnc.lower(fnc.col('each_word'))).alias('each_word'))

# print('doc and words')
# doc_and_words.show()

# get counts of each word
# word_counts = doc_and_words.groupBy('each_word') \
#                 .count()
# print('word counts')
# word_counts.show()

# now to get term frequency using the formula of (term in a doc)/(total num of words in that doc)

wind = Window.partitionBy(doc_and_words['asin'])

tf = doc_and_words.groupBy('asin', 'each_word')\
Example No. 25
def single_space(col):
    return F.trim(F.regexp_replace(col, " +", " "))
# In[3]:

from pyspark.sql.functions import col, expr, udf, trim
from pyspark.sql.types import IntegerType
import re

remove_punctuation = udf(lambda line: re.sub(r'[^A-Za-z\s]', '', line))
make_binary = udf(lambda rating: 0 if rating in [1, 2] else 1, IntegerType())

reviews = (all_reviews
    .na.fill({ 'reviewerName': 'Unknown' })
    .filter(col('overall').isin([1, 2, 5]))
    .withColumn('label', make_binary(col('overall')))
    .select(col('label').cast('int'), remove_punctuation('summary').alias('summary'))
    .filter(trim(col('summary')) != ''))


# ## Splitting data and balancing skewness

# In[4]:

train, test = reviews.randomSplit([.8, .2], seed=5436)


# In[5]:

def multiply_dataset(dataset, n):
    return dataset if n <= 1 else dataset.union(multiply_dataset(dataset, n - 1))

from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName("MostPopularSuperhero").getOrCreate()

schema = StructType([ \
    StructField("id", IntegerType(), True), \
    StructField("name", StringType(), True)])

names = spark.read.schema(schema).option("sep", " ").csv("Marvel+Names")

lines = spark.read.text("Marvel+graph")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn("id", func.split(func.trim(func.col("value")), " ")[0]) \
    .withColumn("connections", func.size(func.split(func.trim(func.col("value")), " ")) - 1) \
    .groupBy("id").agg(func.sum("connections").alias("connections"))

minConnectionCount = connections.agg(func.min("connections")).first()[0]

minConnections = connections.filter(
    func.col("connections") == minConnectionCount)
minConnectionsWithNames = minConnections.join(names, "id")

print("The following characters have only " + str(minConnectionCount) +
      " connections(s):")

minConnectionsWithNames.select("name").show()

spark.stop()
Example No. 28
    StructField("OBSERVATION_TIME", StringType(), True) 
])

daily_all = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "false")
    .option("inferSchema", "false")
    .schema(schema_daily)
    .load("hdfs:///data/ghcnd/daily/")
)
daily_all.show(5, False)

# Extract YEAR from DATE
daily_all = (
    daily_all
    .withColumn('YEAR', F.trim(F.substring(F.col('DATE'), 1, 4)).cast(StringType()))
)
daily_all.show(5, False)

# Get a subset of daily with other elements
core_element = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']
daily_other = (
    daily_all
    .filter(~F.col('ELEMENT').isin(core_element))
)
daily_other.show(5, False)


# check the count of daily observations by element
daily_by_element = (
    daily_other
daily = (spark.read.format("com.databricks.spark.csv").option(
    "header", "false").option("inferSchema", "false").schema(
        schema_Daily).load("hdfs:///data/ghcnd/daily/2020.csv.gz").limit(1000))
daily.cache()
daily.show(5)

#----Q2-C----load metadata---------------
# load text
countries_text = (
    spark.read.format("text").load("hdfs:///data/ghcnd/countries"))
countries_text.show(5)

#parse
countries = countries_text.select(
    F.trim(F.substring(F.col('value'), 1, 2)).alias('CODE').cast(
        schema_Countries['CODE'].dataType),  # chars 1-2
    F.trim(F.substring(F.col('value'), 4, 47)).alias('NAME').cast(
        schema_Countries['NAME'].dataType)  # chars 4-50 (char 3 is a space)
)
countries.show(5)
countries.count()

inventory_text = (
    spark.read.format("text").load("hdfs:///data/ghcnd/inventory"))
inventory_text.show(5)
inventory = inventory_text.select(
    F.trim(F.substring(F.col('value'), 1,
                       11)).alias('ID').cast(schema_Inventory['ID'].dataType),
    F.trim(F.substring(F.col('value'), 13, 8)).alias('LATITUDE').cast(
        schema_Inventory['LATITUDE'].dataType),
    F.trim(F.substring(F.col('value'), 22, 9)).alias('LONGITUDE').cast(
Example No. 30
def transform_airport_dataset(self):
       
        df_airport = spark.read.format("csv").option("header",True).load('../workspace/immigration_files/airport/airport-codes_csv.csv')

        not_null_iata_in_us_df = df_airport.where("iso_country = 'US' and iata_code is not null")

        not_null_iata_in_us_df = not_null_iata_in_us_df.withColumn("ident", trim(not_null_iata_in_us_df.ident)) \
                                                        .withColumn("type", trim(not_null_iata_in_us_df.type)) \
                                                        .withColumn("name", trim(not_null_iata_in_us_df.name)) \
                                                        .withColumn("elevation_ft", trim(not_null_iata_in_us_df.elevation_ft)) \
                                                        .withColumn("continent", trim(not_null_iata_in_us_df.continent)) \
                                                        .withColumn("iso_country", trim(not_null_iata_in_us_df.iso_country)) \
                                                        .withColumn("iso_region", trim(not_null_iata_in_us_df.iso_region)) \
                                                        .withColumn("municipality", trim(not_null_iata_in_us_df.municipality)) \
                                                        .withColumn("gps_code", trim(not_null_iata_in_us_df.gps_code)) \
                                                        .withColumn("iata_code", trim(not_null_iata_in_us_df.iata_code)) \
                                                        .withColumn("local_code", trim(not_null_iata_in_us_df.local_code)) \
                                                        .withColumn("coordinates", trim(not_null_iata_in_us_df.coordinates)) 


        # The regex needs an escaped backslash ('\\\\' in a Python string) to strip literal backslashes
        not_null_iata_in_us_df = not_null_iata_in_us_df.withColumn("name", regexp_replace('name', "\\\\", ""))
        not_null_iata_in_us_df.write\
                              .csv(path = save_path + '/airport/',mode='overwrite', header=True)
Example No. 31
#initiate glue context
glueContext = GlueContext(SparkContext.getOrCreate())

##Loading the source files from s3 bucket and converting the corresponding dynamic dataframes to apache spark data frames
Jas_Labour_Paid_dyf=glueContext.create_dynamic_frame_from_options(connection_type="s3", connection_options = {"paths":["s3://smart-ingest-bucket/Quantum-source-file/jas_labor_paid_export.csv"]}, format="csv",format_options={'withHeader' : True})
Jas_Labour_Paid_df = Jas_Labour_Paid_dyf.toDF()

Jas_Labour_Production_dyf=glueContext.create_dynamic_frame_from_options(connection_type="s3", connection_options = {"paths":["s3://smart-ingest-bucket/Quantum-source-file/jas_labor_production_export.csv"]}, format="csv",format_options={'withHeader' : True})
Jas_Labour_Production_df = Jas_Labour_Production_dyf.toDF()

##Performing Transformations on Jas_Labour_Paid_df
##Jas_Labour_Paid_df.printSchema()
Jas_Labour_Paid_df=Jas_Labour_Paid_df.select(col('EXTERNAL_ID').alias('EMPLOYEE_NUMBER'),\
col('USER_NAME').alias('EMPLOYEE_NAME'),'ATTENTION',\
trim(split(col('DEPT_NAME'),'-')[0]).alias('PROGRAM_DESC'),\
to_date(substring(col('TIME_START'),1,9),'dd-MMM-yy').alias('TRANSACTION_DATE'),\
col('HOURS_OVER_TIME').cast(DoubleType()).alias('AVAIL_OT_HRS'),\
col('HOURS_TOTAL').cast(DoubleType()).alias('HOURS_TOTAL'),'STATUS',\
col('HOURS_INDIRECT').cast(DoubleType()).alias('HOURS_INDIRECT'),\
col('HOURS_TIMED').cast(DoubleType()).alias('HOURS_TIMED'), 'TAC_CODE')\
.withColumn('DIRECT_INDIRECT', when(col('HOURS_INDIRECT')>0, 'INDIRECT').otherwise('DIRECT'))\
.withColumn('KEY_COL', concat(col('TRANSACTION_DATE'), lit('-'),col('EMPLOYEE_NUMBER')))\
.withColumn('WEEK_NUMBER', weekofyear(col('TRANSACTION_DATE')))\
.distinct()

Jas_Labour_Paid_df=Jas_Labour_Paid_df.filter(Jas_Labour_Paid_df['TRANSACTION_DATE'] >= lit("2019-01-01"))\
.filter(Jas_Labour_Paid_df['TRANSACTION_DATE'] <= lit("2019-02-21"))
##Jas_Labour_Paid_df.printSchema()

##Performing Transformations on Jas_Labour_Production_df
Example No. 32
#Append and select data

# import pandas as pd
appended_data = add_category_fake.union(add_category_true)\
                                 .select(['category', 'text'])\
                                .dropna(subset=('text'))

# appended_data.show()

from pyspark.sql.functions import length, trim

# Create a length column to be used as a future feature
review_data = appended_data.withColumn('length', length(appended_data['text']))\
                            .where("length>=100")\
                            .orderBy('length')\
                            .withColumn("text", trim(appended_data.text))
# review_data.show()

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='category', outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
Example No. 33
def main_cast():
    cast_csv = 'clean_dataset/Dataset_cast_recent.csv'
    # cast_csv = 'clean_dataset/Dataset_cast_past.csv'

    cast_df = spark.read.option("encoding", "UTF-8").load(cast_csv,
                                                          format="csv",
                                                          sep=",",
                                                          header="true",
                                                          escape='"').cache()
    cast_df = cast_df.select("actor_1", "actor_2", "actor_3", "actor_4",
                             "actor_5", "director", "id")
    print(cast_df.show(5))

    # replace 'Unknown' string with 'null' string in casts and directors
    cast_df = cast_df.withColumn(
        'actor_1',
        functions.when(cast_df['actor_1'] == 'Unknown',
                       'null').otherwise(cast_df['actor_1']))
    cast_df = cast_df.withColumn(
        'actor_2',
        functions.when(cast_df['actor_2'] == 'Unknown',
                       'null').otherwise(cast_df['actor_2']))
    cast_df = cast_df.withColumn(
        'actor_3',
        functions.when(cast_df['actor_3'] == 'Unknown',
                       'null').otherwise(cast_df['actor_3']))
    cast_df = cast_df.withColumn(
        'actor_4',
        functions.when(cast_df['actor_4'] == 'Unknown',
                       'null').otherwise(cast_df['actor_4']))
    cast_df = cast_df.withColumn(
        'actor_5',
        functions.when(cast_df['actor_5'] == 'Unknown',
                       'null').otherwise(cast_df['actor_5']))
    cast_df = cast_df.withColumn(
        'director',
        functions.when(cast_df['director'] == 'Unknown',
                       'null').otherwise(cast_df['director']))

    # drop all empty cells (null) from directors
    cast_df = cast_df.filter(cast_df['director'].isNotNull())
    # drop rows where any of the 5 actor cells is empty (null)
    cast_df = cast_df.filter((cast_df['actor_1'].isNotNull())
                             & (cast_df['actor_2'].isNotNull())
                             & (cast_df['actor_3'].isNotNull())
                             & (cast_df['actor_4'].isNotNull())
                             & (cast_df['actor_5'].isNotNull()))
    # drop rows whose director still holds the 'null' placeholder string
    cast_df = cast_df.filter(cast_df['director'] != 'null')
    # drop rows where any of the 5 cast columns holds the 'null' placeholder string
    cast_df = cast_df.filter((cast_df['actor_1'] != 'null')
                             & (cast_df['actor_2'] != 'null')
                             & (cast_df['actor_3'] != 'null')
                             & (cast_df['actor_4'] != 'null')
                             & (cast_df['actor_5'] != 'null'))

    ########### To get unique director for 'director' column ###################
    unique_director_df = get_unique_col('director', cast_df)
    list_top_unique_director = get_top_features('director', cast_df)
    '''
    ########### To get unique cast for each individual Column ##################
    unique_cast1_df = get_unique_col('actor_1',cast_df)
    list_top_unique_cast1= get_top_features('actor_1',unique_cast1_df)
    print(unique_cast1_df.show())
    print(list_top_unique_cast1)
    unique_cast2_df = get_unique_col('actor_2',cast_df)
    list_top_unique_cast2= get_top_features('actor_2',unique_cast2_df)
    print(unique_cast2_df.show())
    print(list_top_unique_cast2)
    unique_cast3_df = get_unique_col('actor_3',cast_df)
    list_top_unique_cast3= get_top_features('actor_3',unique_cast3_df)
    print(unique_cast3_df.show())
    print(list_top_unique_cast3)
    unique_cast4_df = get_unique_col('actor_4',cast_df)
    list_top_unique_cast4= get_top_features('actor_4',unique_cast4_df)
    print(unique_cast4_df.show())
    print(list_top_unique_cast4)
    unique_cast5_df = get_unique_col('actor_5',cast_df)
    list_top_unique_cast5= get_top_features('actor_5',unique_cast5_df)
    print(unique_cast5_df.show())
    print(list_top_unique_cast5)
    '''

    ########### To get unique cast for combined cast Columns ##################
    cast_df = cast_df.withColumn(
        'joined_column',
        functions.concat(functions.col('actor_1'), functions.lit(','),
                         functions.col('actor_2'), functions.lit(','),
                         functions.col('actor_3'), functions.lit(','),
                         functions.col('actor_4'), functions.lit(','),
                         functions.col('actor_5')))
    cast_df = cast_df.withColumn(
        'joined_col_temp', functions.split(functions.col('joined_column'),
                                           ','))
    unique_cast_df = cast_df.select(
        functions.explode('joined_col_temp').alias('unique_cast')).groupby(
            'unique_cast').count()
    unique_cast_df = unique_cast_df.withColumn(
        'unique_cast', functions.trim(functions.col('unique_cast')))
    unique_cast_df = unique_cast_df.groupby('unique_cast').agg({
        'count': 'sum'
    }).sort(functions.desc("sum(count)"))
    list_top_unique_casts = get_top_features('unique_cast', unique_cast_df)

    cast_df = cast_df.drop(cast_df.joined_column)
    cast_df = cast_df.drop(cast_df.joined_col_temp)
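
    # Sketch (assumption, not used by the pipeline below): functions.array plus
    # explode avoids the concat -> split round-trip used above; exploded_cast and
    # unique_cast_alt are illustrative names only.
    exploded_cast = functions.explode(
        functions.array('actor_1', 'actor_2', 'actor_3', 'actor_4', 'actor_5'))
    unique_cast_alt = (cast_df.select(exploded_cast.alias('unique_cast'))
                       .groupby('unique_cast').count())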

    cast_rdd = cast_df.rdd.map(list)
    ''' ######################################################################'''
    ''' #### Option 1 keeps one random cast member per row; option 2 (commented out) keeps only the top casts #### '''
    new_cast_rdd = cast_rdd.map(get_random_one_cast)
    # new_cast_rdd = cast_rdd.map(lambda j: get_top_cast(j, list_top_unique_casts))
    ''' ######################################################################### '''

    new_schema_cast_csv = StructType([
        StructField('cast', StringType(), True),
        StructField('director', StringType(), True),
        StructField('id', StringType(), True)
    ])
    new_cast_df = rdd_to_df(new_cast_rdd, new_schema_cast_csv)

    # outputs= 'new_clean_cast_dataset/single_cast_recent_top1000.csv'
    outputs = 'new_clean_cast_dataset/single_cast_recent_random.csv'
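
# The cast pipeline above calls several project helpers (get_unique_col,
# get_top_features, get_random_one_cast, rdd_to_df) whose definitions are not
# part of this excerpt.  The two sketches below are only assumptions about
# their intent, not the project's actual implementations; the *_sketch names
# are hypothetical.
import random


def get_top_features_sketch(col_name, freq_df, n=1000):
    # Assumes freq_df already has one row per distinct value, sorted by
    # descending frequency (as built in the pipelines above); returns the
    # n most frequent values as a plain Python list.
    return [row[col_name] for row in freq_df.select(col_name).limit(n).collect()]


def get_random_one_cast_sketch(row):
    # Assumes row is the list form [actor_1 .. actor_5, director, id] produced
    # by cast_df.rdd.map(list); keeps one random actor to match the
    # (cast, director, id) schema used afterwards.
    actors, director, movie_id = row[:5], row[5], row[6]
    return [random.choice(actors), director, movie_id]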
Ejemplo n.º 34
0
def transform_us_demo_dataset(self):
        df_us = spark.read.options(header='True', inferSchema='True', delimiter=';') \
              .csv("../workspace/immigration_files/demo/us-cities-demographics.csv")

        df_us = df_us.withColumnRenamed("Median Age", "Median_Age") \
                    .withColumnRenamed("Male Population","Male_Population") \
                    .withColumnRenamed("Female Population", "Female_Population") \
                    .withColumnRenamed("Total Population", "Total_Population") \
                    .withColumnRenamed("Number of Veterans", "Number_of_Veterans") \
                    .withColumnRenamed("Foreign-born", "Foreign_born") \
                    .withColumnRenamed("Average Household Size", "Average_Household_Size") \
                    .withColumnRenamed("State Code", "State_Code")

        df_us = df_us.withColumn("City", trim(df_us.City)) \
                    .withColumn("State", trim(df_us.State)) \
                    .withColumn("Median_Age", trim(df_us.Median_Age)) \
                    .withColumn("Male_Population", trim(df_us.Male_Population)) \
                    .withColumn("Female_Population", trim(df_us.Female_Population)) \
                    .withColumn("Total_Population", trim(df_us.Total_Population)) \
                    .withColumn("Number_of_Veterans", trim(df_us.Number_of_Veterans)) \
                    .withColumn("Foreign_born", trim(df_us.Foreign_born)) \
                    .withColumn("Average_Household_Size", trim(df_us.Average_Household_Size)) \
                    .withColumn("State_Code", trim(df_us.State_Code)) \
                    .withColumn("Race", trim(df_us.Race)) \
                    .withColumn("Count", trim(df_us.Count))    

        df_us.write \
            .csv(path=save_path + '/demo/', mode='overwrite', header=True)
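
# Sketch (assumption): the chained withColumnRenamed / trim calls in
# transform_us_demo_dataset could also be generated from the column list
# itself; normalize_demo_columns is a hypothetical helper, not part of the
# project.
from pyspark.sql.functions import col, trim


def normalize_demo_columns(df_us):
    # Replace spaces and hyphens in header names with underscores.
    for old in df_us.columns:
        df_us = df_us.withColumnRenamed(old,
                                        old.replace(' ', '_').replace('-', '_'))
    # Trim surrounding whitespace from every retained column.
    return df_us.select([trim(col(c)).alias(c) for c in df_us.columns])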
# COMMAND ----------

from pyspark.sql.functions import col, lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import col, regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
  col("Description")).show(2)


# COMMAND ----------
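
# A further string-function example in the same spirit (sketch, not part of the
# original cells): translate() performs character-by-character substitution,
# here swapping L/E/T in Description for 1/3/7.
from pyspark.sql.functions import col, translate
df.select(
  translate(col("Description"), "LEET", "1337").alias("translated"),
  col("Description")).show(2)
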
Ejemplo n.º 36
0
def main_movie_with_rating():
    movie_csv = 'clean_dataset/Dataset_movie_with_rating_csv.csv'
    movie_df = spark.read.load(movie_csv,
                               format="csv",
                               sep=",",
                               header="true",
                               escape='"').cache()
    movie_df.show()
    # Find mean of revenue in non-zero row
    movie_df_1 = movie_df.filter(movie_df['revenue'] != 0)
    mean_revenue = movie_df_1.groupBy().agg(
        functions.avg(movie_df_1['revenue'])).collect()[0][0]
    # Find mean of budget in non-zero row
    movie_df_1 = movie_df.filter(movie_df['budget'] != 0)
    mean_budget = movie_df_1.groupBy().agg(functions.avg(
        movie_df_1['budget'])).collect()[0][0]
    # Find mean of avg_rating over rows with a non-null rating below 6
    id_rating_df = movie_df.select('id', 'avg_rating')
    movie_df_1 = id_rating_df.filter(id_rating_df['avg_rating'] != 'null')
    movie_df_1 = movie_df_1.withColumn(
        "avg_rating", movie_df_1["avg_rating"].cast(DoubleType()))
    movie_df_1 = movie_df_1.filter(movie_df_1['avg_rating'] < 6)
    mean_rating = movie_df_1.groupBy().agg(
        functions.avg(movie_df_1['avg_rating'])).collect()[0][0]
    # replace empty avg_rating values with the mean
    movie_df = movie_df.withColumn(
        'avg_rating',
        functions.when(movie_df['avg_rating'] != "",
                       movie_df['avg_rating']).otherwise(mean_rating))
    # replace 0 with the mean of revenue
    movie_df = movie_df.withColumn(
        'revenue',
        functions.when(movie_df['revenue'] == 0,
                       mean_revenue).otherwise(movie_df['revenue']))
    # replace 0 with the mean of budget
    movie_df = movie_df.withColumn(
        'budget',
        functions.when(movie_df['budget'] == 0,
                       mean_budget).otherwise(movie_df['budget']))
    # drop rows with null release_date, production_companies, production_countries or genres
    movie_df = movie_df.filter(movie_df['release_date'].isNotNull())
    movie_df = movie_df.filter(movie_df['production_companies'].isNotNull())
    movie_df = movie_df.filter(movie_df['production_countries'].isNotNull())
    movie_df = movie_df.filter(movie_df['genres'].isNotNull())
    # drop rows where any of those columns holds the literal string 'null' or '[null]'
    movie_df = movie_df.filter((movie_df['release_date'] != 'null')
                               & (movie_df['release_date'] != '[null]'))
    movie_df = movie_df.filter((movie_df['production_companies'] != 'null') &
                               (movie_df['production_companies'] != '[null]'))
    movie_df = movie_df.filter((movie_df['production_countries'] != 'null') &
                               (movie_df['production_countries'] != '[null]'))
    movie_df = movie_df.filter((movie_df['genres'] != 'null')
                               & (movie_df['genres'] != '[null]'))

    movie_df = movie_df.withColumn(
        'genres_temp',
        functions.split(functions.regexp_extract('genres', r'\[(.*)\]', 1),
                        ','))
    unique_genre = movie_df.select(
        functions.explode('genres_temp').alias('unique_genre')).groupby(
            'unique_genre').count()
    unique_genre = unique_genre.withColumn(
        'unique_genre', functions.trim(functions.col('unique_genre')))
    unique_genre = unique_genre.withColumn("unique_genre",
                                           blank_as_str_null("unique_genre"))
    unique_genre = unique_genre.groupby('unique_genre').agg({
        'count': 'sum'
    }).sort(functions.desc("sum(count)")).cache()
    list_top_unique_genre = get_top_features('unique_genre', unique_genre)

    movie_df = movie_df.withColumn(
        'production_companies_temp',
        functions.split(
            functions.regexp_extract('production_companies', r'\[(.*)\]', 1),
            ','))
    unique_production_companies = movie_df.select(
        functions.explode('production_companies_temp').alias(
            'unique_production_companies')).groupby(
                'unique_production_companies').count()
    unique_production_companies = unique_production_companies.withColumn(
        'unique_production_companies',
        functions.trim(functions.col('unique_production_companies')))
    unique_production_companies = unique_production_companies.withColumn(
        "unique_production_companies",
        blank_as_str_null("unique_production_companies"))
    unique_production_companies = unique_production_companies.groupby(
        'unique_production_companies').agg({
            'count': 'sum'
        }).sort(functions.desc("sum(count)")).cache()
    list_top_unique_production_companies = get_top_features(
        'unique_production_companies', unique_production_companies)

    movie_df = movie_df.withColumn(
        'production_countries_temp',
        functions.split(
            functions.regexp_extract('production_countries', r'\[(.*)\]', 1),
            ','))
    unique_production_countries = movie_df.select(
        functions.explode('production_countries_temp').alias(
            'unique_production_countries')).groupby(
                'unique_production_countries').count()
    unique_production_countries = unique_production_countries.withColumn(
        'unique_production_countries',
        functions.trim(functions.col('unique_production_countries')))
    unique_production_countries = unique_production_countries.withColumn(
        "unique_production_countries",
        blank_as_str_null("unique_production_countries"))
    unique_production_countries = unique_production_countries.groupby(
        'unique_production_countries').agg({
            'count': 'sum'
        }).sort(functions.desc("sum(count)")).cache()
    list_top_unique_production_countries = get_top_features(
        'unique_production_countries', unique_production_countries)

    movie_df = movie_df.drop(movie_df.production_companies_temp).drop(
        movie_df.production_countries_temp).drop(movie_df.genres_temp)
    movie_rdd = movie_df.rdd.map(list)

    list_top_movie_features = [
        list_top_unique_production_companies,
        list_top_unique_production_countries, list_top_unique_genre
    ]
    '''#######################################################################################################'''
    '''#### Option 1 (commented out) picks random movie features; option 2 keeps only the top features ####'''
    # new_movie_rdd = movie_rdd.map(lambda j: get_random_movie_features(j))
    new_movie_rdd = movie_rdd.map(
        lambda j: get_top_movie_features(j, list_top_movie_features))
    '''####################################################################################################### '''

    new_schema_movie_csv = StructType([
        StructField('id', StringType(), True),
        StructField('title', StringType(), True),
        StructField('production_companies', StringType(), True),
        StructField('production_countries', StringType(), True),
        StructField('genres', StringType(), True),
        StructField('release_date', StringType(), True),
        StructField('revenue', StringType(), True),
        StructField('budget', StringType(), True),
        StructField('avg_rating', StringType(), True)
    ])
    new_movie_df = rdd_to_df(new_movie_rdd, new_schema_movie_csv)
    new_movie_df.show()

    meta_movie_csv = 'dataset/movies_metadata.csv'
    meta_movie_df = spark.read.load(meta_movie_csv,
                                    format="csv",
                                    sep=",",
                                    header="true",
                                    escape='"').cache()
    meta_movie_df = meta_movie_df.select("id", "runtime").withColumnRenamed(
        'id', 'id_meta')
    meta_movie_df = meta_movie_df.withColumn(
        'runtime_temp', meta_movie_df.runtime.cast(
            DoubleType())).drop('runtime').withColumnRenamed(
                'runtime_temp', 'runtime').cache()

    joined_movie_df = new_movie_df.join(
        meta_movie_df, new_movie_df.id == meta_movie_df.id_meta).drop(
            meta_movie_df.id_meta).cache()
    # rows with a known runtime, used to compute the mean further down
    temp_joined_movie_df = joined_movie_df.filter(
        joined_movie_df['runtime'].isNotNull())
    # mark missing runtimes with a sentinel value (1000), then swap it for the mean
    joined_movie_df = joined_movie_df.fillna(1000, subset=['runtime'])
    mean_runtime = temp_joined_movie_df.groupBy().agg(
        functions.avg(temp_joined_movie_df['runtime'])).collect()[0][0]
    joined_movie_df = joined_movie_df.withColumn(
        'runtime',
        functions.when(joined_movie_df['runtime'] == 1000,
                       mean_runtime).otherwise(joined_movie_df['runtime']))
    joined_movie_df.show()

    outputs = 'new_clean_dataset/clean_top_movie_with_rating_csv.csv'
    # outputs = 'new_clean_dataset/clean_random_movie_with_rating_csv.csv'
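
# The movie pipeline above also leans on helpers defined elsewhere in the
# project (blank_as_str_null, get_top_movie_features, rdd_to_df).  The two
# sketches below are hedged assumptions about what they do; the *_sketch names
# are hypothetical and the real definitions may differ.
from pyspark.sql import functions


def blank_as_str_null_sketch(col_name):
    # Assumption: map blank values to the literal string 'null' so they can be
    # handled by the same string comparisons used elsewhere in the pipeline.
    c = functions.col(col_name)
    return functions.when(functions.trim(c) == '', 'null').otherwise(c)


def rdd_to_df_sketch(rdd, schema):
    # Assumption: a thin wrapper around createDataFrame with an explicit schema,
    # using the module-level `spark` session seen throughout these examples.
    return spark.createDataFrame(rdd, schema=schema)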