from pyspark import SparkContext
from pyspark.sql import SparkSession, types

sparkSess = SparkSession.builder.appName('post_history').getOrCreate()
sc = sparkSess.sparkContext

bdschema = types.StructType([
    types.StructField('index', types.IntegerType()),
    types.StructField('id', types.IntegerType()),
    types.StructField('creation_date', types.StringType()),
    types.StructField('post_id', types.IntegerType()),
    types.StructField('post_history_type_id', types.IntegerType()),
    types.StructField('user_id', types.IntegerType())
])

sbad = sparkSess.read.format("s3selectCSV").schema(bdschema).options(
    header="true").options(delimiter="|").options(
        quote='"').load("s3://bigdata-4/post_history.csv").select(
            "index", "id", "creation_date", "post_id",
            "post_history_type_id", "user_id")

sbad.write.mode("append").parquet("s3://bigdata-4/post_history_new/")
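# Illustrative sanity check (a sketch, not part of the original job): read the freshly
# written Parquet back and compare row counts with the CSV selection above. Assumes the
# same bucket/prefix as the snippet and a single append.
post_history_parquet = sparkSess.read.parquet("s3://bigdata-4/post_history_new/")
print("csv rows:", sbad.count(), "parquet rows:", post_history_parquet.count())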
def transform(self, sources: dict) -> DataFrame: """ Fact Invoice records and attributes from dataA Sources """ rpttax = self.read_source(source=sources['rpt_tax']) inv = self.invoice_dataframe(sources['dataB_urbcrb_invoice']) cer = self.read_source(source=sources['currency_exchange_master']) cer = fixCurExchangeToAvg(self, cer) rsc = self.read_source(source=sources['recycle_standard_cost']) rmr = self.read_source(source=sources['recycle_mill_rebates']) exclplnt = self.read_source(source=sources['rpt_exclplnt']) exclbillto = self.read_source(source=sources['rpt_exclbillto']) salesmanexc = self.read_source(source=sources['rpt_salesmanexc']) salesmanexc = self.addColPrefix(salesmanexc, "salesmanexc") otm_shipment = self.read_source(source=sources['shipment']) otm_shipment_cost = self.read_source(source=sources['shipment_cost']) otm_shipment_refnum = self.read_source( source=sources['shipment_refnum']) otm_shipment_status = self.read_source( source=sources['shipment_status']) otm_shipment_stop = self.read_source(source=sources['shipment_stop']) otm_shipment_stop_remark = self.read_source( source=sources['shipment_stop_remark']) freight_rate_val = self.read_source(source=sources['rate_validation']) freight_rate_plant_mapping = self.read_source( source=sources['plant_mapping']) freight_rate_slr = self.read_source( source=sources['supplemental_lane_rates']) dmat = self.read_source(source=sources['dim_material']) dmat = self.addColPrefix(dmat, "dmat") df_otm_freight = self.dataB_process_otm(otm_shipment, otm_shipment_cost, otm_shipment_refnum, otm_shipment_status, otm_shipment_stop, otm_shipment_stop_remark, inv) df_freight_rate_estimates = self.dataB_process_std_freight_rates( freight_rate_val, freight_rate_plant_mapping) df_freight_rate_estimates_slr = self.dataB_process_std_freight_rates_slr( freight_rate_slr) rmr = self.addColPrefix(rmr, "rmr") rsc = self.addColPrefix(rsc, "rsc") cer = self.addColPrefix(cer, "cer") rpttax = self.addColPrefix(rpttax, "rpttax") df_freight_rate_estimates = self.addColPrefix( df_freight_rate_estimates, "fre") df_freight_rate_estimates_slr = self.addColPrefix( df_freight_rate_estimates_slr, "fre_slr") df = inv.select( 'invoice_date', 'allowed_disc', 'bill_to', 'bus_unit', 'bus_unit_name', 'channel', 'sq_ft', 'line', 'caliper', 'charge_desc1', 'currency_code', 'curr_conv', 'extended_amount', 'bill_to_name', 'gen_ledg', 'grade', 'grade_desc', 'invoice', 'iptmeta_corrupt_record', 'iptmeta_extract_dttm', 'jde_bill_to', 'jde_ship_to', 'lbs', 'qty', 'length', 'order_format', 'plant', 'plant_name', 'salesman', 'salesman_name', 'substrate', 'ship_to', 'ship_to_outs_cvtr', 'width', 'price_uom_desc', 'bol_number', 'ship_to_city', 'ship_to_ship_to', 'qty_uom_desc', 'end_cust_desc', 'form', 'trans_mode', 'rept_month', 'rept_year') df = df.join(salesmanexc, [df.salesman == salesmanexc.salesmanexc_salesman], 'left_outer') df = df.withColumn( 'sales_representative', F.coalesce(df.salesmanexc_salesman_name_override, df.salesman_name, F.lit(MISSING_DESC))) df = df.withColumnRenamed('salesman', 'sales_rep_id') df = df.withColumn("billing_system", F.lit('dataB')) # The caliper needs to be calculated before creating the material id # because the caliper is used to build the material id. 
df = df.withColumn("caliper", F.col("caliper") * 1000.0) # remove records with calipers greater >= 200 df = self.dataB_filter_caliper(df) df = self.dataB_material_id(df) df = dataB_sale_form(df) df = df.withColumn("inv_date", F.col('invoice_date').cast(T.DateType())) df = df.withColumn("inv_month", F.month(F.col('inv_date'))) df = df.withColumn("inv_year", F.year(F.col('inv_date'))) df = df.withColumn("invoice_period", F.date_format(F.col("invoice_date"), "MMyyyy")) #Change format to MMyyyy df = df.withColumn("invoice_period", (df.invoice_period.cast( T.StringType()))[0:6]) #invoice_period lenght_max 6 rpttax = rpttax.withColumn("rpttax_plant", F.col('rpttax_plant').cast(T.IntegerType())) rpttax = rpttax.withColumnRenamed("rpttax_grade", "rpttax_grade_code") df = df.join(rsc, [ df.plant == rsc.rsc_plant, df.grade == rsc.rsc_grade_code, df.caliper == rsc.rsc_caliper, df.sale_form == rsc.rsc_ship_form ], 'left_outer') df = df.join(rpttax, [ df.grade == rpttax.rpttax_grade_code, df.plant == rpttax.rpttax_plant ], 'left_outer') df = df.join(rmr, [ df.plant == rmr.rmr_plant_id, df.invoice == rmr.rmr_invoice_id, df.line == rmr.rmr_invoice_line_number ], 'left_outer') df = df.join(cer, [ cer.cer_currency_code_from == df.currency_code, cer.cer_cur_year == df.inv_year, cer.cer_cur_month == df.inv_month ], 'left_outer') # Join for OTM to use estimates only df = df.join(df_otm_freight, [df.bol_number == df_otm_freight.bol_number_join], 'left_outer') # This code joins the freight rates but includes OTM checks so only # records without OTM matches are give values df = df.join(df_freight_rate_estimates, [ df.plant == df_freight_rate_estimates.fre_plant, F.lower(F.trim(df.ship_to_city)) == F.lower( F.trim(df_freight_rate_estimates.fre_dcity)), F.lower(F.trim(df.ship_to_ship_to)) == F.lower( F.trim(df_freight_rate_estimates.fre_dstate)), df_otm_freight.freight_rate_per_ton.isNull() ], 'left_outer') # This code joins the freight rates supplemental lanes but includes OTM checks # so only records that don't match OTM and the Freight Rates files are given # values df = df.join(df_freight_rate_estimates_slr, [ df.plant == df_freight_rate_estimates_slr.fre_slr_plant, F.lower(F.trim(df.ship_to_city)) == F.lower( F.trim(df_freight_rate_estimates_slr.fre_slr_dcity)), F.lower(F.trim(df.ship_to_ship_to)) == F.lower( F.trim(df_freight_rate_estimates_slr.fre_slr_dstate)), df_otm_freight.freight_rate_per_ton.isNull(), df_freight_rate_estimates.fre_estimate_freight_rate_per_ton.isNull( ) ], 'left_outer') # Select that includes OTM calculations df = df.select( df.allowed_disc, df.bill_to, df.bus_unit, df.bus_unit_name, df.charge_desc1, df.caliper, df.channel, df.width, df.length, df.currency_code, df.curr_conv, df.lbs, df.qty, df.material_id, df.sales_rep_id, df.sales_representative, df.billing_system, df.gen_ledg, df.extended_amount, df.grade, df.line, df.invoice, df.invoice_date, df.invoice_period, df.plant, df.plant_name, df.bill_to_name, df.ship_to_city, df.ship_to_ship_to, df.end_cust_desc, df.rept_month, df.rept_year, df.jde_ship_to, df.jde_bill_to, df.ship_to, df.ship_to_outs_cvtr, df.sq_ft, df.bol_number, df.trans_mode, rsc.rsc_msf, rmr.rmr_rebate_amount, rpttax.rpttax_end_market, rpttax.rpttax_grade_code, rpttax.rpttax_plant, rpttax.rpttax_product_family, rpttax.rpttax_product_group, rpttax.rpttax_product_name, rpttax.rpttax_substrate, cer.cer_conversion_rate_multiplier, df.price_uom_desc, df.qty_uom_desc, df_otm_freight.freight_rate_per_ton, df_freight_rate_estimates.fre_estimate_freight_rate_per_ton, 
df_freight_rate_estimates_slr.fre_slr_estimate_freight_rate_per_ton ) df = self.dataB_filter_plant(df, exclplnt) df = self.dataB_filter_billtoname_jdebillto(df, exclbillto) df = df.withColumn( 'actual_tons', F.coalesce((F.col('lbs') / 2000.0), F.lit(MISSING_NUMBER))) df = df.withColumn( 'fx_conversion_to_usd', F.coalesce( F.when((df.currency_code == 'USD') | (df.currency_code == '') | df.currency_code.isNull(), 1).otherwise( df.cer_conversion_rate_multiplier.cast( T.DoubleType())), F.lit(MISSING_NUMBER))) df = dataB_adjust_currency_fields(df) df = self.dataB_claims(df) df = self.dataB_discounts(df) # The following code block includes the OTM calculations for actual rates # Determine the approach for calculating the Freight Invoice value # and fill out the flag. Includes CPU filtering. df = df.withColumn( "freight_invoice_calc", F.when( F.lower(F.trim(df.trans_mode)) == F.lit('cpu'), F.lit('actual_cpu')).when(df.freight_rate_per_ton.isNotNull( ), F.lit('actual')).when( df.freight_rate_per_ton.isNull() & df.fre_estimate_freight_rate_per_ton.isNotNull(), F.lit('estimate')).when( df.freight_rate_per_ton.isNull() & df.fre_estimate_freight_rate_per_ton.isNull() & df.fre_slr_estimate_freight_rate_per_ton.isNotNull(), F.lit('estimate_slr')).otherwise( F.lit(NOT_APPLICABLE_CODE))) # Using the flag fill in the freight_invoice value. df = df.withColumn( "freight_invoice", F.when( df.freight_invoice_calc == F.lit('actual_cpu'), F.lit(0)).when( df.freight_invoice_calc == F.lit('actual'), df.freight_rate_per_ton * df.actual_tons).when( df.freight_invoice_calc == F.lit('estimate'), df.fre_estimate_freight_rate_per_ton * df.actual_tons).when( df.freight_invoice_calc == F.lit('estimate_slr'), df.fre_slr_estimate_freight_rate_per_ton * df.actual_tons).otherwise(F.lit(MISSING_NUMBER))) df = self.dataB_freight_upcharge(df) df = self.dataB_gross_price(df) df = df.withColumn('report_month', F.lpad(df.rept_month, 2, '0')) df = df.withColumnRenamed('rept_year', 'report_year') df = df.withColumn("other_deductions", F.lit(0)) df = self.dataB_rebates(df) df = df.withColumn("service_allowances", F.lit(0)) df = df.withColumn( 'net_price', F.coalesce( (F.col('gross_price') + F.col('discounts') + F.col('rebates') + F.col('claims') + F.col('freight_upcharge') + F.col('other_deductions') + F.col('service_allowances')), F.lit(MISSING_NUMBER))) df = df.withColumn('cp_channel', F.lit(0)) df = df.withColumn('cp_mode', F.lit(0)) df = df.withColumn('cp_sales_region', F.lit(0)) df = df.withColumnRenamed('invoice', 'invoice_number') df = df.withColumnRenamed('line', 'invoice_line_number') df = df.withColumnRenamed('plant_name', 'invoice_location') df = self.dataB_invoiced_currency(df) df = self.dataB_sale_type(df) df = self.dataB_ship_from_loc_number(df) df = df.withColumn("invoice_dim_location_id", F.expr(hash_columns(['plant']))) df = df.withColumn("ship_from_dim_location_id", F.expr(hash_columns(['ship_from_loc_number']))) df = df.withColumnRenamed('rpttax_end_market', 'end_market') df = prime_enrich(df, quality_class_column=None) df = df.withColumn('sales_order_number', F.lit('0')) df = df.withColumn( "ship1_dim_material_id", F.expr( hash_columns(['billing_system', 'material_id', 'end_market']))) df = df.withColumn( 'ship_dim_customer_id', F.expr( hash_columns( ['billing_system', 'jde_ship_to', 'end_cust_desc']))) df = df.withColumn( 'sold_dim_customer_id', F.expr( hash_columns( ['billing_system', 'jde_bill_to', 'end_cust_desc']))) df = df.withColumn('brand_dim_customer_id', F.lit(MISSING_STRING_ID)) # Joining the processed 
material dimension to retrieve the calculated nominal_basis_weight # value to be used to calculate nominal_tons. df = df.join(dmat, [df.ship1_dim_material_id == dmat.dmat_dim_material_id], 'left_outer') df = self.dataB_msf(df) df = df.withColumn( 'nominal_tons', F.coalesce(((df.dmat_nominal_basis_weight * df.msf) / 2000.0), F.lit(MISSING_NUMBER))) df = df.withColumn( 'subset', F.coalesce( F.when(df.rpttax_product_group.isNotNull(), df.rpttax_product_group), F.lit(MISSING_DESC))) df = df.withColumn( 'commercial_print_channel', F.coalesce(F.when(df.channel.isNotNull(), df.channel), F.lit(MISSING_DESC))) df = df.withColumn( 'invoice_location_number', F.coalesce(F.when(df.plant.isNotNull(), df.plant), F.lit(MISSING_NUMBER))) df = df.withColumn("invoice_source_type", F.lit(NOT_APPLICABLE_CODE)) df = df.withColumn("invoice_line_code", F.lit(NOT_APPLICABLE_CODE)) df = df.withColumn('iptmeta_source_system', F.lit('dataB')) df = self.dataB_product_sold_flag(df) df = df.withColumn("commercial_print_mode", F.lit(NOT_APPLICABLE_DESC)) df = df.withColumn("commercial_print_region", F.lit(NOT_APPLICABLE_DESC)) df = df.withColumnRenamed("qty", "invoice_volume") df = df.withColumnRenamed("qty_uom_desc", "invoice_uom_id") df = df.withColumn( 'standard_cost', F.coalesce( F.when(df.rsc_msf.isNotNull(), df.rsc_msf) * df.msf, F.lit(MISSING_NUMBER))) df = df.withColumn( 'standard_gross_margin', F.coalesce( (df.net_price - (df.standard_cost + df.freight_invoice)), F.lit(MISSING_NUMBER))) df = df.withColumn('invoice_line_desc_1', F.lit(NOT_APPLICABLE_CODE)) df = df.select( df.billing_system, df.invoice_number, df.invoice_line_number, df.invoice_period, df.invoice_source_type, df.invoice_line_code, df.iptmeta_source_system, df.product_sold_flag, df.commercial_print_channel, df.commercial_print_mode, df.fx_conversion_to_usd, df.grade, df.invoice_date, df.ship_from_dim_location_id, df.invoiced_currency, df.ship1_dim_material_id, df.prime, df.sales_order_number, df.sale_type, df.sales_representative, df.ship_dim_customer_id, df.sold_dim_customer_id, df.brand_dim_customer_id, df.subset, df.actual_tons, df.claims, df.discounts, df.freight_invoice, df.freight_invoice_calc, df.freight_upcharge, df.gross_price, df.msf, df.net_price, df.nominal_tons, df.other_deductions, df.rebates, df.service_allowances, df.standard_cost, df.standard_gross_margin, df.invoice_dim_location_id, df.commercial_print_region, df.invoice_volume, df.invoice_uom_id, df.bol_number, df.report_month, df.report_year, df.sales_rep_id, df.invoice_line_desc_1).distinct() return df
def _to_stype(tpe) -> X:
    if _is_col(tpe):
        inner = as_spark_type(_get_col_inner(tpe))
        return _Column(inner)
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
}


def _build_type_dict():
    return dict([(other_type, spark_type)
                 for (spark_type, l) in _base.items()
#!/usr/bin/env python3
"""Script to convert `pg_dump` directory data into parquet data.

This script also performs transformations to make the resulting aggregates
easier to query within Spark and BigQuery."""
import click
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# This schema is an intermediate schema that is used
METADATA_SCHEMA = T.StructType([
    T.StructField("aggregate_type", T.StringType(), False),
    T.StructField("ds_nodash", T.StringType(), False),
    T.StructField("table_id", T.IntegerType(), False),
])

DIMENSION_SCHEMA = T.StructType([
    T.StructField("os", T.StringType()),
    T.StructField("child", T.StringType()),
    T.StructField("label", T.StringType()),
    T.StructField("metric", T.StringType()),
    T.StructField("osVersion", T.StringType()),
    T.StructField("application", T.StringType()),
    T.StructField("architecture", T.StringType()),
])

AGGREGATE_SCHEMA = T.StringType()


@click.command("pg_dump_to_parquet")
def main(inputs, output):
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StructType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        # types.StructField('year', types.IntegerType(), False),
        # types.StructField('month', types.IntegerType(), False),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    averages = comments.groupby('subreddit').agg(
        functions.avg(comments['score']))
    # averages.explain()
    averages.write.csv(output, mode='overwrite')
import csv
import requests
import pandas as pd
from urllib.request import *
from pyspark.sql import SparkSession, types
import getCodeSets as codesets

spark = SparkSession.builder.master("local[*]").config(
    "spark.executor.memory", "70g").config("spark.driver.memory", "50g").config(
        "spark.memory.offHeap.enabled", True).config("spark.memory.offHeap.size", "32g").config(
            "spark.driver.maxResultSize", "10g").appName("Load Labour Force Data").getOrCreate()
# conf = SparkConf().setAppName('reddit etl')
# sc = SparkContext(conf=conf)

immigration_schema = types.StructType([
    types.StructField('REF_DATE', types.StringType(), True),
    types.StructField('GEO', types.StringType(), True),
    types.StructField('In_migrants', types.IntegerType(), True),
    types.StructField('Out_migrants', types.IntegerType(), True),
])

# dtype={"REF_DATE": str, "GEO": str, "DGUID": str, "Labour force characteristics": str, "Sex": str, "Age group": str,
#        "Statistics": str, "Data type": str, "UOM": str, "UOM_ID": int, "SCALAR_FACTOR": str, "SCALAR_ID": int,
#        "VECTOR": str, "COORDINATE": str, "VALUE": str, "STATUS": str, "SYMBOL": str, "TERMINATE": str, "DECIMALS": int}


def download_extract_zip(url):
    """
    Download a ZIP file and extract its contents in memory
    yields (filename, file-like object) pairs
    """
    response = requests.get(url)
        StandardScaler(inputCol="vec_tweet_count", outputCol="ss_tweet_count")
    ]
    assembler = [VectorAssembler(inputCols=input_cols, outputCol='features')]
    pipeline = Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                        tweetvect + ss + assembler)
    return pipeline


if __name__ == "__main__":
    # create a SparkContext while checking if there is already SparkContext created
    try:
        sc = ps.SparkContext()
        sc.setLogLevel("ERROR")
        sqlContext = ps.sql.SQLContext(sc)
        print('Created a SparkContext')
    except ValueError:
        warnings.warn('SparkContext already exists in this scope')

    print('Retrieving Data from {}'.format(inputdir + "twitter_data.parquet"))
    df = sqlContext.read.parquet(inputdir + "twitter_data.parquet")

    reg_replaceUdf = f.udf(pre_processing, t.StringType())
    df = df.withColumn('tweet', reg_replaceUdf(df.text))

    print('Get Feature Vectors')
    pipeline = build_pipeline()
    pipelineFit = pipeline.fit(df)
    df = pipelineFit.transform(df)

    select_list = ["date_col", "features", "stock_price_col"]
    df = df.select([column for column in df.columns if column in select_list])

    print("Write to Parquet")
    df.write.parquet(outputdir + "processed_twitter_pyspark")
    sc.stop()
def py_morphy(tokens):
    from nltk.corpus import wordnet as wn
    nltk.data.path.append('/home/dxiang/nltk_data')
    if not isinstance(tokens, list):
        tokens = [tokens]
    modified_tokens = []
    for token in tokens:
        modified_token = wn.morphy(token)
        if modified_token is None:
            continue
        modified_tokens.append(modified_token)
    return modified_tokens


udf_morphy = functions.udf(py_morphy,
                           returnType=types.ArrayType(types.StringType()))


def classify_tokens(list_tokens):
    from nltk.corpus import wordnet as wn
    nltk.data.path.append('/home/dxiang/nltk_data')
    if not isinstance(list_tokens, list):
        list_tokens = [list_tokens]
    list_token = []
    for token in list_tokens:
        tag = wn.synsets(token)[0].pos()  # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a/JJ', 's', 'r', 'n', 'v'
        if tag == 'n' and pos_tag([token])[0][1] == 'NN':
            noun = wn.synsets(token)[0]
            list_hypernyms = get_parent_classes(noun)
            if token == 'food' or token == 'drink' or 'food' in list_hypernyms or 'animal' in list_hypernyms or 'fruit' in list_hypernyms or 'alcohol' in list_hypernyms or 'beverage' in list_hypernyms:
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('reddit averages').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

wiki_schema = types.StructType([
    types.StructField('lang', types.StringType()),
    types.StructField('page', types.StringType()),
    types.StructField('times_requested', types.LongType()),
    types.StructField('bytes', types.LongType())
])


def get_date(string):
    file_name = string[string.rfind('/'):-1]
    date = file_name[file_name.find('-') + 1:file_name.rfind('-') + 3]
    return date


udf = functions.UserDefinedFunction(lambda x: get_date(x), types.StringType())


def main(in_directory, out_directory):
    wiki_data = spark.read.csv(in_directory, sep=" ",
                               schema=wiki_schema).withColumn(
                                   'filename', functions.input_file_name())
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
from collections import Counter
from string import punctuation
from pyspark.sql import types as tp

sid = SentimentIntensityAnalyzer()

# if you've downloaded the medium version use
# nlp = spacy.load("en_core_web_md")
# if you've downloaded the largest version use
nlp = spacy.load("en_core_web_lg")

get_twitch_schema = tp.StructType([
    tp.StructField(name='username', dataType=tp.StringType(), nullable=True),
    tp.StructField(name='timestamp', dataType=tp.LongType(), nullable=True),
    tp.StructField(name='mex', dataType=tp.StringType(), nullable=True),
    tp.StructField(name='engagement', dataType=tp.FloatType(), nullable=True),
    tp.StructField(name='source', dataType=tp.StringType(), nullable=True)
])


def get_sentiment(text):
    value = sid.polarity_scores(text)
    value = value['compound']
    return value


def get_keyword(text):
    result = []
def list(self, provider, path=None, **kwargs): df_schema = T.StructType([ T.StructField('name', T.StringType(), True), T.StructField('type', T.StringType(), True) ]) df_empty = self.context.createDataFrame(data=(), schema=df_schema) md = Resource(path, provider, **kwargs) try: if md['service'] in ['local', 'file']: lst = [] rootpath = md['url'] for f in os.listdir(rootpath): fullpath = os.path.join(rootpath, f) if os.path.isfile(fullpath): obj_type = 'FILE' elif os.path.isdir(fullpath): obj_type = 'DIRECTORY' elif os.path.islink(fullpath): obj_type = 'LINK' elif os.path.ismount(fullpath): obj_type = 'MOUNT' else: obj_type = 'UNDEFINED' obj_name = f lst += [(obj_name, obj_type)] if lst: df = self.context.createDataFrame(lst, ['name', 'type']) else: df = df_empty return df elif md['service'] in ['hdfs', 's3a']: sc = self.context._sc URI = sc._gateway.jvm.java.net.URI Path = sc._gateway.jvm.org.apache.hadoop.fs.Path FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem parsed = urnparse(md['url']) if md['service'] == 's3a': path = parsed.path.split('/') url = 's3a://' + path[0] path = '/' + '/'.join(path[1:]) if len(path) > 1 else '/' if md['service'] == 'hdfs': host_port = f"{parsed.host}:{parsed.port}" if parsed.port else parsed.hosts url = f'hdfs://{host_port}' path = '/' + parsed.path try: fs = FileSystem.get(URI(url), sc._jsc.hadoopConfiguration()) obj = fs.listStatus(Path(path)) except: logging.error(f'An error occurred accessing {url}{path}') obj = [] lst = [] for i in range(len(obj)): if obj[i].isFile(): obj_type = 'FILE' elif obj[i].isDirectory(): obj_type = 'DIRECTORY' else: obj_type = 'UNDEFINED' obj_name = obj[i].getPath().getName() lst += [(obj_name, obj_type)] if lst: df = self.context.createDataFrame(lst, ['name', 'type']) else: df = df_empty return df elif md['format'] == 'jdbc': # remove options from database, if any database = md["database"].split('?')[0] schema = md['schema'] table = md['table'] if database and table: try: obj = self.context.read \ .format('jdbc') \ .option('url', md['url']) \ .option("dbtable", table) \ .option("driver", md['driver']) \ .option("user", md['user']) \ .option('password', md['password']) \ .load() info = [(i.name, i.dataType.simpleString()) for i in obj.schema] except: info = [] if info: return self.context.createDataFrame( info, ['name', 'type']) if md['service'] == 'mssql': query = f""" ( SELECT table_name, table_type FROM INFORMATION_SCHEMA.TABLES WHERE table_schema='{schema}' ) as query """ elif md['service'] == 'oracle': query = f""" ( SELECT table_name, table_type FROM all_tables WHERE table_schema='{schema}' ) as query """ elif md['service'] == 'mysql': query = f""" ( SELECT table_name, table_type FROM information_schema.tables WHERE table_schema='{schema}' ) as query """ elif md['service'] == 'postgres': query = f""" ( SELECT table_name, table_type FROM information_schema.tables WHERE table_schema = '{schema}' ) as query """ else: # vanilla query ... 
for other databases query = f""" ( SELECT table_name, table_type FROM information_schema.tables' ) as query """ obj = self.context.read \ .format('jdbc') \ .option('url', md['url']) \ .option("dbtable", query) \ .option("driver", md['driver']) \ .option("user", md['user']) \ .option('password', md['password']) \ .load() # load the data from jdbc lst = [] for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect(): lst.append((x.TABLE_NAME, x.TABLE_TYPE)) if lst: df = self.context.createDataFrame(lst, ['name', 'type']) else: df = df_empty return df else: logging.error({ 'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented' }) return df_empty except Exception as e: logging.error({'md': md, 'error_msg': str(e)}) raise e return df_empty
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('reddit averages').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

comments_schema = types.StructType([
    types.StructField('archived', types.BooleanType()),
    types.StructField('author', types.StringType()),
    types.StructField('author_flair_css_class', types.StringType()),
    types.StructField('author_flair_text', types.StringType()),
    types.StructField('body', types.StringType()),
    types.StructField('controversiality', types.LongType()),
    types.StructField('created_utc', types.StringType()),
    types.StructField('distinguished', types.StringType()),
    types.StructField('downs', types.LongType()),
    types.StructField('edited', types.StringType()),
    types.StructField('gilded', types.LongType()),
    types.StructField('id', types.StringType()),
    types.StructField('link_id', types.StringType()),
    types.StructField('name', types.StringType()),
    types.StructField('parent_id', types.StringType()),
    types.StructField('retrieved_on', types.LongType()),
    types.StructField('score', types.LongType()),
    types.StructField('score_hidden', types.BooleanType()),
    types.StructField('subreddit', types.StringType()),
    types.StructField('subreddit_id', types.StringType()),
    types.StructField('ups', types.LongType()),
from pyspark.ml.feature import SQLTransformer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession, functions, types
import sys
import datetime
import numpy as np
import elevation_grid as eg

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

spark = SparkSession.builder.appName('example code').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

tmax_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.DateType()),
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('tmax', types.FloatType()),
])

DATASET = '/courses/732/tmax-test'
MODEL = 'weather-model'
DATE = datetime.date(2020, 2, 1)


def change():
    data = spark.read.csv(DATASET, schema=tmax_schema)
    data.createOrReplaceTempView('d')
    .getOrCreate()

spark_session.sparkContext.addFile('parse_tool.py')
from parse_tool import parse_logs

# User logs collection
user_logs = spark_session.sparkContext.textFile("/data/access_logs/big_log/")
parsed_logs = user_logs.map(parse_logs) \
    .map(lambda parse_res: [parse_res[0] + '_' + parse_res[7], parse_res[3]])

schema = tp.StructType().add("user_id", tp.StringType())\
    .add("request_id", tp.StringType())

user_log_df = spark_session.createDataFrame(parsed_logs, schema)
user_log_df_1 = user_log_df.alias("df_1")
user_log_df_2 = user_log_df.alias("df_2")

is_request_to_id = fn.udf(lambda line: line.startswith('/id'),
                          tp.BooleanType())

top_5 = user_log_df_1.groupBy(user_log_df.user_id) \
    .count() \
    .orderBy(fn.desc("count")) \
    .limit(100) \
    .join(user_log_df_2, user_log_df_1.user_id == user_log_df_2.user_id) \
def infant_survival_mllib(): spark = SparkSession.builder.appName('infant-survival-mllib').getOrCreate() spark.sparkContext.setLogLevel('WARN') labels = [ ('INFANT_ALIVE_AT_REPORT', types.StringType()), ('BIRTH_YEAR', types.IntegerType()), ('BIRTH_MONTH', types.IntegerType()), ('BIRTH_PLACE', types.StringType()), ('MOTHER_AGE_YEARS', types.IntegerType()), ('MOTHER_RACE_6CODE', types.StringType()), ('MOTHER_EDUCATION', types.StringType()), ('FATHER_COMBINED_AGE', types.IntegerType()), ('FATHER_EDUCATION', types.StringType()), ('MONTH_PRECARE_RECODE', types.StringType()), ('CIG_BEFORE', types.IntegerType()), ('CIG_1_TRI', types.IntegerType()), ('CIG_2_TRI', types.IntegerType()), ('CIG_3_TRI', types.IntegerType()), ('MOTHER_HEIGHT_IN', types.IntegerType()), ('MOTHER_BMI_RECODE', types.IntegerType()), ('MOTHER_PRE_WEIGHT', types.IntegerType()), ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()), ('MOTHER_WEIGHT_GAIN', types.IntegerType()), ('DIABETES_PRE', types.StringType()), ('DIABETES_GEST', types.StringType()), ('HYP_TENS_PRE', types.StringType()), ('HYP_TENS_GEST', types.StringType()), ('PREV_BIRTH_PRETERM', types.StringType()), ('NO_RISK', types.StringType()), ('NO_INFECTIONS_REPORTED', types.StringType()), ('LABOR_IND', types.StringType()), ('LABOR_AUGM', types.StringType()), ('STEROIDS', types.StringType()), ('ANTIBIOTICS', types.StringType()), ('ANESTHESIA', types.StringType()), ('DELIV_METHOD_RECODE_COMB', types.StringType()), ('ATTENDANT_BIRTH', types.StringType()), ('APGAR_5', types.IntegerType()), ('APGAR_5_RECODE', types.StringType()), ('APGAR_10', types.IntegerType()), ('APGAR_10_RECODE', types.StringType()), ('INFANT_SEX', types.StringType()), ('OBSTETRIC_GESTATION_WEEKS', types.IntegerType()), ('INFANT_WEIGHT_GRAMS', types.IntegerType()), ('INFANT_ASSIST_VENTI', types.StringType()), ('INFANT_ASSIST_VENTI_6HRS', types.StringType()), ('INFANT_NICU_ADMISSION', types.StringType()), ('INFANT_SURFACANT', types.StringType()), ('INFANT_ANTIBIOTICS', types.StringType()), ('INFANT_SEIZURES', types.StringType()), ('INFANT_NO_ABNORMALITIES', types.StringType()), ('INFANT_ANCEPHALY', types.StringType()), ('INFANT_MENINGOMYELOCELE', types.StringType()), ('INFANT_LIMB_REDUCTION', types.StringType()), ('INFANT_DOWN_SYNDROME', types.StringType()), ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', types.StringType()), ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', types.StringType()), ('INFANT_BREASTFED', types.StringType()) ] schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels]) births = spark.read.csv('dataset/births_train.csv.gz', header=True, schema=schema) selected_features = [ 'INFANT_ALIVE_AT_REPORT', 'BIRTH_PLACE', 'MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE', 'CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI', 'MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT', 'MOTHER_DELIVERY_WEIGHT', 'MOTHER_WEIGHT_GAIN', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE', 'HYP_TENS_GEST', 'PREV_BIRTH_PRETERM' ] births_trimmed = births.select(selected_features) recode_dictionary = {'YNU': {'Y': 1, 'N': 0, 'U': 0}} # Yes/No/Unknown. 
def recode(col, key): return recode_dictionary[key][col] def correct_cig(feat): return func.when(func.col(feat) != 99, func.col(feat)).otherwise(0) rec_integer = func.udf(recode, types.IntegerType()) births_transformed = births_trimmed \ .withColumn('CIG_BEFORE', correct_cig('CIG_BEFORE')) \ .withColumn('CIG_1_TRI', correct_cig('CIG_1_TRI')) \ .withColumn('CIG_2_TRI', correct_cig('CIG_2_TRI')) \ .withColumn('CIG_3_TRI', correct_cig('CIG_3_TRI')) cols = [(col.name, col.dataType) for col in births_trimmed.schema] YNU_cols = [] for i, s in enumerate(cols): if s[1] == types.StringType(): dis = births.select(s[0]).distinct().rdd.map(lambda row: row[0]).collect() if 'Y' in dis: YNU_cols.append(s[0]) births.select(['INFANT_NICU_ADMISSION', rec_integer('INFANT_NICU_ADMISSION', func.lit('YNU')).alias('INFANT_NICU_ADMISSION_RECODE') ]).take(5) exprs_YNU = [rec_integer(x, func.lit('YNU')).alias(x) if x in YNU_cols else x for x in births_transformed.columns] births_transformed = births_transformed.select(exprs_YNU) births_transformed.select(YNU_cols[-5:]).show(5) # Calculate the descriptive statistics of the numeric features. numeric_cols = ['MOTHER_AGE_YEARS','FATHER_COMBINED_AGE', 'CIG_BEFORE','CIG_1_TRI','CIG_2_TRI','CIG_3_TRI', 'MOTHER_HEIGHT_IN','MOTHER_PRE_WEIGHT', 'MOTHER_DELIVERY_WEIGHT','MOTHER_WEIGHT_GAIN' ] numeric_rdd = births_transformed.select(numeric_cols).rdd.map(lambda row: [e for e in row]) mllib_stats = mllib_stat.Statistics.colStats(numeric_rdd) for col, m, v in zip(numeric_cols, mllib_stats.mean(), mllib_stats.variance()): print('{0}: \t{1:.2f} \t {2:.2f}'.format(col, m, np.sqrt(v))) # Calculate frequencies for the categorical variables. categorical_cols = [e for e in births_transformed.columns if e not in numeric_cols] categorical_rdd = births_transformed.select(categorical_cols).rdd.map(lambda row: [e for e in row]) for i, col in enumerate(categorical_cols): agg = categorical_rdd.groupBy(lambda row: row[i]).map(lambda row: (row[0], len(row[1]))) print(col, sorted(agg.collect(), key=lambda el: el[1], reverse=True)) # Correlation. corrs = mllib_stat.Statistics.corr(numeric_rdd) for i, el in enumerate(corrs > 0.5): correlated = [(numeric_cols[j], corrs[i][j]) for j, e in enumerate(el) if e == 1.0 and j != i] if len(correlated) > 0: for e in correlated: print('{0}-to-{1}: {2:.2f}'.format(numeric_cols[i], e[0], e[1])) # Drop most of highly correlated features. features_to_keep = [ 'INFANT_ALIVE_AT_REPORT', 'BIRTH_PLACE', 'MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE', 'CIG_1_TRI', 'MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE', 'HYP_TENS_GEST', 'PREV_BIRTH_PRETERM' ] births_transformed = births_transformed.select([e for e in features_to_keep]) #-------------------- # Statistical testing. # Run a Chi-square test to determine if there are significant differences for categorical variables. for cat in categorical_cols[1:]: agg = births_transformed.groupby('INFANT_ALIVE_AT_REPORT').pivot(cat).count() agg_rdd = agg.rdd.map(lambda row: (row[1:])).flatMap(lambda row: [0 if e == None else e for e in row]).collect() row_length = len(agg.collect()[0]) - 1 agg = mllib_linalg.Matrices.dense(row_length, 2, agg_rdd) test = mllib_stat.Statistics.chiSqTest(agg) print(cat, round(test.pValue, 4)) #-------------------- # Machine learning. # Create an RDD of LabeledPoints. 
hashing = mllib_feature.HashingTF(7) births_hashed = births_transformed \ .rdd \ .map(lambda row: [list(hashing.transform(row[1]).toArray()) if col == 'BIRTH_PLACE' else row[i] for i, col in enumerate(features_to_keep)]) \ .map(lambda row: [[e] if type(e) == int else e for e in row]) \ .map(lambda row: [item for sublist in row for item in sublist]) \ .map(lambda row: mllib_regression.LabeledPoint(row[0], mllib_linalg.Vectors.dense(row[1:]))) # Split into training and testing. births_train, births_test = births_hashed.randomSplit([0.6, 0.4]) # Estimate a logistic regression model using a stochastic gradient descent (SGD) algorithm. LR_Model = LogisticRegressionWithLBFGS.train(births_train, iterations=10) # Predict the classes for our testing set. LR_results = ( births_test.map(lambda row: row.label).zip(LR_Model.predict(births_test.map(lambda row: row.features))) ).map(lambda row: (row[0], row[1] * 1.0)) # Check how well or how bad our model performed. print('********************************************000') LR_evaluation = mllib_eval.BinaryClassificationMetrics(LR_results) print('********************************************001') print('Area under PR: {0:.2f}'.format(LR_evaluation.areaUnderPR)) print('********************************************002') print('Area under ROC: {0:.2f}'.format(LR_evaluation.areaUnderROC)) print('********************************************003') LR_evaluation.unpersist() # Select the most predictable features using a Chi-Square selector. selector = mllib_feature.ChiSqSelector(4).fit(births_train) topFeatures_train = ( births_train.map(lambda row: row.label).zip(selector.transform(births_train.map(lambda row: row.features))) ).map(lambda row: mllib_regression.LabeledPoint(row[0], row[1])) topFeatures_test = ( births_test.map(lambda row: row.label).zip(selector.transform(births_test.map(lambda row: row.features))) ).map(lambda row: mllib_regression.LabeledPoint(row[0], row[1])) # Build a random forest model. RF_model = RandomForest.trainClassifier(data=topFeatures_train, numClasses=2, categoricalFeaturesInfo={}, numTrees=6, featureSubsetStrategy='all', seed=666) RF_results = (topFeatures_test.map(lambda row: row.label).zip(RF_model.predict(topFeatures_test.map(lambda row: row.features)))) RF_evaluation = mllib_eval.BinaryClassificationMetrics(RF_results) print('Area under PR: {0:.2f}'.format(RF_evaluation.areaUnderPR)) print('Area under ROC: {0:.2f}'.format(RF_evaluation.areaUnderROC)) RF_evaluation.unpersist() # See how the logistic regression would perform with reduced number of features. LR_Model_2 = LogisticRegressionWithLBFGS.train(topFeatures_train, iterations=10) LR_results_2 = ( topFeatures_test.map(lambda row: row.label).zip(LR_Model_2.predict(topFeatures_test.map(lambda row: row.features))) ).map(lambda row: (row[0], row[1] * 1.0)) LR_evaluation_2 = mllib_eval.BinaryClassificationMetrics(LR_results_2) print('Area under PR: {0:.2f}'.format(LR_evaluation_2.areaUnderPR)) print('Area under ROC: {0:.2f}'.format(LR_evaluation_2.areaUnderROC)) LR_evaluation_2.unpersist()
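# Illustrative sketch of the evaluation API used above: BinaryClassificationMetrics
# consumes an RDD of (score, label) pairs and exposes areaUnderPR / areaUnderROC.
# The toy pairs below are made up purely to show the call pattern, assuming the
# `spark` session from the snippet above is in scope.
from pyspark.mllib.evaluation import BinaryClassificationMetrics

toy_scores_and_labels = spark.sparkContext.parallelize(
    [(0.9, 1.0), (0.8, 1.0), (0.4, 0.0), (0.2, 1.0), (0.1, 0.0)])
toy_metrics = BinaryClassificationMetrics(toy_scores_and_labels)
print('Toy area under PR: {0:.2f}'.format(toy_metrics.areaUnderPR))
print('Toy area under ROC: {0:.2f}'.format(toy_metrics.areaUnderROC))
toy_metrics.unpersist()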
def main(keyspace, table): # create dataframes for order, lineitem and part df = spark.read.format("org.apache.spark.sql.cassandra").options( table='yelp_business', keyspace=keyspace).load() df.cache() # save as Allbusinesses table city_review = df.select('city', 'review_count').groupby('city').sum().orderBy( 'sum(review_count)', ascending=False).withColumnRenamed( 'sum(review_count)', 'ttl_reviews/City') # set up search grid around regions in Las Vegas # the final city we grab Las Vegas, North Las Vegas, Henderson, Boulder City, las vegas lat, lon = 36.181271, -115.134132 lat_range = 0.015 lon_range = 0.015 #DF = df.filter((df.city=='Las Vegas') | (df.city=='North Las Vegas')).select('latitude', 'longitude').orderBy('latitude') #DF.show() # save as champDF table DF = df.filter('latitude between {} and {}'.format( lat - lat_range, lat + lat_range)).filter('longitude between {} and {}'.format( lon - lon_range, lon + lon_range)).cache() las_vegas_df = DF.select('city', 'review_count').groupby('city').sum().orderBy( 'sum(review_count)', ascending=False).withColumnRenamed( 'sum(review_count)', 'ttl_reviews/las_vegas') DF.cache() #Split categories cate_rdd = DF.select('categories', 'business_id').rdd.map(lambda x: x[:]) # convert into a tuple of each category with one business_id categories = cate_rdd.flatMap(cate_tuple) # schemaString = "category business_id" # fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()] # schema = StructType(fields) observation_schema = types.StructType([ types.StructField('category', types.StringType(), True), types.StructField('business_id', types.StringType(), True) ]) # save as categoryBusiness table categoryDF = spark.createDataFrame(categories, observation_schema) categoryDF = categoryDF.withColumn("cate_count", lit(1)) categoryDF.cache() categoryDF.createOrReplaceTempView("cate_restaurant") # Looking at all of the categories listed by frequency (each business can have multiple) df_cate_count = categoryDF.select( 'category', 'cate_count').groupby('category').sum().orderBy( 'sum(cate_count)', ascending=False).withColumnRenamed('sum(cate_count)', 'count') # filter business with categories as food or restaurants food_rest_df = spark.sql( "SELECT count(*) AS num_category_restaurants FROM cate_restaurant WHERE lower(category) LIKE '%food%' OR lower(category) LIKE '%restaurant%'" ) # save table as foodbusiness food_rest_business = spark.sql( "SELECT count(category) AS num_category_restaurants, business_id FROM cate_restaurant WHERE lower(category) LIKE '%food%' OR lower(category) LIKE '%restaurant%' GROUP BY business_id" ) # saved as businessFoodOnly table business_food_rest_df = DF.join(food_rest_business, "business_id", "right") business_food_rest_df.groupby('state').count() # convert each attributes with business_id attri = business_food_rest_df.select('attributes', 'business_id').rdd.map(lambda x: x[:]) attri_restaurant = attri.flatMap(lambda x: att_time_split(x)) schema_1 = types.StructType([ types.StructField('attributes', types.StringType(), True), types.StructField('business_id', types.StringType(), True) ]) attri_df = spark.createDataFrame(attri_restaurant, schema_1) # Extract dictionaries from attributes column attri_df2 = attri_df.rdd.map(lambda x: x[:]).flatMap( lambda x: dict_split(x)) schema_2 = types.StructType([ types.StructField('attribute', types.StringType(), True), types.StructField('attribute_value', types.StringType(), True), types.StructField('business_id', types.StringType(), True) ]) attri_df3 = 
spark.createDataFrame(attri_df2, schema_2) # saved as attributeFinal hours_rdd = business_food_rest_df.select('hours', 'business_id').rdd.map( lambda x: x[:]).flatMap(lambda x: att_time_split(x)) schema_3 = types.StructType([ types.StructField('hours', types.StringType(), True), types.StructField('business_id', types.StringType(), True) ]) # saves as hoursBusiness table hours_df = spark.createDataFrame(hours_rdd, schema_3) hours_df1 = hours_df.groupby('hours').count().orderBy('count', ascending=False) # clean hours column hours_rdd1 = hours_df.rdd.map(lambda x: x[:]).flatMap( lambda x: hours_split(x)) schema_5 = types.StructType([ types.StructField('day', types.StringType(), True), types.StructField('opening_hour', types.FloatType(), True), types.StructField('closing_hour', types.FloatType(), True), types.StructField('business_id', types.StringType(), True) ]) # saved as openCloseBusiness table hours_df2 = spark.createDataFrame(hours_rdd1, schema_5) # most popular opening hours popular_hour_df = hours_df.groupby('day', 'opening_hour').count().orderBy( 'count', ascending=False) # Check-in dataset cleaning (saved as checkinAll table) df_checkin = spark.read.format("org.apache.spark.sql.cassandra").options( table='yelp_checkin', keyspace=keyspace).load() df_checkin.cache() checkin_rdd = df_checkin.select('time', 'business_id').rdd.map( lambda x: x[:]).flatMap(lambda x: att_time_split(x)) schema_5 = types.StructType([ types.StructField('checkin', types.StringType(), True), types.StructField('business_id', types.StringType(), True) ]) # saved as checkinCount table checkin_df = spark.createDataFrame(checkin_rdd, schema_5) # each business separated checkin hours count checkin_count_df = checkin_df.groupby('business_id').count().orderBy( 'business_id', ascending=False).withColumnRenamed('count', 'num_checkin') # merge num of checkins to business df (saved as cleanBusiness table) 190 restaurants in total cleanBusinessDF = business_food_rest_df.join( checkin_count_df, 'business_id', 'left').drop('hours', 'categories', 'attributes', 'type', 'is_open') df_review = spark.read.format("org.apache.spark.sql.cassandra").options( table='yelp_review', keyspace=keyspace).load() review_lasvegas_DF = df_review.join( cleanBusinessDF, 'business_id', 'right').drop(cleanBusinessDF['stars']).drop( 'address', 'latitude', 'longitude', 'postal_code', 'review_count', 'state', 'num_category_restaurants', 'num_checkin') if table == 'yelp_business_lasvegas': cleanBusinessDF.repartition(300).write.format( "org.apache.spark.sql.cassandra").options( table=table, keyspace=keyspace).save() elif table == 'yelp_review_lasvegas': review_lasvegas_DF.repartition(300).write.format( "org.apache.spark.sql.cassandra").options( table=table, keyspace=keyspace).save()
def get_cancellations(self): """get cancellation data, lowest dimension is at campaign level, so only the campaign id is parsed from label. The hotel id is not parsed because we are interested in which hotels that got booked in the end get cancelled, not those hotels that get clicked (parsing from the label for hotel id correspond to notion of last click attribution) returns: spark dataframe """ def extract_aff_label(aff_name, info_type): """ function copied from the account_1stats, this is not ideal as this is a function embedded in another function # udf to obtain the cc, device and placement from the affiliate name # note that the cc from this table contains options like AOW and ROW # very likely there's no match here """ try: if info_type == "cc": return aff_name.split(".")[0].split("_")[1] elif info_type == "placement": placement = aff_name.split(".")[1] if placement == "LU": return "localuniversal" elif placement == "MR": return "mapresults" else: return None elif info_type == "device": device = aff_name.split(".")[2] if device == "PC": return "desktop" elif device in ("Mobile", "Tablet"): return device.lower() else: return None else: return None except: return None spark.udf.register("extract_aff_label", extract_aff_label, returnType=t.StringType()) # return everything as StringType first, # will correct for this later on def extract_res_label(label): """function copied from the account_1stats, this is not ideal as this is a function embedded in another function, this is a udf to extract relevant information from label of reservations """ temp = label.split("_") info_dict = {} for x in temp: data = x.split("-") if len(data) == 2: info_dict[data[0]] = data[1] else: if "mapresults" in x.lower(): info_dict["placement"] = "mapresults" elif "localuniversal" in x.lower(): info_dict["placement"] = "localuniversal" if "hotel-" in x.lower(): try: info_dict["hotel_id"] = x.split("hotel-")[1] except: info_dict["hotel_id"] = None return info_dict spark.udf.register("extract_res_label", extract_res_label, returnType=t.MapType(t.StringType(), t.StringType())) aff_id = spark.table(self.affiliate_table)\ .where("partner_id = 423463")\ .selectExpr("id as affiliate_id" ,"extract_aff_label(name,'cc') aff_cc" ,"extract_aff_label(name,'placement') aff_placement" ,"extract_aff_label(name,'device') aff_device") # get cancelled reservation between start and end_date cancelled_reservations = ( spark.table(self.reservation_table).withColumn( "date_cancelled", f.expr("to_date(date_cancelled)") ).withColumnRenamed("id", "hotelreservation_id").where( "date_cancelled between '{start_date}' and '{end_date}'". format(start_date=self.start_date, end_date=self.end_date)).join( spark.table("fpa.device_class_lookup").select( "hotelreservation_id", "device_class"), on="hotelreservation_id", how="inner"). 
where("status not in ('fraudulent', 'test', 'unknown')").join( f.broadcast(aff_id), on="affiliate_id", how="inner").selectExpr("date_cancelled", "label", "upper(booker_cc1) booker_cc1", "hotelreservation_id", "hotel_id", "aff_cc", "roomnights", "commission_amount_euro")) # grab information from the label cancelled_reservations_label = (cancelled_reservations.withColumn( "label_map", f.expr("extract_res_label(label)") ).withColumn( "label_cid", f.expr( "cast(coalesce(label_map['cid'],get_cid(label_map['ucc'])) as int)" )).drop("label_map").cache()) # only keep the coalescing of campaign id and select only relevant columns can_res_cleaned = (cancelled_reservations_label.selectExpr( "hotelreservation_id", "to_date(date_cancelled) yyyy_mm_dd", "hotel_id", "coalesce(label_cid,get_cid(aff_cc),get_cid(booker_cc1),66) campaign_id", "commission_amount_euro cancelled_commission", "roomnights cancelled_roomnights", "1 cancellations")) # filter for relevant campaigns account_1_campaign = self.get_id_pos() can_res_cleaned = can_res_cleaned.join(account_1_campaign, on="campaign_id", how="inner") cancellations_agg = can_res_cleaned.groupBy(*self.agg_on)\ .agg(f.sum("cancelled_commission").alias("cancelled_commission") ,f.sum("cancelled_roomnights").alias("cancelled_roomnights") ,f.sum("cancellations").alias("cancellations")) return cancellations_agg
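# Illustrative only: the label-parsing rule described in the docstrings above, shown on a
# made-up affiliate name. Assumes extract_aff_label() has been pulled out of
# get_cancellations() into module scope for ad-hoc testing; real names follow the
# "<prefix>_<cc>.<placement>.<device>" shape that the helper splits on.
example_affiliate_name = "partner_US.LU.PC"  # hypothetical, not a real affiliate name
print(extract_aff_label(example_affiliate_name, "cc"))         # -> "US"
print(extract_aff_label(example_affiliate_name, "placement"))  # -> "localuniversal"
print(extract_aff_label(example_affiliate_name, "device"))     # -> "desktop"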
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('temp_range_sql').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

observation_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.StringType()),
    types.StructField('observation', types.StringType()),
    types.StructField('value', types.IntegerType()),
    types.StructField('mflag', types.StringType()),
    types.StructField('qflag', types.StringType()),
    types.StructField('sflag', types.StringType()),
    types.StructField('obstime', types.StringType()),
])


def main(inputs, output):
    weather = spark.read.csv(inputs, schema=observation_schema)
    weather.createOrReplaceTempView('weather')

    filter_weather = spark.sql(
        "SELECT date, station, observation, value FROM weather WHERE qflag IS NULL"
    )
    filter_weather.createOrReplaceTempView('filter_weather')

    max_weather = spark.sql(
        "SELECT * FROM filter_weather WHERE observation = 'TMAX' ")
    max_weather.createOrReplaceTempView('max_weather')
def as_spark_type(tpe: Union[str, type, Dtype],
                  *,
                  raise_error: bool = True,
                  prefer_timestamp_ntz: bool = False) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(
            np.__version__) >= LooseVersion("1.21"):
        if (hasattr(tpe, "__origin__")
                and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
                and hasattr(tpe, "__args__")
                and len(tpe.__args__) > 1  # type: ignore[union-attr]
            ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0], raise_error=raise_error  # type: ignore[union-attr]
                ))

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
            tpe.__origin__, list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0], raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType() if prefer_timestamp_ntz else types.TimestampType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
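# Illustrative only: a rough sketch of how the mapping above behaves, assuming the
# function is importable as `as_spark_type` and pandas/NumPy are installed. Exact
# results can vary with the Spark version.
import datetime

from pyspark.sql import types

assert as_spark_type(int) == types.LongType()          # Python int -> LongType
assert as_spark_type("float64") == types.DoubleType()  # string aliases are accepted
assert as_spark_type(datetime.date) == types.DateType()
assert as_spark_type(bytes) == types.BinaryType()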
    print("Usage: spark-submit tweetconsumer.py <hostname> <port> <topic>")
    print("eg. spark-submit --packages org.apache.spark:spark-sql"
          "-kafka-0-10_2.11:2.4.5 tweetconsumer.py localhost 9092 twitter")
    sys.exit(1)

host = sys.argv[1]
port = sys.argv[2]
topic = sys.argv[3]
connect_string = host + ":" + port

spark = SparkSession.builder.appName("TweetConsumer").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

schema = t.StructType() \
    .add("id", t.LongType()) \
    .add("full_text", t.StringType()) \
    .add("len", t.IntegerType()) \
    .add("in_reply_to_status_id", t.StringType()) \
    .add("date", t.StringType()) \
    .add("source", t.StringType()) \
    .add("likes", t.IntegerType()) \
    .add("retweet", t.IntegerType()) \
    .add("sent_by", t.StringType()) \
    .add("friend_of", t.StringType()) \
    .add("hash_tag", t.StringType())

tweetsRawDF = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
def read_sas_csv(path_raw_data, spark): """ This function read sas file and csv, with functions in read_file1.py return dataframe """ try: # df_immigration #print('_____df_imigration____') cols = [ 'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port', 'i94mode', 'i94addr', 'i94bir', 'i94visa', 'dtadfile', 'gender', 'airline', 'visatype' ] file = '18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat' # todo :refaire avec S3 et tous les fichiers (get_path_sas_folder parquet file) df_immigration = read_sas(spark, path_raw_data, file, cols) # df_temperature #print('_____df_temperature____') cols = ['AverageTemperature', 'City', 'Country'] file = 'GlobalLandTemperaturesByCity.csv' delimiter = ',' df_temperature = read_csv(spark, path_raw_data, file, cols, delimiter) # df_airport_code #print('_____df_airport_code____') file = 'airport-codes_csv.csv' cols = [ 'ident', 'type', 'name', 'iso_country', 'iso_region', 'municipality', 'iata_code', 'local_code' ] delimiter = ',' df_airport_code = read_csv(spark, path_raw_data, file, cols, delimiter) # df_global_airports #print('_____df_global_airports____') file = 'airports-extended.csv' cols = ['airport_ID', 'type', 'name', 'city', 'country', 'iata'] delimiter = ',' #header = False schema = T.StructType([ T.StructField('airport_ID', T.IntegerType(), False), T.StructField('name', T.StringType(), False), T.StructField('city', T.StringType(), False), T.StructField('country', T.StringType(), False), T.StructField('iata', T.StringType(), False), T.StructField('icao', T.StringType(), False), T.StructField('latitude', T.StringType(), True), T.StructField('longitude', T.StringType(), True), T.StructField('altitude', T.IntegerType(), True), T.StructField('timezone', T.StringType(), True), T.StructField('dst', T.StringType(), True), T.StructField('tz_timezone', T.StringType(), True), T.StructField('type', T.StringType(), True), T.StructField('data_source', T.StringType(), True) ]) df_global_airports = read_csv_global_airports(spark, path_raw_data, file, cols, delimiter, schema, header=False) # df_iso_country #print('_____df_iso_country____') file = 'wikipedia-iso-country-codes.csv' #cols = ['Country', 'Alpha_2','Alpha_3', 'Num_code', 'ISO_3166-2'] #delimiter =',' #file = 'wikipedia-iso-country-codes.csv' df_iso_country = read_csv_iso_country(spark, path_raw_data, file) # df_demograph #print('_____df_demograph____') file = 'us-cities-demographics.csv' cols = [ 'City', 'State', 'Median Age', 'Male Population', 'Female Population', 'Total Population', 'Number of Veterans', 'Foreign-born', 'Average Household Size', 'State Code', 'Race', 'Count' ] delimiter = ';' df_demograph = read_csv(spark, path_raw_data, file, cols, delimiter) # df_indicator_dev #print('_____df_indicator_dev____') file = 'WDIData.csv' delimiter = ',' header = False cols = [ 'Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2015' ] df_indicator_dev = read_csv(spark, path_raw_data, file, cols, delimiter) return (df_immigration, df_temperature, df_airport_code, df_global_airports, df_iso_country, df_demograph, df_indicator_dev) except Exception as e: print("Unexpected error in read_sas_csv: %s" % e) sys.exit()
def extract(self, catalog: Dict[str, Any]) -> Dict[str, pyspark.sql.DataFrame]: """Extracts the files for this job. The default implementation uses the inputs dict structure: - Files are loaded from the staging directory - Tables are loaded from the data-lake repository More complex jobs should override this method (e.g. VSAM files) Parameters ---------- Returns ------- dict a dict where the keys are aliases of the dataframe and the values are DataFrameReaders """ self._logger.info("extract start") inputs: Dict[str, pyspark.sql.DataFrame] = {} for alias, properties in self.sources.items(): # # load each source # if properties["type"] == "file": # get the entry from the catalog if properties["source"] not in catalog: raise ValueError( f"{properties['source']} not found in catalog") source: Dict[Any, Any] = catalog[properties["source"]] file_locations, all_files = common.utils.get_file_locations( self._env["file_prefix"], source["path"], limit=properties.get("limit", 1), sort=properties.get("sort", 'last_modified'), ascending=properties.get("ascending", True)) self._logger.debug("loading %s" % file_locations) self._processed_files.update(all_files) # type: ignore # custom if source["format"] == "custom": continue # parse text if (source["format"] == "txt"): df_input_segments = [] for file_location in file_locations: raw = self._spark.sparkContext.textFile(file_location) footer_line = source.get("skip_footer_rows", 0) if (int(source.get("skip_header_rows", 0)) > 0 or int(source.get("skip_footer_rows", 0)) > 0): rdd = raw.zipWithIndex()\ .filter(lambda line_index: line_index[1] >= int(source.get("skip_header_rows", 0))) if int(source.get("skip_footer_rows", 0)) > 0: line_count = rdd.count() - 1 rdd = rdd\ .filter(lambda line_index: line_index[1] <= line_count-footer_line) rdd = rdd.map(lambda row: row[0]) reader = rdd.map(lambda row: row.split( source.get("delimiter", ","))) # parse columns l = [] for column, metadata in source.get("columns", {}).items(): pos = "_" + str(metadata["position"]) l.append(pos) reader_cols = reader.toDF().select(l).collect() fields = [] for column, coltype in source.get("columns", {}).items(): if coltype == 'Integer': Field = T.StructField(column, T.IntegerType()) elif coltype == 'Date': Field = T.StructField(column, T.DateType()) elif coltype == 'Double': Field = T.StructField(column, T.DoubleType()) else: Field = T.StructField(column, T.StringType()) fields.append(Field) schema = T.StructType(fields) df_input_segment = self._spark.createDataFrame( reader_cols, schema) df_input_segments.append(df_input_segment) # parse CSV if (source["format"] == "csv"): df_input_segments = [] for file_location in file_locations: # TODO enhance conf to support all options reader = self._spark.read\ .option("inferSchema", "true")\ .option("header", source.get("header",True))\ .option("quote", "\"")\ .option("escape", "\"")\ .option("multiLine", "true")\ .option("mode","DROPMALFORMED")\ .option("ignoreTrailingWhiteSpace", True)\ .option("ignoreLeadingWhiteSpace", True)\ .option("delimiter",source.get("delimiter",",")) if (int(source.get("skip_header_rows", 0)) > 0 or int(source.get("skip_footer_rows", 0)) > 0): df_input_segment = common.parsers.read_csv_remove_header_footer( self._spark, file_location, reader, source.get("skip_header_rows", 0), source.get("skip_footer_rows", 0)) else: df_input_segment = reader.csv(file_location) # if we didn't get a header, take it from the metadata and rename the columns if not source.get("header", True): df_input_segment = df_input_segment.toDF( 
*source.get("columns", {}).keys()) # drop records that are completely null df_input_segments.append(df_input_segment) elif source["format"] == "cobol": df_input_segments = [] for file_location in file_locations: df_input = common.parsers.read_cobol_file( self._spark, file_location, copybook_location=source.get("copybook", None), row_prefix=source.get( "row_prefix", None ), # if multiline, specify start of new row multiline=source.get( "multiline", False ), # if has a row prefix this is multiline file record_selector_field=source.get( "record_selector_field", None), record_types=source.get("record_types", None), header_lines=source.get("skip_header_rows", 0), footer_lines=source.get("skip_footer_rows", 0), use_header=source.get("skip_header", False), trim=source.get("trim", False)) df_input_segments.append(df_input) # reduce the input segments of multiple files to a single dataframe df_input = reduce(pyspark.sql.DataFrame.unionAll, df_input_segments) # parse any columns for column, metadata in source.get("columns", {}).items(): if metadata["type"] == "date": df_input = df_input.withColumn( column, F.to_date(F.col(column).cast("string"), format=metadata["format"])) elif properties["type"] == "table": self._logger.debug( f"checking for existing table => {properties['source']}") try: # read from the data mart snapshot mirror files df_input = common.utils.read_table_snapshot( table_name=properties["source"], env=self._env, spark=self._spark) except FileNotFoundError: # create an empty data frame with dummy schema self._logger.debug( 'existing table not found, creating an empty one') # first, we import the job to read its target mappings # for this, we traverse all jobs to find the one that populates this target table found = False for importer, modname, ispkg in pkgutil.walk_packages( path=jobs.__path__, prefix='jobs.'): # type: ignore if not ispkg: # find the job class job_name = ".".join( modname.split(".")[1:-1] ) # take the middle part of the job package (without prefix or suffix) job_module = importlib.import_module( "jobs.%s.job" % job_name) job_class = getattr(job_module, "Job") job_target_table = getattr(job_class, "target_table") if job_target_table == properties["source"]: found = True break if not found: # there is no way to populate this table from source jobs raise ValueError( f"table parquet for {properties['source']} not found and we can't find a job to populate its schema" ) job_target_mappings: List[Dict[str, Any]] = getattr( job_class, "target_mappings") # create schema for empty dataframe by reading the target mappings and business keys metadata_columns = [ T.StructField("row_strt_dttm", T.TimestampType()), T.StructField("row_stop_dttm", T.TimestampType()), T.StructField("curr_row_flg", T.StringType()) ] schema = T.StructType([ T.StructField(mapping["target"], T.StringType()) for mapping in job_target_mappings ] + metadata_columns) # add primary key for colname, coltype in job_class.primary_key.items(): if not colname in map(lambda x: x["target"], job_target_mappings): schema.add(colname, T.IntegerType()) df_input = self._spark.createDataFrame( self._spark.sparkContext.emptyRDD(), schema) elif properties["type"] == "dimension": # this is an internal 'dimension' table. 
                # load as CSV from the metadata folder
                file_location = pkg_resources.resource_filename(
                    "metadata.dimension_tables",
                    f"{properties['source'].lower()}.csv")

                # load into a DataFrame
                reader = self._spark.read\
                    .option("inferSchema", "true")\
                    .option("header", True)\
                    .option("quote", "\"")\
                    .option("escape", "\"")\
                    .option("multiLine", "true")\
                    .option("ignoreTrailingWhiteSpace", True)\
                    .option("delimiter", ",")
                df_input = reader.csv(file_location)

            inputs[alias] = df_input.alias(alias)

        self._logger.debug("extract done")
        return inputs
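# (hedged illustration) The keys consulted by extract() above imply `self.sources`
# and `catalog` entries roughly shaped like the dicts below; every concrete name,
# path, and column here is made up for illustration only.
sources = {
    "invoices": {"type": "file", "source": "invoice_extract", "limit": 1,
                 "sort": "last_modified", "ascending": True},
    "dim_material": {"type": "table", "source": "dim_material"},
    "uom_codes": {"type": "dimension", "source": "uom_codes"},
}

catalog = {
    "invoice_extract": {
        "path": "landing/invoices/",   # resolved against self._env["file_prefix"]
        "format": "csv",               # one of: custom, txt, csv, cobol
        "delimiter": "|",
        "header": False,
        "skip_header_rows": 1,
        "skip_footer_rows": 1,
        # column names rename header-less CSVs; a "date" type triggers the
        # to_date cast in the parsing loop above
        "columns": {
            "invoice_id": {"type": "string"},
            "invoice_date": {"type": "date", "format": "yyyyMMdd"},
        },
    },
}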
def get_dataset( dataset_type, data, schemas=None, profiler=ColumnsExistProfiler, caching=True, table_name=None, sqlite_db_path=None, ): """Utility to create datasets for json-formatted tests. """ df = pd.DataFrame(data) if dataset_type == "PandasDataset": if schemas and "pandas" in schemas: schema = schemas["pandas"] pandas_schema = {} for (key, value) in schema.items(): # Note, these are just names used in our internal schemas to build datasets *for internal tests* # Further, some changes in pandas internal about how datetimes are created means to support pandas # pre- 0.25, we need to explicitly specify when we want timezone. # We will use timestamp for timezone-aware (UTC only) dates in our tests if value.lower() in ["timestamp", "datetime64[ns, tz]"]: df[key] = pd.to_datetime(df[key], utc=True) continue elif value.lower() in [ "datetime", "datetime64", "datetime64[ns]" ]: df[key] = pd.to_datetime(df[key]) continue try: type_ = np.dtype(value) except TypeError: type_ = getattr(pd.core.dtypes.dtypes, value) # If this raises AttributeError it's okay: it means someone built a bad test pandas_schema[key] = type_ # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()} df = df.astype(pandas_schema) return PandasDataset(df, profiler=profiler, caching=caching) elif dataset_type == "sqlite": if not create_engine: return None if sqlite_db_path is not None: engine = create_engine(f"sqlite:////{sqlite_db_path}") else: engine = create_engine("sqlite://") conn = engine.connect() # Add the data to the database as a new table sql_dtypes = {} if (schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect)): schema = schemas["sqlite"] sql_dtypes = { col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items() } for col in schema: type_ = schema[col] if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: df[col] = pd.to_numeric(df[col], downcast="signed") elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: df[col] = pd.to_numeric(df[col]) min_value_dbms = get_sql_dialect_floating_point_infinity_value( schema=dataset_type, negative=True) max_value_dbms = get_sql_dialect_floating_point_infinity_value( schema=dataset_type, negative=False) for api_schema_type in ["api_np", "api_cast"]: min_value_api = get_sql_dialect_floating_point_infinity_value( schema=api_schema_type, negative=True) max_value_api = get_sql_dialect_floating_point_infinity_value( schema=api_schema_type, negative=False) df.replace( to_replace=[min_value_api, max_value_api], value=[min_value_dbms, max_value_dbms], inplace=True, ) elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) if table_name is None: table_name = "test_data_" + "".join([ random.choice(string.ascii_letters + string.digits) for _ in range(8) ]) df.to_sql( name=table_name, con=conn, index=False, dtype=sql_dtypes, if_exists="replace", ) # Build a SqlAlchemyDataset using that database return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching) elif dataset_type == "postgresql": if not create_engine: return None # Create a new database engine = create_engine("postgresql://postgres@localhost/test_ci") conn = engine.connect() sql_dtypes = {} if (schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect)): schema = schemas["postgresql"] sql_dtypes = { col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items() } for col in schema: type_ = schema[col] if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: df[col] = pd.to_numeric(df[col], downcast="signed") 
elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: df[col] = pd.to_numeric(df[col]) min_value_dbms = get_sql_dialect_floating_point_infinity_value( schema=dataset_type, negative=True) max_value_dbms = get_sql_dialect_floating_point_infinity_value( schema=dataset_type, negative=False) for api_schema_type in ["api_np", "api_cast"]: min_value_api = get_sql_dialect_floating_point_infinity_value( schema=api_schema_type, negative=True) max_value_api = get_sql_dialect_floating_point_infinity_value( schema=api_schema_type, negative=False) df.replace( to_replace=[min_value_api, max_value_api], value=[min_value_dbms, max_value_dbms], inplace=True, ) elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) if table_name is None: table_name = "test_data_" + "".join([ random.choice(string.ascii_letters + string.digits) for _ in range(8) ]) df.to_sql( name=table_name, con=conn, index=False, dtype=sql_dtypes, if_exists="replace", ) # Build a SqlAlchemyDataset using that database return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching) elif dataset_type == "mysql": if not create_engine: return None engine = create_engine("mysql+pymysql://root@localhost/test_ci") conn = engine.connect() sql_dtypes = {} if (schemas and "mysql" in schemas and isinstance(engine.dialect, mysqltypes.dialect)): schema = schemas["mysql"] sql_dtypes = { col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items() } for col in schema: type_ = schema[col] if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: df[col] = pd.to_numeric(df[col], downcast="signed") elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: df[col] = pd.to_numeric(df[col]) min_value_dbms = get_sql_dialect_floating_point_infinity_value( schema=dataset_type, negative=True) max_value_dbms = get_sql_dialect_floating_point_infinity_value( schema=dataset_type, negative=False) for api_schema_type in ["api_np", "api_cast"]: min_value_api = get_sql_dialect_floating_point_infinity_value( schema=api_schema_type, negative=True) max_value_api = get_sql_dialect_floating_point_infinity_value( schema=api_schema_type, negative=False) df.replace( to_replace=[min_value_api, max_value_api], value=[min_value_dbms, max_value_dbms], inplace=True, ) elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) if table_name is None: table_name = "test_data_" + "".join([ random.choice(string.ascii_letters + string.digits) for _ in range(8) ]) df.to_sql( name=table_name, con=conn, index=False, dtype=sql_dtypes, if_exists="replace", ) # Build a SqlAlchemyDataset using that database return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching) elif dataset_type == "mssql": if not create_engine: return None engine = create_engine( "mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@localhost:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true", # echo=True, ) # If "autocommit" is not desired to be on by default, then use the following pattern when explicit "autocommit" # is desired (e.g., for temporary tables, "autocommit" is off by default, so the override option may be useful). 
# engine.execute(sa.text(sql_query_string).execution_options(autocommit=True)) conn = engine.connect() sql_dtypes = {} if (schemas and dataset_type in schemas and isinstance(engine.dialect, mssqltypes.dialect)): schema = schemas[dataset_type] sql_dtypes = { col: MSSQL_TYPES[dtype] for (col, dtype) in schema.items() } for col in schema: type_ = schema[col] if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: df[col] = pd.to_numeric(df[col], downcast="signed") elif type_ in ["FLOAT"]: df[col] = pd.to_numeric(df[col]) min_value_dbms = get_sql_dialect_floating_point_infinity_value( schema=dataset_type, negative=True) max_value_dbms = get_sql_dialect_floating_point_infinity_value( schema=dataset_type, negative=False) for api_schema_type in ["api_np", "api_cast"]: min_value_api = get_sql_dialect_floating_point_infinity_value( schema=api_schema_type, negative=True) max_value_api = get_sql_dialect_floating_point_infinity_value( schema=api_schema_type, negative=False) df.replace( to_replace=[min_value_api, max_value_api], value=[min_value_dbms, max_value_dbms], inplace=True, ) elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) if table_name is None: table_name = "test_data_" + "".join([ random.choice(string.ascii_letters + string.digits) for _ in range(8) ]) df.to_sql( name=table_name, con=conn, index=False, dtype=sql_dtypes, if_exists="replace", ) # Build a SqlAlchemyDataset using that database return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching) elif dataset_type == "SparkDFDataset": from pyspark.sql import SparkSession import pyspark.sql.types as sparktypes SPARK_TYPES = { "StringType": sparktypes.StringType, "IntegerType": sparktypes.IntegerType, "LongType": sparktypes.LongType, "DateType": sparktypes.DateType, "TimestampType": sparktypes.TimestampType, "FloatType": sparktypes.FloatType, "DoubleType": sparktypes.DoubleType, "BooleanType": sparktypes.BooleanType, "DataType": sparktypes.DataType, "NullType": sparktypes.NullType, } spark = SparkSession.builder.getOrCreate() # We need to allow null values in some column types that do not support them natively, so we skip # use of df in this case. 
data_reshaped = list( zip(*[v for _, v in data.items()])) # create a list of rows if schemas and "spark" in schemas: schema = schemas["spark"] # sometimes first method causes Spark to throw a TypeError try: spark_schema = sparktypes.StructType([ sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True) for column in schema ]) # We create these every time, which is painful for testing # However nuance around null treatment as well as the desire # for real datetime support in tests makes this necessary data = copy.deepcopy(data) if "ts" in data: print(data) print(schema) for col in schema: type_ = schema[col] if type_ in ["IntegerType", "LongType"]: # Ints cannot be None...but None can be valid in Spark (as Null) vals = [] for val in data[col]: if val is None: vals.append(val) else: vals.append(int(val)) data[col] = vals elif type_ in ["FloatType", "DoubleType"]: vals = [] for val in data[col]: if val is None: vals.append(val) else: vals.append(float(val)) data[col] = vals elif type_ in ["DateType", "TimestampType"]: vals = [] for val in data[col]: if val is None: vals.append(val) else: vals.append(parse(val)) data[col] = vals # Do this again, now that we have done type conversion using the provided schema data_reshaped = list( zip(*[v for _, v in data.items()])) # create a list of rows spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema) except TypeError: string_schema = sparktypes.StructType([ sparktypes.StructField(column, sparktypes.StringType()) for column in schema ]) spark_df = spark.createDataFrame(data_reshaped, string_schema) for c in spark_df.columns: spark_df = spark_df.withColumn( c, spark_df[c].cast(SPARK_TYPES[schema[c]]())) elif len(data_reshaped) == 0: # if we have an empty dataset and no schema, need to assign an arbitrary type columns = list(data.keys()) spark_schema = sparktypes.StructType([ sparktypes.StructField(column, sparktypes.StringType()) for column in columns ]) spark_df = spark.createDataFrame(data_reshaped, spark_schema) else: # if no schema provided, uses Spark's schema inference columns = list(data.keys()) spark_df = spark.createDataFrame(data_reshaped, columns) return SparkDFDataset(spark_df, profiler=profiler, caching=caching) else: raise ValueError("Unknown dataset_type " + str(dataset_type))
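# (hedged usage sketch) A typical call into the SparkDFDataset branch of
# get_dataset above; the column names, values, and expectation call are
# illustrative only.
data = {
    "id": [1, 2, 3, None],
    "name": ["a", "b", "c", None],
    "created_at": ["2020-01-01 00:00:00", "2020-01-02 00:00:00", None, None],
}
schemas = {
    "spark": {"id": "IntegerType", "name": "StringType", "created_at": "TimestampType"}
}

spark_dataset = get_dataset("SparkDFDataset", data, schemas=schemas)
print(spark_dataset.expect_column_values_to_not_be_null("id"))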
def infant_survival_ml():
    spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType())
    ]
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Create a model.
    logistic = ml_classification.LogisticRegression(maxIter=10, regParam=0.01,
                                                    labelCol='INFANT_ALIVE_AT_REPORT')

    # Create a pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)
    print(test_model.take(1))

    # Evaluate the performance of the model.
    evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                                      labelCol='INFANT_ALIVE_AT_REPORT')
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

    # Save the Pipeline definition.
    pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
    pipeline.write().overwrite().save(pipelinePath)

    # Load the Pipeline definition.
    loadedPipeline = Pipeline.load(pipelinePath)
    loadedPipeline.fit(births_train).transform(births_test).take(1)

    # Save the PipelineModel.
    modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
    model.write().overwrite().save(modelPath)

    # Load the PipelineModel.
    loadedPipelineModel = PipelineModel.load(modelPath)
    test_reloadedModel = loadedPipelineModel.transform(births_test)
    print(test_reloadedModel.take(1))
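# (hedged assumption) The infant-survival ML snippets above and below reference
# aliases whose imports are not shown in this file; they are consistent with an
# import block along these lines, using the standard pyspark.ml packages:
from pyspark.sql import SparkSession, types
from pyspark.ml import Pipeline, PipelineModel
import pyspark.ml.feature as ml_feature
import pyspark.ml.classification as ml_classification
import pyspark.ml.evaluation as ml_eval
import pyspark.ml.tuning as tune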
def get_common_spark_testing_client(data_directory, connect): pytest.importorskip('pyspark') import pyspark.sql.types as pt from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() _spark_testing_client = connect(spark) s = _spark_testing_client._session df_functional_alltypes = s.read.csv( path=str(data_directory / 'functional_alltypes.csv'), schema=pt.StructType( [ pt.StructField('index', pt.IntegerType(), True), pt.StructField('Unnamed: 0', pt.IntegerType(), True), pt.StructField('id', pt.IntegerType(), True), # cast below, Spark can't read 0/1 as bool pt.StructField('bool_col', pt.ByteType(), True), pt.StructField('tinyint_col', pt.ByteType(), True), pt.StructField('smallint_col', pt.ShortType(), True), pt.StructField('int_col', pt.IntegerType(), True), pt.StructField('bigint_col', pt.LongType(), True), pt.StructField('float_col', pt.FloatType(), True), pt.StructField('double_col', pt.DoubleType(), True), pt.StructField('date_string_col', pt.StringType(), True), pt.StructField('string_col', pt.StringType(), True), pt.StructField('timestamp_col', pt.TimestampType(), True), pt.StructField('year', pt.IntegerType(), True), pt.StructField('month', pt.IntegerType(), True), ] ), mode='FAILFAST', header=True, ) df_functional_alltypes = df_functional_alltypes.withColumn( "bool_col", df_functional_alltypes["bool_col"].cast("boolean") ) df_functional_alltypes.createOrReplaceTempView('functional_alltypes') df_batting = s.read.csv( path=str(data_directory / 'batting.csv'), schema=pt.StructType( [ pt.StructField('playerID', pt.StringType(), True), pt.StructField('yearID', pt.IntegerType(), True), pt.StructField('stint', pt.IntegerType(), True), pt.StructField('teamID', pt.StringType(), True), pt.StructField('lgID', pt.StringType(), True), pt.StructField('G', pt.IntegerType(), True), pt.StructField('AB', pt.DoubleType(), True), pt.StructField('R', pt.DoubleType(), True), pt.StructField('H', pt.DoubleType(), True), pt.StructField('X2B', pt.DoubleType(), True), pt.StructField('X3B', pt.DoubleType(), True), pt.StructField('HR', pt.DoubleType(), True), pt.StructField('RBI', pt.DoubleType(), True), pt.StructField('SB', pt.DoubleType(), True), pt.StructField('CS', pt.DoubleType(), True), pt.StructField('BB', pt.DoubleType(), True), pt.StructField('SO', pt.DoubleType(), True), pt.StructField('IBB', pt.DoubleType(), True), pt.StructField('HBP', pt.DoubleType(), True), pt.StructField('SH', pt.DoubleType(), True), pt.StructField('SF', pt.DoubleType(), True), pt.StructField('GIDP', pt.DoubleType(), True), ] ), header=True, ) df_batting.createOrReplaceTempView('batting') df_awards_players = s.read.csv( path=str(data_directory / 'awards_players.csv'), schema=pt.StructType( [ pt.StructField('playerID', pt.StringType(), True), pt.StructField('awardID', pt.StringType(), True), pt.StructField('yearID', pt.IntegerType(), True), pt.StructField('lgID', pt.StringType(), True), pt.StructField('tie', pt.StringType(), True), pt.StructField('notes', pt.StringType(), True), ] ), header=True, ) df_awards_players.createOrReplaceTempView('awards_players') df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar']) df_simple.createOrReplaceTempView('simple') df_struct = s.createDataFrame([((1, 2, 'a'),)], ['struct_col']) df_struct.createOrReplaceTempView('struct') df_nested_types = s.createDataFrame( [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})], [ 'list_of_ints', 'list_of_list_of_ints', 'map_string_list_of_list_of_ints', ], ) df_nested_types.createOrReplaceTempView('nested_types') df_complicated = 
s.createDataFrame( [({(1, 3): [[2, 4], [3, 5]]},)], ['map_tuple_list_of_list_of_ints'] ) df_complicated.createOrReplaceTempView('complicated') df_udf = s.createDataFrame( [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')], ['a', 'b', 'c', 'key'], ) df_udf.createOrReplaceTempView('udf') df_udf_nan = s.createDataFrame( pd.DataFrame( { 'a': np.arange(10, dtype=float), 'b': [3.0, np.NaN] * 5, 'key': list('ddeefffggh'), } ) ) df_udf_nan.createOrReplaceTempView('udf_nan') df_udf_null = s.createDataFrame( [ (float(i), None if i % 2 else 3.0, 'ddeefffggh'[i]) for i in range(10) ], ['a', 'b', 'key'], ) df_udf_null.createOrReplaceTempView('udf_null') df_udf_random = s.createDataFrame( pd.DataFrame( { 'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), 'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), 'key': list('ddeefff'), } ) ) df_udf_random.createOrReplaceTempView('udf_random') return _spark_testing_client
def train_validation_splitting_ml(): spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate() spark.sparkContext.setLogLevel('WARN') labels = [ ('INFANT_ALIVE_AT_REPORT', types.IntegerType()), ('BIRTH_PLACE', types.StringType()), ('MOTHER_AGE_YEARS', types.IntegerType()), ('FATHER_COMBINED_AGE', types.IntegerType()), ('CIG_BEFORE', types.IntegerType()), ('CIG_1_TRI', types.IntegerType()), ('CIG_2_TRI', types.IntegerType()), ('CIG_3_TRI', types.IntegerType()), ('MOTHER_HEIGHT_IN', types.IntegerType()), ('MOTHER_PRE_WEIGHT', types.IntegerType()), ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()), ('MOTHER_WEIGHT_GAIN', types.IntegerType()), ('DIABETES_PRE', types.IntegerType()), ('DIABETES_GEST', types.IntegerType()), ('HYP_TENS_PRE', types.IntegerType()), ('HYP_TENS_GEST', types.IntegerType()), ('PREV_BIRTH_PRETERM', types.IntegerType()) ] schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels]) births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema) # Create transformers. births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType())) # Encode the BIRTH_PLACE column using the OneHotEncoder method. encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC') featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features') # Split the dataset into training and testing datasets. births_train, births_test = births.randomSplit([0.7, 0.3], seed=666) # Select only the top five features. selector = ml_feature.ChiSqSelector( numTopFeatures=5, featuresCol=featuresCreator.getOutputCol(), outputCol='selectedFeatures', labelCol='INFANT_ALIVE_AT_REPORT' ) # Create a purely transforming Pipeline. pipeline = Pipeline(stages=[encoder, featuresCreator, selector]) data_transformer = pipeline.fit(births_train) # Create LogisticRegression and Pipeline. logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures') grid = tune.ParamGridBuilder() \ .addGrid(logistic.maxIter, [2, 10, 50]) \ .addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \ .build() # Define a way of comparing the models. evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT') # Create a TrainValidationSplit object. tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator) # Fit our data to the model. tvsModel = tvs.fit(data_transformer.transform(births_train)) data_train = data_transformer.transform(births_test) # Calculate results. results = tvsModel.transform(data_train) print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'})) print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
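# (hedged sketch) Inside train_validation_splitting_ml(), after tvs.fit(...), the
# fitted TrainValidationSplitModel could be inspected to see which parameter
# combination won; the attributes used here are the standard pyspark.ml.tuning API.
best_idx = max(range(len(tvsModel.validationMetrics)),
               key=lambda i: tvsModel.validationMetrics[i])
print(tvsModel.getEstimatorParamMaps()[best_idx])  # winning maxIter / regParam
print(tvsModel.bestModel)                          # the refit LogisticRegressionModel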
spark = SparkSession.builder\
    .master("local[*]")\
    .appName("test.dataframe")\
    .getOrCreate()

# df = spark \
#     .readStream \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", "10.12.64.205:9092") \
#     .option("subscribe", "greetings") \
#     .load()
# df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Step 1: load the data; by default this is a single string-typed column named 'value'.
data = ['[{"a":1,"b":2},{"a":3,"b":4},{"a":5,"b":6},{"a":7,"b":8}]']
df = spark.createDataFrame(data, T.StringType())
df.printSchema()
df.show()

schema = T.ArrayType(
    T.StructType([
        T.StructField("a", T.IntegerType()),
        T.StructField("b", T.IntegerType())
    ]))

# Step 2: parse the string column into an array of structs.
df = df.select(F.from_json(df["value"], schema).alias("json"))
df.printSchema()
df.show()

# Step 3: convert the column to a struct type
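# (hedged sketch) The snippet above stops at "Step 3"; a common continuation is to
# explode the parsed array into one row per struct and flatten its fields:
df_rows = df.select(F.explode(df["json"]).alias("item"))
df_flat = df_rows.select(F.col("item.a").alias("a"), F.col("item.b").alias("b"))
df_flat.printSchema()
df_flat.show()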
def hyper_parameter_optimization_ml(): spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate() spark.sparkContext.setLogLevel('WARN') labels = [ ('INFANT_ALIVE_AT_REPORT', types.IntegerType()), ('BIRTH_PLACE', types.StringType()), ('MOTHER_AGE_YEARS', types.IntegerType()), ('FATHER_COMBINED_AGE', types.IntegerType()), ('CIG_BEFORE', types.IntegerType()), ('CIG_1_TRI', types.IntegerType()), ('CIG_2_TRI', types.IntegerType()), ('CIG_3_TRI', types.IntegerType()), ('MOTHER_HEIGHT_IN', types.IntegerType()), ('MOTHER_PRE_WEIGHT', types.IntegerType()), ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()), ('MOTHER_WEIGHT_GAIN', types.IntegerType()), ('DIABETES_PRE', types.IntegerType()), ('DIABETES_GEST', types.IntegerType()), ('HYP_TENS_PRE', types.IntegerType()), ('HYP_TENS_GEST', types.IntegerType()), ('PREV_BIRTH_PRETERM', types.IntegerType()) ] schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels]) births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema) # Create transformers. births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType())) # Encode the BIRTH_PLACE column using the OneHotEncoder method. encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC') featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features') # Split the dataset into training and testing datasets. births_train, births_test = births.randomSplit([0.7, 0.3], seed=666) # Create a purely transforming Pipeline. pipeline = Pipeline(stages=[encoder, featuresCreator]) data_transformer = pipeline.fit(births_train) # Specify our model and the list of parameters we want to loop through. logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT') grid = tune.ParamGridBuilder() \ .addGrid(logistic.maxIter, [2, 10, 50]) \ .addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \ .build() # Define a way of comparing the models. evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT') # Create a logic that will do the validation work. cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator) cvModel = cv.fit(data_transformer.transform(births_train)) # See if cvModel performed better than our previous model data_train = data_transformer.transform(births_test) results = cvModel.transform(data_train) print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'})) print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'})) # Parameters which the best model has. results = [ ([{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric) for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics) ] print(sorted(results, key=lambda el: el[1], reverse=True)[0])
import requests import sys assert sys.version_info >= (3, 5) # make sure we have Python 3.5+ from zipfile import ZipFile from pyspark.sql import SparkSession, functions, types from io import * import csv import pandas as pd from urllib.request import * import getCodeSets as codesets from pyspark.sql.functions import input_file_name spark = SparkSession.builder.appName('Load Weather Data').getOrCreate() weather_schema = types.StructType([ types.StructField('REF_DATE', types.StringType(), True), types.StructField('Year', types.StringType(), True), types.StructField('Month', types.StringType(), True), types.StructField('Mean_Max_Temp', types.StringType(), True), types.StructField('Mean_Max_Temp_Flag', types.StringType(), True), types.StructField('Mean_Min_Temp', types.StringType(), True), types.StructField('Mean_Min_Temp_Flag', types.StringType(), True), types.StructField('Mean_Temp', types.StringType(), True), types.StructField('Mean_Temp_Flag', types.StringType(), True), types.StructField('Extr_Max_Temp', types.StringType(), True), types.StructField('Extr_Max_Temp_Flag', types.StringType(), True), types.StructField('Extr_Min_Temp', types.StringType(), True), types.StructField('Extr_Min_Temp_Flag', types.StringType(), True), types.StructField('Total_Rain', types.StringType(), True), types.StructField('Total_Rain_Flag', types.StringType(), True), types.StructField('Total_Snow', types.StringType(), True),
import subprocess

from pyspark.sql import types as T
# `spark` (a SparkSession) is assumed to be created elsewhere in this module.


def checkPath(path):
    """Return True if the given HDFS path exists, using `hadoop fs -test -e`."""
    proc = subprocess.Popen(['hadoop', 'fs', '-test', '-e', path])
    proc.communicate()
    if proc.returncode != 0:
        print('PATH DOES NOT EXIST')
        return False
    else:
        print('PATH EXISTS')
        return True


# Schema
infoSchema = T.StructType([
    T.StructField('vid', T.StringType(), nullable=False),
    T.StructField('liked', T.BooleanType(), nullable=False),
    T.StructField('viewed', T.BooleanType(), nullable=False),
    T.StructField('list', T.BooleanType(), nullable=False)
])


# Updaters
def likeVideo(uid, profile, vid):
    profile_path = f'hdfs:///home/users/{uid}/profiles/{profile}'
    # Check if path exists
    path_exists = checkPath(profile_path)
    if (not path_exists):
        cols = ['vid', 'liked', 'viewed', 'list']
        df = spark.createDataFrame(