def main():
    """Compute yearly average house prices per region and save them to BigQuery.

    Loads the source table named in ``config['GCPVariables']``, averages every
    price column over a (year, regionname) window, removes duplicate rows and
    overwrites ``yearlyAveragePricesAllTable`` in the target dataset.
    Timestamps are printed before and after the job.
    """
    print (f"""Getting average yearly prices per region for all""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at")
    uf.println(lst)

    # One window per (calendar year, region); the averages below are taken over it.
    wSpecY = Window().partitionBy(F.date_format('Date', "yyyy"), 'regionname')
    house_df = s.loadTableFromBQ(
        spark,
        config['GCPVariables']['sourceDataset'],
        config['GCPVariables']['sourceTable'],
    )
    house_df.printSchema()
    house_df.show(2, False)
    print(f"""\nAnnual House prices per regions in GBP""")

    # Work out yearly average prices.
    # The keyword was previously misspelled as `asending`, so the intended
    # `ascending` argument never reached orderBy().
    df2 = (
        house_df
        .select(
            F.date_format('Date', 'yyyy').cast("Integer").alias('year'),
            'regionname',
            round(F.avg('averageprice').over(wSpecY)).alias('AVGPricePerYear'),
            round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear'),
            round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear'),
            round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear'),
            round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear'),
        )
        .distinct()
        .orderBy('Date', ascending=True)
    )
    df2.show(20, False)

    s.writeTableToBQ(
        df2,
        "overwrite",
        config['GCPVariables']['targetDataset'],
        config['GCPVariables']['yearlyAveragePricesAllTable'],
    )
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
def SendToBigQuery(df, batchId):
    """Check every row of a micro-batch against stored averages and record high-value tickers.

    Args:
        df: DataFrame for the current batch; rows are read through the
            ``rowkey``, ``ticker`` and ``price`` attributes.
        batchId: batch identifier supplied by the caller (not used here).

    For each row the bounds returned by ``bigQueryAverages`` are passed to
    ``priceComparison``; when that returns 1 the row is handed to
    ``writeHighValueData``.
    """
    if len(df.take(1)) == 0:
        print("DataFrame is empty")
        return

    df.persist()
    try:
        spark_session = s.spark_session(config['common']['appName'])
        spark_session = s.setSparkConfBQ(spark_session)
        # Reference data read from BigQuery once per batch.
        read_df = s.loadTableFromBQ(
            spark_session,
            config['MDVariables']['targetDataset'],
            config['MDVariables']['targetTable'],
        )
        # Look for high value tickers.
        for row in df.collect():
            values = bigQueryAverages(row.ticker, row.price, read_df)
            lower_bound = values["lower"]
            upper_bound = values["upper"]
            if lower_bound is None or upper_bound is None:
                continue
            hvTicker = priceComparison(row.ticker, row.price, lower_bound, upper_bound)
            if hvTicker == 1:
                writeHighValueData(df, row.rowkey)
    finally:
        # Release the cached batch even when a row fails part-way through.
        df.unpersist()
def extractHiveData():
    """Read the house price table from Hive over JDBC and return a two-region sample.

    Returns:
        A DataFrame holding up to ``read_df_rows / 2`` rows for
        "Kensington and Chelsea" followed by up to the same number of rows
        for "City of Westminster".
    """
    print(f"""Getting average yearly prices per region for all""")
    # read data through jdbc from Hive
    spark_session = s.spark_session(ctest['common']['appName'])
    tableName = config['GCPVariables']['sourceTable']
    fullyQualifiedTableName = config['hiveVariables']['DSDB'] + '.' + tableName
    print("reading from Hive table")
    house_df = s.loadTableFromHiveJDBC(spark_session, fullyQualifiedTableName)

    # sample data equally: n rows from Kensington and Chelsea and n rows from
    # City of Westminster
    num_rows = int(config['MysqlVariables']['read_df_rows'] / 2)
    kensington = house_df.filter(
        col("regionname") == "Kensington and Chelsea").limit(num_rows)
    westminster = house_df.filter(
        col("regionname") == "City of Westminster").limit(num_rows)
    house_df = kensington.unionAll(westminster)
    # The sample was previously built and then dropped; return it so the
    # (lazy) transformation is actually usable by callers.
    return house_df
def main():
    """Compute yearly average house prices per region from Oracle and write them back.

    Reads ``dbschema.sourceTable`` (from ``config['OracleVariables']``) over
    JDBC, averages every price column over a (year, REGIONNAME) window,
    removes duplicate rows and overwrites ``yearlyAveragePricesAllTable`` in
    the same schema.  Timestamps are printed before and after the job.
    """
    print(f"""Getting average yearly prices per region for all""")
    # read data through jdbc from Oracle
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)

    # One window per (calendar year, region); the averages below are taken over it.
    wSpecY = Window().partitionBy(F.date_format('datetaken', "yyyy"), 'regionname')
    tableName = config['OracleVariables']['sourceTable']
    fullyQualifiedTableName = config['OracleVariables']['dbschema'] + '.' + tableName
    print("reading from Oracle table")
    house_df = s.loadTableFromOracleJDBC(spark, fullyQualifiedTableName)
    house_df.printSchema()
    house_df.show(5, False)
    print(f"""\nAnnual House prices per regions in GBP""")

    # Work out yearly average prices.
    # The keyword was previously misspelled as `asending`, so the intended
    # `ascending` argument never reached orderBy().
    df2 = (
        house_df
        .select(
            F.date_format('datetaken', 'yyyy').cast("Integer").alias('YEAR'),
            'REGIONNAME',
            round(F.avg('averageprice').over(wSpecY)).alias('AVGPRICEPERYEAR'),
            round(F.avg('flatprice').over(wSpecY)).alias('AVGFLATPRICEPERYEAR'),
            round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTERRACEDPRICEPERYEAR'),
            round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSDPRICEPRICEPERYEAR'),
            round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDETACHEDPRICEPERYEAR'),
        )
        .distinct()
        .orderBy('datetaken', ascending=True)
    )
    df2.printSchema()
    df2.show(20, False)

    # write to Oracle table, all uppercase not mixed case and column names
    # <= 30 characters in version 12.1
    s.writeTableToOracle(
        df2,
        "overwrite",
        config['OracleVariables']['dbschema'],
        config['OracleVariables']['yearlyAveragePricesAllTable'],
    )
    print(
        f"""created {config['OracleVariables']['yearlyAveragePricesAllTable']}"""
    )
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
def sendToControl(dfnewtopic, batchId):
    """Stop the named streaming query when a control message asks for it.

    Args:
        dfnewtopic: DataFrame for the current control batch; its first row
            supplies the ``queue`` and ``status`` values.
        batchId: batch identifier supplied by the caller (not used here).

    When the first row's ``queue`` equals ``config['MDVariables']['topic']``
    and its ``status`` is the string ``'false'``, every active stream whose
    name equals that topic is stopped.
    """
    if len(dfnewtopic.take(1)) == 0:
        print("DataFrame newtopic is empty")
        return

    dfnewtopic.show(100, False)
    # Read both fields from the same row in a single job; two separate
    # collect() calls could pick the values from different rows.
    first_row = dfnewtopic.select("queue", "status").first()
    queue, status = first_row[0], first_row[1]

    topic = config['MDVariables']['topic']
    if queue == topic and status == 'false':
        spark_session = s.spark_session(config['common']['appName'])
        for stream in spark_session.streams.active:
            name = stream.name
            if name == topic:
                print(f"""Terminating streaming process {name}""")
                stream.stop()
def readSourceData():
    """Load the test source table over JDBC and return it as a DataFrame.

    The table name is ``dbschema.sourceTable`` from ``ctest['statics']``.
    Any exception raised while building or loading the reader is printed and
    the process exits with status 1.
    """
    source_table = (
        ctest['statics']['dbschema'] + '.' + ctest['statics']['sourceTable']
    )
    session = s.spark_session(ctest['common']['appName'])
    try:
        reader = session.read.format("jdbc")
        reader = reader.option("url", test_url)
        reader = reader.option("driver", ctest['statics']['driver'])
        reader = reader.option("dbtable", source_table)
        reader = reader.option("user", ctest['statics']['user'])
        reader = reader.option("password", ctest['statics']['password'])
        reader = reader.option("fetchsize", ctest['statics']['fetchsize'])
        return reader.load()
    except Exception as e:
        print(f"""{e}, quitting""")
        sys.exit(1)
def readSavedData():
    """Load the saved yearly-averages table over JDBC so results can be tallied.

    The table name is ``dbschema.yearlyAveragePricesAllTable`` from
    ``ctest['statics']``.  Any exception raised while building or loading the
    reader is printed and the process exits with status 1.
    """
    target_table = (
        ctest['statics']['dbschema']
        + '.'
        + ctest['statics']['yearlyAveragePricesAllTable']
    )
    session = s.spark_session(ctest['common']['appName'])
    try:
        reader = session.read.format("jdbc")
        reader = reader.option("url", test_url)
        reader = reader.option("driver", ctest['statics']['driver'])
        reader = reader.option("dbtable", target_table)
        reader = reader.option("user", ctest['statics']['user'])
        reader = reader.option("password", ctest['statics']['password'])
        reader = reader.option("fetchsize", ctest['statics']['fetchsize'])
        return reader.load()
    except Exception as e:
        print(f"""{e}, quitting""")
        sys.exit(1)
class S1:
    # Demo executed at class-definition time: builds a four-row DataFrame and
    # shows, per group, whether any `power` value is below a threshold.
    appName = "app1"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    df = spark.createDataFrame(
        [("a", 0), ("a", 1), ("b", 30), ("b", -50)],
        ["group", "power"],
    )

    def below_threshold(threshold, group="group", power="power"):
        """Return a grouped-map pandas UDF flagging groups with a value below `threshold`.

        `group` names the grouping column and `power` the column compared
        against `threshold`.
        """
        @pandas_udf("struct<group: string, below_threshold: boolean>",
                    PandasUDFType.GROUPED_MAP)
        def below_threshold_(pdf):
            # True for a group when at least one of its `power` values is
            # strictly below the threshold.
            flags = pdf.groupby(group).apply(
                lambda rows: (rows[power] < threshold).any())
            frame = pd.DataFrame(flags)
            frame.reset_index(inplace=True, drop=False)
            return frame

        return below_threshold_

    df.groupBy("group").apply(below_threshold(-40)).show()
def main():
    """Build the year-on-year percentage price change table for one region.

    The region name comes from ``sys.argv[1]``.  Rows of
    ``yearlyAveragePricesAllTable`` for that region with ``Year`` between
    2010 and 2020 are read from BigQuery.  Each year's ``AVGPricePerYear`` is
    compared with the previous year's via a lag window, and the result is
    written to ``<percentYearlyHousePriceChange>_<region without spaces,
    lower-cased>`` in the target dataset.
    """
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Creating Yearly percentage tables for {regionname}""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()

    # Get data from BigQuery table
    start_date = "2010"
    end_date = "2020"
    yearTable = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    # The start timestamp was collected but never shown; print it like the
    # matching "Finished at" line below.
    print("\nStarted at")
    uf.println(lst)

    spark = s.setSparkConfBQ(spark)
    read_df = s.loadTableFromBQ(
        spark,
        config['GCPVariables']['targetDataset'],
        config['GCPVariables']['yearlyAveragePricesAllTable'])
    house_df = read_df.filter(
        (col("Year").between(f'{start_date}', f'{end_date}'))
        & (lower(col("regionname")) == f'{regionname}'.lower()))

    # Previous year's average, taken in (regionname, Year) order.
    wSpecPY = Window().orderBy('regionname', 'Year')
    df_lagY = house_df.withColumn(
        "prev_year_value", F.lag(house_df['AVGPricePerYear']).over(wSpecPY))

    # The first year has no predecessor, so the difference is null there and
    # the change is reported as 0.
    price_delta = house_df.AVGPricePerYear - df_lagY.prev_year_value
    resultY = df_lagY.withColumn(
        'percent_change',
        F.when(F.isnull(price_delta), 0)
        .otherwise(F.round((price_delta * 100.) / df_lagY.prev_year_value, 1)))

    print(f"""\nYear House price changes in {regionname} in GBP""")
    rsY = resultY.select('Year', 'AVGPricePerYear', 'prev_year_value',
                         'percent_change')
    rsY.show(36, False)
    s.writeTableToBQ(rsY, "overwrite",
                     config['GCPVariables']['targetDataset'], yearTable)
    print(f"""Created {yearTable}""")
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
def main():
    """Plot the yearly percentage price change for one region as a bar chart.

    The region name comes from ``sys.argv[1]``; the per-region table
    ``<percentYearlyHousePriceChange>_<region without spaces, lower-cased>``
    is read from the BigQuery target dataset and its ``percent_change``
    column is plotted against ``Year``.
    """
    print(f"""Getting average yearly prices per region for all""")
    regionname = sys.argv[1]  ## parameter passed
    compact_name = regionname.replace(" ", "").lower()
    print(f"""Getting Yearly percentages tables for {regionname}""")
    spark = s.spark_session("ukhouseprices")

    # Get data from BigQuery table
    change_table = (
        f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{compact_name}"""
    )
    started = spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
    ).collect()
    spark = s.setSparkConfBQ(spark)
    print("\nStarted at")
    uf.println(started)

    changes = s.loadTableFromBQ(
        spark, config['GCPVariables']['targetDataset'], change_table)
    yearly = changes.select(
        col("Year"), col("percent_change").alias("PercentYearlyChange"))
    pandas_frame = yearly.toPandas()
    print(pandas_frame)

    label_font = config['plot_fonts']['font']
    pandas_frame.plot(kind='bar', stacked=False, x='Year',
                      y=['PercentYearlyChange'])
    plt.xlabel("Year", fontdict=label_font)
    plt.ylabel("Annual Percent Property Price change", fontdict=label_font)
    plt.title(
        f"""Property price fluctuations in {regionname} for the past 10 years """,
        fontdict=label_font)
    plt.margins(0.15)
    plt.subplots_adjust(bottom=0.25)
    plt.show()
    plt.close()

    finished = spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
    ).collect()
    print("\nFinished at")
    uf.println(finished)
# Script preamble: imports, locale, settings module and Spark session set-up
# for the UK house prices job, followed by printing the start timestamp.
from pyspark.sql.functions import lag
from sparkutils import sparkstuff as s
from misc import usedFunctions as uf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
import pandas as pd
from pandas.plotting import scatter_matrix
import locale
locale.setlocale(locale.LC_ALL, 'en_GB')
# Settings come from a top-level `variables` module when it exists, otherwise
# from `conf.parameters`.
try:
    import variables as v
except ModuleNotFoundError:
    from conf import parameters as v
appName = "ukhouseprices"
spark = s.spark_session(appName)
spark.sparkContext._conf.setAll(v.settings)
sc = s.sparkcontext()
#
# Get data from Hive table
regionname = "Kensington and Chelsea"
tableName = "ukhouseprices"
fullyQualifiedTableName = v.DSDB + '.' + tableName
summaryTableName = v.DSDB + '.' + 'summary'
start_date = "2010-01-01"
end_date = "2020-01-01"
# Ask Spark for the current time so the start of the run can be logged.
lst = (spark.sql(
    "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
).collect()
print("\nStarted at")
uf.println(lst)
def main():
    """Fit and plot a Lorentzian curve to each monthly property-price series.

    The region name comes from ``sys.argv[1]`` and is used in the plot
    titles.  The summary table described by the ``v`` settings module is read
    from BigQuery and restricted to 201001-202001 (yyyyMM).  For every price
    column a ``LorentzianModel`` is fitted against the yyyyMM integer date,
    plotted, and its fit report printed.
    """
    regionname = sys.argv[1]  ## parameter passed
    print(f"""Getting plots for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()

    # Get data from BigQuery table
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)

    # Model predictions
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # read data from the Bigquery table summary
    print("\nreading data from " + v.fullyQualifiedoutputTableId)
    summary_df = (
        spark.read
        .format("bigquery")
        .option("credentialsFile", v.jsonKeyFile)
        .option("project", v.projectId)
        .option("parentProject", v.projectId)
        .option("dataset", v.targetDataset)
        .option("table", v.targetTable)
        .load()
    )

    # The bounds are yyyyMM values, so compare them with the yyyyMM integer
    # form of the date rather than with the raw `Date` column.
    year_month = F.date_format('Date', "yyyyMM").cast("Integer")
    df_10 = (
        summary_df
        .filter(year_month.between(int(start_date), int(end_date)))
        .select(year_month.alias("date"), 'flatprice', 'terracedprice',
                'semidetachedprice', 'detachedprice')
    )
    df_10.printSchema()
    p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF (once)
    print(p_dfm.columns.tolist())

    # Human-readable property type for each price column.
    property_labels = {
        "flatprice": "Flat",
        "terracedprice": "Terraced",
        "semidetachedprice": "semi-detached",
        "detachedprice": "detached",
    }

    # Non-Linear Least-Squares Minimization and Curve Fitting
    # Define model to be Lorentzian and deploy it
    model = LorentzianModel()
    for vcolumn in p_dfm.columns:
        if vcolumn == "date":  # yyyyMM is the x axis, as an integer
            continue
        # Plot each price curve in turn and print a fit report for it.
        print(vcolumn)
        params = model.guess(p_dfm[vcolumn], x=p_dfm['date'])
        result = model.fit(p_dfm[vcolumn], params, x=p_dfm['date'])
        result.plot_fit()
        plt.margins(0.15)
        plt.subplots_adjust(bottom=0.25)
        plt.xticks(rotation=90)
        plt.xlabel("year/month", fontdict=v.font)
        plt.text(0.35, 0.45,
                 "Best-fit based on Non-Linear Lorentzian Model",
                 transform=plt.gca().transAxes,
                 color="grey",
                 fontsize=9)
        plt.xlim(left=200900, right=202100)
        property_type = property_labels.get(vcolumn, vcolumn)
        plt.ylabel(f"""{property_type} house prices in millions/GBP""",
                   fontdict=v.font)
        plt.title(
            f"""Monthly {property_type} prices fluctuations in {regionname}""",
            fontdict=v.font)
        print(result.fit_report())
        plt.show()
        plt.close()

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
def main():
    """Fit, plot and summarise yearly average property prices read from Hive.

    Reads ``<v.DSDB>.yearlyhouseprices`` for years 2010-2020.  For every
    price column it fits a ``LorentzianModel`` against the year, shows a
    features/label frame for that column, and plots the fit with a fit
    report.  Finally the min/mean/max of the four price columns are plotted.
    """
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    spark.sparkContext._conf.setAll(v.settings)
    sc = s.sparkcontext()

    # Get data from Hive table
    regionname = "Kensington and Chelsea"
    start_date = "2010"
    end_date = "2020"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
    )).collect()
    print("\nStarted at")
    uf.println(lst)

    # Model predictions
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    summary_df = spark.sql(
        f"""SELECT cast(Year as int) as year, AVGFlatPricePerYear, AVGTerracedPricePerYear, AVGSemiDetachedPricePerYear, AVGDetachedPricePerYear FROM {v.DSDB}.yearlyhouseprices"""
    )
    df_10 = summary_df.filter(
        col("year").between(f'{start_date}', f'{end_date}'))
    p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF (once)
    print(p_dfm.columns.tolist())  # show pandas column list

    # Per column: (wording used in the y-axis label, wording used in the title).
    plot_labels = {
        "AVGFlatPricePerYear": ("Flat", "Flat"),
        "AVGTerracedPricePerYear": ("Terraced", "Terraced house"),
        "AVGSemiDetachedPricePerYear": ("semi-detached", "semi-detached house"),
        "AVGDetachedPricePerYear": ("detached", "detached house"),
    }

    # Prepare data for Machine Learning: only two columns are needed,
    # features (the year) and the label (the current price column).
    vectorAssembler = VectorAssembler(inputCols=['year'], outputCol='features')

    # Non-Linear Least-Squares Minimization and Curve Fitting
    model = LorentzianModel()
    for vcolumn in p_dfm.columns:
        if vcolumn == 'year':  # year is the x axis, as an integer
            continue
        # Plot each average curve in turn and print a fit report for it.
        print(vcolumn)
        params = model.guess(p_dfm[vcolumn], x=p_dfm['year'])
        result = model.fit(p_dfm[vcolumn], params, x=p_dfm['year'])
        result.plot_fit()

        # The label was previously hard-coded to AVGFlatPricePerYear for
        # every column; use the column currently being processed.
        vhouse_df = vectorAssembler.transform(df_10).select(
            ['features', vcolumn])
        vhouse_df.show(20)

        labels = plot_labels.get(vcolumn)
        if labels is None:
            continue
        axis_wording, title_wording = labels
        plt.xlabel("Year", fontdict=v.font)
        plt.ylabel(f"{axis_wording} house prices in millions/GBP",
                   fontdict=v.font)
        plt.title(
            f"""{title_wording} price fluctuations in {regionname} for the past 10 years """,
            fontdict=v.font)
        plt.text(0.35, 0.45,
                 "Best-fit based on Non-Linear Lorentzian Model",
                 transform=plt.gca().transAxes,
                 color="grey",
                 fontsize=10)
        print(result.fit_report())
        # Same x-range for every plot (previously only the Flat plot set it).
        plt.xlim(left=2009, right=2022)
        plt.show()
        plt.close()

    # describe() returns a frame whose index holds count, mean, std, min, max...
    p_df = p_dfm[list(plot_labels)].describe()
    print(p_df)
    y = p_df.loc[['min', 'mean', 'max']]
    y.plot(linewidth=2, colormap='jet', marker='.', markersize=20)
    plt.grid(True)
    plt.xlabel("UK House Price Index, January 2020", fontdict=v.font)
    plt.ylabel("Property Prices in millions/GBP", fontdict=v.font)
    plt.title(
        f"""Property price fluctuations in {regionname} for the past 10 years """,
        fontdict=v.font)
    plt.legend(p_df.columns)
    plt.show()
    plt.close()

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
    )).collect()
    print("\nFinished at")
    uf.println(lst)
def main():
    """Fit and plot a Lorentzian curve to each monthly price series of one region.

    The region name comes from ``sys.argv[1]``.  The BigQuery source table is
    restricted to that region and to 201001-202001 (yyyyMM), the four price
    columns are rounded, and for each of them a ``LorentzianModel`` is fitted
    against the yyyyMM integer date.  Data points, initial fit and best fit
    are plotted and the fit report is printed.
    """
    regionname = sys.argv[1]  ## parameter passed
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    spark = s.setSparkConfBQ(spark)

    # Get data from BigQuery table
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)

    # Model predictions
    read_df = s.loadTableFromBQ(spark,
                                config['GCPVariables']['sourceDataset'],
                                config['GCPVariables']['sourceTable'])
    year_month = F.date_format('Date', "yyyyMM").cast("Integer")
    df_10 = (
        read_df
        .filter(year_month.between(start_date, end_date)
                & (lower(col("regionname")) == regionname.lower()))
        .select(
            year_month.alias("Date"),
            round(col("flatprice")).alias("flatprice"),
            round(col("terracedprice")).alias("terracedprice"),
            round(col("semidetachedprice")).alias("semidetachedprice"),
            # The alias used to sit inside round(), so the column was not
            # actually named "detachedprice" and its plot reused the
            # previous property label.
            round(col("detachedprice")).alias("detachedprice"),
        )
    )
    p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF (once)
    print(p_dfm.columns.tolist())

    # Human-readable property type for each price column.
    property_labels = {
        "flatprice": "Flat",
        "terracedprice": "Terraced",
        "semidetachedprice": "semi-detached",
        "detachedprice": "detached",
    }
    plot_font = config['plot_fonts']['font']

    # Non-Linear Least-Squares Minimization and Curve Fitting
    # Define model to be Lorentzian and deploy it
    model = LorentzianModel()
    for vcolumn in p_dfm.columns:
        if vcolumn == 'Date':  # yyyyMM is the x axis, as an integer
            continue
        # Plot each price curve in turn and print a fit report for it.
        print(vcolumn)
        params = model.guess(p_dfm[vcolumn], x=p_dfm['Date'])
        result = model.fit(p_dfm[vcolumn], params, x=p_dfm['Date'])
        # plot the data points, initial fit and the best fit
        plt.plot(p_dfm['Date'], p_dfm[vcolumn], 'bo', label='data')
        plt.plot(p_dfm['Date'], result.init_fit, 'k--', label='initial fit')
        plt.plot(p_dfm['Date'], result.best_fit, 'r-', label='best fit')
        plt.legend(loc='upper left')
        plt.xlabel("Year/Month", fontdict=plot_font)
        plt.text(0.35, 0.55,
                 "Fit Based on Non-Linear Lorentzian Model",
                 transform=plt.gca().transAxes,
                 color="grey",
                 fontsize=9)
        property_type = property_labels.get(vcolumn, vcolumn)
        plt.ylabel(f"""{property_type} house prices in millions/GBP""",
                   fontdict=plot_font)
        plt.title(
            f"""Monthly {property_type} price fluctuations in {regionname}""",
            fontdict=plot_font)
        plt.xlim(200901, 202101)
        print(result.fit_report())
        plt.show()
        plt.close()

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
def main():
    """Compute yearly average prices for Kensington and Chelsea and save them to BigQuery.

    Configures the GCS/BigQuery connector settings from the ``v`` settings
    module, reads the configured BigQuery table, keeps the rows for the
    region between 2010-01-01 and 2020-01-01, averages every price column
    per calendar year and overwrites ``DS.yearlyhouseprices``.
    """
    appName = "DS"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)

    # Set the temporary storage location
    spark.conf.set("temporaryGcsBucket", v.tmp_bucket)
    spark.sparkContext.setLogLevel("ERROR")
    HadoopConf = sc._jsc.hadoopConfiguration()
    HadoopConf.set("fs.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    HadoopConf.set("fs.AbstractFileSystem.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

    # needed filters
    # `regionname` was referenced but never defined; the row-count message
    # below names this region.
    regionname = "Kensington and Chelsea"
    start_date = "2010-01-01"
    end_date = "2020-01-01"

    spark.conf.set("GcpJsonKeyFile", v.jsonKeyFile)
    spark.conf.set("BigQueryProjectId", v.projectId)
    spark.conf.set("BigQueryDatasetLocation", v.datasetLocation)
    spark.conf.set("google.cloud.auth.service.account.enable", "true")
    spark.conf.set("fs.gs.project.id", v.projectId)
    spark.conf.set("fs.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    spark.conf.set("fs.AbstractFileSystem.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

    from pyspark.sql.window import Window

    # read data from the Bigquery table in staging area
    print("\nreading data from " + v.projectId + ":" + v.inputTable)
    read_df = (
        spark.read
        .format("bigquery")
        .option("credentialsFile", v.jsonKeyFile)
        .option("project", v.projectId)
        .option("parentProject", v.projectId)
        .option("dataset", v.targetDataset)
        .option("table", v.targetTable)
        .option("temporaryGcsBucket", v.tmp_bucket)
        .load()
    )

    # This was `summary_df == ...` (a comparison), which left summary_df
    # undefined.  The bounds are full dates, so they are applied to the
    # `date` column that the rest of this function uses.
    # NOTE(review): the filter previously named a `Year` column - confirm the
    # input table has no such column.
    summary_df = read_df.filter(
        (col("date").between(start_date, end_date))
        & (lower(col("regionname")) == regionname.lower()))
    summary_df.printSchema()
    rows = summary_df.count()
    print("Total number of rows for Kensington and Chelsea is ", rows)

    # One window per calendar year; the averages below are taken over it.
    wSpecY = Window().partitionBy(F.date_format('date', "yyyy"))
    # `ascending` was previously misspelled as `asending`, so the intended
    # keyword never reached orderBy().
    df2 = (
        summary_df
        .select(
            F.date_format(F.col("date"), 'yyyy').alias('Year'),
            F.round(F.avg(F.col("averageprice")).over(wSpecY)).alias('AVGPricePerYear'),
            F.round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear'),
            F.round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear'),
            F.round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear'),
            F.round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear'),
        )
        .distinct()
        .orderBy('date', ascending=True)
    )
    df2.show(10, False)

    # Save the result set to a BigQuery table. Table is created if it does not exist
    print(f"""\nsaving data to {v.DSDB}.yearlyhouseprices""")
    (
        df2.write
        .format("bigquery")
        .option("temporaryGcsBucket", v.tmp_bucket)
        .mode("overwrite")
        .option("table", "DS.yearlyhouseprices")
        .save()
    )

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
def main():
    """Append a batch of generated rows to the Hive table named in the ``v`` settings.

    Creates the table when it does not exist, continues numbering from the
    current maximum ID, generates ``numRows`` rows with the ``uf`` helper
    functions and inserts them through a temporary view.
    """
    appName = "app1"
    spark = s.spark_session(appName)
    spark.sparkContext._conf.setAll(v.settings)
    sc = s.sparkcontext()
    print(sc.getConf().getAll())
    started = spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
    ).collect()
    print("\nStarted at")
    uf.println(started)

    ## do in increment of 50K rows otherwise you blow up driver memory!
    numRows = 10

    ## Check if table exist otherwise create it
    existing_rows = 0
    matching_tables = spark.sql(
        f"""SHOW TABLES IN {v.DB} like '{v.tableName}'""").count()
    if matching_tables == 1:
        spark.sql(
            f"""ANALYZE TABLE {v.fullyQualifiedTableName} compute statistics"""
        )
        existing_rows = spark.sql(
            f"""SELECT COUNT(1) FROM {v.fullyQualifiedTableName}"""
        ).collect()[0][0]
        print("number of rows is ", existing_rows)
    else:
        print(
            f"\nTable {v.fullyQualifiedTableName} does not exist, creating table "
        )
        create_sql = f"""
            CREATE TABLE {v.DB}.{v.tableName}(
              ID INT
            , CLUSTERED INT
            , SCATTERED INT
            , RANDOMISED INT
            , RANDOM_STRING VARCHAR(50)
            , SMALL_VC VARCHAR(50)
            , PADDING VARCHAR(4000)
            )
            STORED AS PARQUET
            """
        spark.sql(create_sql)

    # Continue numbering after the highest ID already stored.
    if existing_rows == 0:
        first_id = 1
    else:
        highest_id = spark.sql(
            f"SELECT MAX(id) FROM {v.fullyQualifiedTableName}"
        ).collect()[0][0]
        first_id = highest_id + 1
    last_id = first_id + numRows - 1
    print("starting at ID = ", first_id, ",ending on = ", last_id)

    ## Walk the ID range; each ID is fed to the helper functions that
    ## generate the random column values.
    generated = sc.parallelize(range(first_id, last_id + 1)).map(
        lambda x: (x,
                   uf.clustered(x, numRows),
                   uf.scattered(x, numRows),
                   uf.randomised(x, numRows),
                   uf.randomString(50),
                   uf.padString(x, " ", 50),
                   uf.padSingleChar("x", 4000)))

    # Rename the positional tuple columns (_1 .. _7) to the table's columns.
    column_names = ["ID", "CLUSTERED", "SCATTERED", "RANDOMISED",
                    "RANDOM_STRING", "SMALL_VC", "PADDING"]
    df = generated.toDF()
    for position, column_name in enumerate(column_names, start=1):
        df = df.withColumnRenamed(f"_{position}", column_name)

    df.write.mode("overwrite").saveAsTable("pycharm.ABCD")
    df.printSchema()
    df.explain()
    df.createOrReplaceTempView("tmp")
    insert_sql = f"""
        INSERT INTO TABLE {v.fullyQualifiedTableName}
        SELECT
              ID
            , CLUSTERED
            , SCATTERED
            , RANDOMISED
            , RANDOM_STRING
            , SMALL_VC
            , PADDING
        FROM tmp
        """
    spark.sql(insert_sql)
    spark.sql(
        f"SELECT MIN(id) AS minID, MAX(id) AS maxID FROM {v.fullyQualifiedTableName}"
    ).show(n=20, truncate=False, vertical=False)

    finished = spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
    ).collect()
    print("\nFinished at")
    uf.println(finished)
    spark.sql("show databases").show()
def main():
    """Merge rows from a delta parquet file that are missing from the current snapshot.

    Reads the snapshot named by ``config['ParquetVariables']['sourceSmall']``
    and a delta file, finds delta rows whose ID is absent from the snapshot
    (left outer join), unions them onto the snapshot and shows the result
    ordered by ID.  The function then exits the process.
    """
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)
    print(
        f"""Reading from parquet file {config['ParquetVariables']['sourceSmall']}"""
    )
    # read from the source file
    currentSnapshot = spark.read.load(
        config['ParquetVariables']['sourceSmall'])
    currentSnapshot.printSchema()
    print(f"""\nRows in source file is""", currentSnapshot.count())
    print(currentSnapshot.rdd.getStorageLevel())
    currentSnapshot = currentSnapshot.repartition(5)
    print(currentSnapshot.rdd.getStorageLevel())

    # read from delta files
    deltaFile = "gs://etcbucket/randomdata/staging/randomdatapy_208150201_208150210"
    newAddedDeltaFiles = spark.read.load(deltaFile)

    # check missing records with source file
    # find out IDs that do not exist in source
    newAddedDeltaFiles.createOrReplaceTempView("newAddedDeltaFiles")
    currentSnapshot.createOrReplaceTempView("currentSnapshot")
    sqltext = """SELECT
              newAddedDeltaFiles.ID
            , newAddedDeltaFiles.CLUSTERED
            , newAddedDeltaFiles.SCATTERED
            , newAddedDeltaFiles.RANDOMISED
            , newAddedDeltaFiles.RANDOM_STRING
            , newAddedDeltaFiles.SMALL_VC
            , newAddedDeltaFiles.PADDING
        FROM newAddedDeltaFiles
        LEFT OUTER JOIN currentSnapshot
          ON newAddedDeltaFiles.ID = currentSnapshot.ID
        WHERE currentSnapshot.ID IS NULL
        ORDER BY newAddedDeltaFiles.ID"""
    missingRows = spark.sql(sqltext)
    # Report the number of missing rows; this used to print the snapshot's
    # row count under the same message.
    print(f"""\nRows in deltafiles that do not exist in source file""",
          missingRows.count())
    newSnapshot = currentSnapshot.union(missingRows)
    # show() prints the rows itself and returns None, so it is not wrapped
    # in print().
    newSnapshot.orderBy(col("ID")).show(10000)
    sys.exit()

    # NOTE(review): everything below is unreachable because of the sys.exit()
    # above, and `df2` is not defined anywhere in this function - presumably
    # `newSnapshot` was meant; confirm before removing the exit.
    print(
        f"""Writing to parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    df2.write.mode(config['ParquetVariables']['overwrite']).parquet(
        config['ParquetVariables']['targetLocation'])
    df3 = spark.read.load(config['ParquetVariables']['targetLocation'])
    print(
        f"""Reading from parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    print(f"""\nRows in target table is""", df3.count())
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
class Sales:
    # Report executed at class-definition time: lists customers of
    # {v.DB2}.{v.table2} whose total sales exceed 94000 and whose average
    # order is below the standard deviation of their orders, ordered by
    # total amount descending.
    appName = "sales"
    spark = s.spark_session(appName)
    # NOTE(review): "hive.metastore.authorization.storage.checks" appears
    # twice below ("false", then "/apps/hive/warehouse"); both entries are
    # kept as they were - confirm which value is intended.
    settings = [
        ("hive.exec.dynamic.partition", "true"),
        ("hive.exec.dynamic.partition.mode", "nonstrict"),
        ("spark.sql.orc.filterPushdown", "true"),
        ("hive.msck.path.validation", "ignore"),
        ("spark.sql.caseSensitive", "true"),
        ("spark.speculation", "false"),
        ("hive.metastore.authorization.storage.checks", "false"),
        ("hive.metastore.client.connect.retry.delay", "5s"),
        ("hive.metastore.client.socket.timeout", "1800s"),
        ("hive.metastore.connect.retries", "12"),
        ("hive.metastore.execute.setugi", "false"),
        ("hive.metastore.failure.retries", "12"),
        ("hive.metastore.schema.verification", "false"),
        ("hive.metastore.schema.verification.record.version", "false"),
        ("hive.metastore.server.max.threads", "100000"),
        ("hive.metastore.authorization.storage.checks", "/apps/hive/warehouse"),
        ("hive.stats.autogather", "true"),
    ]
    spark.sparkContext._conf.setAll(settings)
    sc = s.sparkcontext()
    hivecontext = s.hivecontext()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)
    rows = spark.sql(
        f"""SELECT COUNT(1) FROM {v.DB2}.{v.table2}""").collect()[0][0]
    # mystddev recomputes the sample standard deviation by hand next to the
    # built-in STDDEV.
    sqltext = f"""
        SELECT
              rs.Customer_ID
            , rs.Number_of_orders
            , rs.Total_customer_amount
            , rs.Average_order
            , rs.Standard_deviation
            , rs.mystddev
        FROM
        (
            SELECT cust_id AS Customer_ID
                , COUNT(amount_sold) AS Number_of_orders
                , SUM(amount_sold) AS Total_customer_amount
                , AVG(amount_sold) AS Average_order
                , STDDEV(amount_sold) AS Standard_deviation
                , SQRT((SUM(POWER(AMOUNT_SOLD,2))-(COUNT(1)*POWER(AVG(AMOUNT_SOLD),2)))/(COUNT(1)-1)) AS mystddev
            FROM {v.DB2}.{v.table2}
            GROUP BY cust_id
            HAVING SUM(amount_sold) > 94000
            AND AVG(amount_sold) < STDDEV(amount_sold)
        ) rs
        ORDER BY 3 DESC
        """
    # Build the query once and reuse it for both the listing and the schema.
    df = spark.sql(sqltext)
    df.show(1000, False)
    df.printSchema()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)