def main(): print (f"""Getting average yearly prices per region for all""") appName = config['common']['appName'] spark = s.spark_session(appName) sc = s.sparkcontext() spark = s.setSparkConfBQ(spark) lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect() print("\nStarted at");uf.println(lst) wSpecY = Window().partitionBy(F.date_format('Date',"yyyy"), 'regionname') house_df = s.loadTableFromBQ(spark,config['GCPVariables']['sourceDataset'],config['GCPVariables']['sourceTable']) house_df.printSchema() house_df.show(2, False) print(f"""\nAnnual House prices per regions in GBP""") # Workout yearly aversge prices df2 = house_df. \ select( \ F.date_format('Date', 'yyyy').cast("Integer").alias('year') \ , 'regionname' \ , round(F.avg('averageprice').over(wSpecY)).alias('AVGPricePerYear') \ , round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear') \ , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear') \ , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear') \ , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')). \ distinct().orderBy('Date', asending=True) df2.show(20,False) s.writeTableToBQ(df2,"overwrite",config['GCPVariables']['targetDataset'],config['GCPVariables']['yearlyAveragePricesAllTable']) print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""") lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect() print("\nFinished at");uf.println(lst)
def main(): print(f"""Getting average yearly prices per region for all""") # read data through jdbc from Oracle appName = config['common']['appName'] spark = s.spark_session(appName) sc = s.sparkcontext() lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ") ).collect() print("\nStarted at") uf.println(lst) wSpecY = Window().partitionBy(F.date_format('datetaken', "yyyy"), 'regionname') tableName = config['OracleVariables']['sourceTable'] fullyQualifiedTableName = config['OracleVariables'][ 'dbschema'] + '.' + tableName print("reading from Oracle table") house_df = s.loadTableFromOracleJDBC(spark, fullyQualifiedTableName) house_df.printSchema() house_df.show(5, False) print(f"""\nAnnual House prices per regions in GBP""") # Workout yearly aversge prices df2 = house_df. \ select( \ F.date_format('datetaken','yyyy').cast("Integer").alias('YEAR') \ , 'REGIONNAME' \ , round(F.avg('averageprice').over(wSpecY)).alias('AVGPRICEPERYEAR') \ , round(F.avg('flatprice').over(wSpecY)).alias('AVGFLATPRICEPERYEAR') \ , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTERRACEDPRICEPERYEAR') \ , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSDPRICEPRICEPERYEAR') \ , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDETACHEDPRICEPERYEAR')). \ distinct().orderBy('datetaken', asending=True) df2.printSchema() df2.show(20, False) # write to Oracle table, all uppercase not mixed case and column names <= 30 characters in version 12.1 s.writeTableToOracle( df2, "overwrite", config['OracleVariables']['dbschema'], config['OracleVariables']['yearlyAveragePricesAllTable']) print( f"""created {config['OracleVariables']['yearlyAveragePricesAllTable']}""" ) lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ") ).collect() print("\nFinished at") uf.println(lst)
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Creating Yearly percentage tables for {regionname}""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    #
    # Get data from BigQuery table
    tableName = "yearlyaveragepricesAllTable"
    start_date = "2010"
    end_date = "2020"
    yearTable = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)
    spark = s.setSparkConfBQ(spark)
    read_df = s.loadTableFromBQ(
        spark, config['GCPVariables']['targetDataset'],
        config['GCPVariables']['yearlyAveragePricesAllTable'])
    house_df = read_df.filter(
        (col("Year").between(f'{start_date}', f'{end_date}'))
        & (lower(col("regionname")) == f'{regionname}'.lower()))
    wSpecPY = Window().orderBy('regionname', 'Year')
    df_lagY = house_df.withColumn(
        "prev_year_value",
        F.lag(house_df['AVGPricePerYear']).over(wSpecPY))
    resultY = df_lagY.withColumn(
        'percent_change',
        F.when(F.isnull(house_df.AVGPricePerYear - df_lagY.prev_year_value), 0). \
        otherwise(F.round(((house_df.AVGPricePerYear - df_lagY.prev_year_value) * 100.) / df_lagY.prev_year_value, 1)))
    print(f"""\nYear House price changes in {regionname} in GBP""")
    rsY = resultY.select('Year', 'AVGPricePerYear', 'prev_year_value',
                         'percent_change')
    rsY.show(36, False)
    s.writeTableToBQ(rsY, "overwrite",
                     config['GCPVariables']['targetDataset'], yearTable)
    print(f"""Created {yearTable}""")
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
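The lag/percent_change pattern above is easier to see on a tiny in-memory DataFrame. A self-contained illustration (the numbers are made up for the example; the column names mirror the script):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("lagExample").getOrCreate()
df = spark.createDataFrame(
    [("Kensington and Chelsea", 2010, 100000.0),
     ("Kensington and Chelsea", 2011, 105000.0),
     ("Kensington and Chelsea", 2012, 99750.0)],
    ["regionname", "Year", "AVGPricePerYear"])
w = Window().orderBy("regionname", "Year")
df = df.withColumn("prev_year_value", F.lag("AVGPricePerYear").over(w))
df = df.withColumn(
    "percent_change",
    F.when(F.isnull(F.col("AVGPricePerYear") - F.col("prev_year_value")), 0).
    otherwise(F.round((F.col("AVGPricePerYear") - F.col("prev_year_value")) * 100.0
                      / F.col("prev_year_value"), 1)))
df.show()
# 2010 has no previous year, so percent_change is 0
# 2011: (105000 - 100000) * 100 / 100000 = 5.0
# 2012: (99750 - 105000) * 100 / 105000 = -5.0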
def main(): print(f"""Getting average yearly prices per region for all""") regionname = sys.argv[1] ## parameter passed short = regionname.replace(" ", "").lower() print(f"""Getting Yearly percentages tables for {regionname}""") appName = "ukhouseprices" spark = s.spark_session(appName) # Get data from BigQuery table tableName = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}""" lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ") ).collect() spark = s.setSparkConfBQ(spark) print("\nStarted at") uf.println(lst) read_df = s.loadTableFromBQ(spark, config['GCPVariables']['targetDataset'], tableName) summary_df = read_df.select( col("Year"), col("percent_change").alias("PercentYearlyChange")) p_df = summary_df.toPandas() print(p_df) p_df.plot(kind='bar', stacked=False, x='Year', y=['PercentYearlyChange']) plt.xlabel("Year", fontdict=config['plot_fonts']['font']) plt.ylabel("Annual Percent Property Price change", fontdict=config['plot_fonts']['font']) plt.title( f"""Property price fluctuations in {regionname} for the past 10 years """, fontdict=config['plot_fonts']['font']) plt.margins(0.15) plt.subplots_adjust(bottom=0.25) plt.show() plt.close() lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ") ).collect() print("\nFinished at") uf.println(lst)
from conf import parameters as v

appName = "ukhouseprices"
spark = s.spark_session(appName)
spark.sparkContext._conf.setAll(v.settings)
sc = s.sparkcontext()
#
# Get data from Hive table
regionname = "Kensington and Chelsea"
tableName = "ukhouseprices"
fullyQualifiedTableName = v.DSDB + '.' + tableName
summaryTableName = v.DSDB + '.' + 'summary'
start_date = "2010-01-01"
end_date = "2020-01-01"
lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
print("\nStarted at")
uf.println(lst)
if (spark.sql(f"""SHOW TABLES IN {v.DSDB} like '{tableName}'""").count() == 1):
    spark.sql(f"""ANALYZE TABLE {fullyQualifiedTableName} compute statistics""")
    rows = spark.sql(f"""SELECT COUNT(1) FROM {fullyQualifiedTableName}""").collect()[0][0]
    print("Total number of rows is ", rows)
else:
    print(f"""No such table {fullyQualifiedTableName}""")
    sys.exit(1)
# Partitioning a DataFrame write by date components, see
# https://stackoverflow.com/questions/59278835/pyspark-how-to-write-dataframe-partition-by-year-month-day-hour-sub-directory
wSpecY = Window().partitionBy(F.date_format('datetaken', "yyyy"))
house_df = spark.sql(f"""select * from {fullyQualifiedTableName} where regionname = '{regionname}'""")
rows = spark.sql(f"""SELECT COUNT(1) FROM {fullyQualifiedTableName} where regionname = '{regionname}'""").collect()[0][0]
print(f"Total number of rows for {regionname} is ", rows)
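The Stack Overflow link referenced above deals with writing a DataFrame partitioned by date components. A sketch of that pattern for this table, assuming a year-partitioned parquet output is wanted; the output path is illustrative:

house_df.withColumn("year", F.date_format("datetaken", "yyyy")). \
    write. \
    mode("overwrite"). \
    partitionBy("year"). \
    parquet("/tmp/ukhouseprices_by_year")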
spark = s.spark_session(appName)
spark.sparkContext._conf.setAll(v.settings)
sc = s.sparkcontext()
#
# Get data from Hive table
regionname = "Kensington and Chelsea"
tableName = "ukhouseprices"
fullyQualifiedTableName = v.DSDB + '.' + tableName
summaryTableName = v.DSDB + '.' + 'summary'
start_date = "201001"
end_date = "202001"
lst = (spark.sql(
    "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
).collect()
print("\nStarted at")
uf.println(lst)
# Model predictions
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
summary_df = spark.sql(
    f"""SELECT cast(date_format(datetaken, "yyyyMM") as int) as datetaken, flatprice, terracedprice, semidetachedprice, detachedprice FROM {summaryTableName}"""
)
df_10 = summary_df.filter(
    col("datetaken").between(f'{start_date}', f'{end_date}'))
print(df_10.toPandas().columns.tolist())
p_dfm = df_10.toPandas()  # converting Spark DF to Pandas DF
# Non-Linear Least-Squares Minimization and Curve Fitting
# Define model to be Lorentzian and deploy it
model = LorentzianModel()
n = len(p_dfm.columns)
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Getting plots for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    #
    # Get data from BigQuery table
    summaryTableName = v.fullyQualifiedoutputTableId
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)
    # Model predictions
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # read data from the BigQuery table summary
    print("\nreading data from " + v.fullyQualifiedoutputTableId)
    summary_df = spark.read. \
        format("bigquery"). \
        option("credentialsFile", v.jsonKeyFile). \
        option("project", v.projectId). \
        option("parentProject", v.projectId). \
        option("dataset", v.targetDataset). \
        option("table", v.targetTable). \
        load()
    df_10 = summary_df.filter(F.col("Date").between(f'{start_date}', f'{end_date}')). \
        select(F.date_format('Date', "yyyyMM").cast("Integer").alias("date"),
               'flatprice', 'terracedprice', 'semidetachedprice', 'detachedprice')
    df_10.printSchema()
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # converting Spark DF to Pandas DF
    # Non-Linear Least-Squares Minimization and Curve Fitting
    # Define model to be Lorentzian and deploy it
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if p_dfm.columns[i] != "date":  # yyyyMM is x axis in integer
            # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
            params = model.guess(p_dfm[vcolumn], x=p_dfm['date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['date'])
            result.plot_fit()
            plt.margins(0.15)
            plt.subplots_adjust(bottom=0.25)
            plt.xticks(rotation=90)
            plt.xlabel("year/month", fontdict=v.font)
            plt.text(0.35,
                     0.45,
                     "Best-fit based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey",
                     fontsize=9)
            plt.xlim(left=200900)
            plt.xlim(right=202100)
            if vcolumn == "flatprice":
                property = "Flat"
            if vcolumn == "terracedprice":
                property = "Terraced"
            if vcolumn == "semidetachedprice":
                property = "semi-detached"
            if vcolumn == "detachedprice":
                property = "detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=v.font)
            plt.title(
                f"""Monthly {property} prices fluctuations in {regionname}""",
                fontdict=v.font)
            print(result.fit_report())
            plt.show()
            plt.close()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
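The lmfit workflow used above (guess initial parameters, fit, print a report) can be exercised on synthetic data without Spark at all. A minimal standalone example with a noisy Lorentzian curve:

import numpy as np
import matplotlib.pyplot as plt
from lmfit.models import LorentzianModel

x = np.linspace(-10, 10, 201)
# synthetic Lorentzian centred at 1.5 with some noise
y = 3.0 / (1 + ((x - 1.5) / 2.0) ** 2) + np.random.normal(0, 0.05, x.size)

model = LorentzianModel()
params = model.guess(y, x=x)        # initial parameter estimates from the data
result = model.fit(y, params, x=x)  # non-linear least-squares fit
print(result.fit_report())          # amplitude, center, sigma and fit statistics
result.plot_fit()
plt.show()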
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    spark = s.setSparkConfBQ(spark)
    # Get data from BigQuery table
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)
    # Model predictions
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['sourceDataset'],
                                config['GCPVariables']['sourceTable'])
    df_10 = read_df.filter(F.date_format('Date', "yyyyMM").cast("Integer").between(f'{start_date}', f'{end_date}')
                           & (lower(col("regionname")) == f'{regionname}'.lower())). \
        select(F.date_format('Date', "yyyyMM").cast("Integer").alias("Date") \
             , round(col("flatprice")).alias("flatprice") \
             , round(col("terracedprice")).alias("terracedprice")
             , round(col("semidetachedprice")).alias("semidetachedprice")
             , round(col("detachedprice")).alias("detachedprice"))
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # converting Spark DF to Pandas DF
    # Non-Linear Least-Squares Minimization and Curve Fitting
    # Define model to be Lorentzian and deploy it
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if (p_dfm.columns[i] != 'Date'):  # yyyyMM is x axis in integer
            # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
            params = model.guess(p_dfm[vcolumn], x=p_dfm['Date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['Date'])
            # plot the data points, initial fit and the best fit
            plt.plot(p_dfm['Date'], p_dfm[vcolumn], 'bo', label='data')
            plt.plot(p_dfm['Date'], result.init_fit, 'k--', label='initial fit')
            plt.plot(p_dfm['Date'], result.best_fit, 'r-', label='best fit')
            plt.legend(loc='upper left')
            plt.xlabel("Year/Month", fontdict=config['plot_fonts']['font'])
            plt.text(0.35,
                     0.55,
                     "Fit Based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey",
                     fontsize=9)
            if vcolumn == "flatprice":
                property = "Flat"
            if vcolumn == "terracedprice":
                property = "Terraced"
            if vcolumn == "semidetachedprice":
                property = "semi-detached"
            if vcolumn == "detachedprice":
                property = "detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=config['plot_fonts']['font'])
            plt.title(
                f"""Monthly {property} price fluctuations in {regionname}""",
                fontdict=config['plot_fonts']['font'])
            plt.xlim(200901, 202101)
            print(result.fit_report())
            plt.show()
            plt.close()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
def main():
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nStarted at")
    uf.println(lst)
    print(
        f"""Reading from parquet file {config['ParquetVariables']['sourceSmall']}"""
    )
    # read from the source file
    currentSnapshot = spark.read.load(
        config['ParquetVariables']['sourceSmall'])
    currentSnapshot.printSchema()
    #currentSnapshot.show()
    print(f"""\nRows in source file is""", currentSnapshot.count())
    print(currentSnapshot.rdd.getStorageLevel())
    currentSnapshot = currentSnapshot.repartition(5)
    print(currentSnapshot.rdd.getStorageLevel())
    # read from delta files
    deltaFile = "gs://etcbucket/randomdata/staging/randomdatapy_208150201_208150210"
    newAddedDeltaFiles = spark.read.load(deltaFile)
    # check missing records with source file
    # find out IDs that do not exist in source
    newAddedDeltaFiles.createOrReplaceTempView("newAddedDeltaFiles")
    currentSnapshot.createOrReplaceTempView("currentSnapshot")
    sqltext = """SELECT
                      newAddedDeltaFiles.ID
                    , newAddedDeltaFiles.CLUSTERED
                    , newAddedDeltaFiles.SCATTERED
                    , newAddedDeltaFiles.RANDOMISED
                    , newAddedDeltaFiles.RANDOM_STRING
                    , newAddedDeltaFiles.SMALL_VC
                    , newAddedDeltaFiles.PADDING
                 FROM newAddedDeltaFiles
                 LEFT OUTER JOIN currentSnapshot
                   ON newAddedDeltaFiles.ID = currentSnapshot.ID
                WHERE currentSnapshot.ID IS NULL
                ORDER BY newAddedDeltaFiles.ID"""
    missingRows = spark.sql(sqltext)
    print(f"""\nRows in delta files that do not exist in source file""",
          missingRows.count())
    newSnapshot = currentSnapshot.union(missingRows)
    newSnapshot.orderBy(col("ID")).show(10000)
    sys.exit()
    # the code below is not reached because of sys.exit() above
    #spark.sql(sqltext).write.mode(saveMode)
    print(
        f"""Writing to parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    df2.write.mode(config['ParquetVariables']['overwrite']).parquet(
        config['ParquetVariables']['targetLocation'])
    df3 = spark.read.load(config['ParquetVariables']['targetLocation'])
    print(
        f"""Reading from parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    print(f"""\nRows in target table is""", df3.count())
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
    ).collect()
    print("\nFinished at")
    uf.println(lst)
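The "rows in the delta files that are missing from the current snapshot" query above can also be written with the DataFrame API as a left anti join, which avoids the temporary views. A sketch using the same two DataFrames, assuming they share the same schema:

missingRows = newAddedDeltaFiles.join(currentSnapshot, on="ID", how="left_anti"). \
    orderBy("ID")
# append the missing rows to the current snapshot (columns matched by name)
newSnapshot = currentSnapshot.unionByName(missingRows)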
def main(): appName = "ukhouseprices" spark = s.spark_session(appName) spark.sparkContext._conf.setAll(v.settings) sc = s.sparkcontext() # # Get data from Hive table regionname = "Kensington and Chelsea" tableName = "ukhouseprices" fullyQualifiedTableName = v.DSDB + "." + tableName summaryTableName = v.DSDB + "." + "summary" start_date = "2010" end_date = "2020" lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') " )).collect() print("\nStarted at") uf.println(lst) # Model predictions spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") #summary_df = spark.sql(f"""SELECT cast(date_format(datetaken, "yyyyMM") as int) as datetaken, flatprice, terracedprice, semidetachedprice, detachedprice FROM {summaryTableName}""") summary_df = spark.sql( f"""SELECT cast(Year as int) as year, AVGFlatPricePerYear, AVGTerracedPricePerYear, AVGSemiDetachedPricePerYear, AVGDetachedPricePerYear FROM {v.DSDB}.yearlyhouseprices""" ) df_10 = summary_df.filter( col("year").between(f'{start_date}', f'{end_date}')) print(df_10.toPandas().columns.tolist()) # show pandas column list ['Year', 'AVGPricePerYear', 'AVGFlatPricePerYear', 'AVGTerracedPricePerYear', 'AVGSemiDetachedPricePerYear', 'AVGDetachedPricePerYear'] p_dfm = df_10.toPandas() # converting spark DF to Pandas DF data = p_dfm.values # Non-Linear Least-Squares Minimization and Curve Fitting model = LorentzianModel() n = len(p_dfm.columns) for i in range(n): if p_dfm.columns[i] != 'year': # year is x axis in integer # it goes through the loop and plots individual average curves one by one and then prints a report for each y value vcolumn = p_dfm.columns[i] print(vcolumn) params = model.guess(p_dfm[vcolumn], x=p_dfm['year']) result = model.fit(p_dfm[vcolumn], params, x=p_dfm['year']) result.plot_fit() # do linear regression here # Prepare data for Machine Learning.And we need two columns only — features and label(p_dfm.columns[i]]): inputCols = ['year'] vectorAssembler = VectorAssembler(inputCols=inputCols, outputCol='features') vhouse_df = vectorAssembler.transform(df_10) vhouse_df = vhouse_df.select( ['features', 'AVGFlatPricePerYear']) vhouse_df.show(20) if vcolumn == "AVGFlatPricePerYear": plt.xlabel("Year", fontdict=v.font) plt.ylabel("Flat house prices in millions/GBP", fontdict=v.font) plt.title( f"""Flat price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.text(0.35, 0.45, "Best-fit based on Non-Linear Lorentzian Model", transform=plt.gca().transAxes, color="grey", fontsize=10) print(result.fit_report()) plt.xlim(left=2009) plt.xlim(right=2022) plt.show() plt.close() elif vcolumn == "AVGTerracedPricePerYear": plt.xlabel("Year", fontdict=v.font) plt.ylabel("Terraced house prices in millions/GBP", fontdict=v.font) plt.title( f"""Terraced house price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.text(0.35, 0.45, "Best-fit based on Non-Linear Lorentzian Model", transform=plt.gca().transAxes, color="grey", fontsize=10) print(result.fit_report()) plt.show() plt.close() elif vcolumn == "AVGSemiDetachedPricePerYear": plt.xlabel("Year", fontdict=v.font) plt.ylabel("semi-detached house prices in millions/GBP", fontdict=v.font) plt.title( f"""semi-detached house price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.text(0.35, 0.45, "Best-fit based on Non-Linear Lorentzian Model", transform=plt.gca().transAxes, color="grey", fontsize=10) print(result.fit_report()) plt.show() plt.close() elif vcolumn == "AVGDetachedPricePerYear": 
plt.xlabel("Year", fontdict=v.font) plt.ylabel("detached house prices in millions/GBP", fontdict=v.font) plt.title( f"""detached house price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.text(0.35, 0.45, "Best-fit based on Non-Linear Lorentzian Model", transform=plt.gca().transAxes, color="grey", fontsize=10) print(result.fit_report()) plt.show() plt.close() p_df = df_10.select('AVGFlatPricePerYear', 'AVGTerracedPricePerYear', 'AVGSemiDetachedPricePerYear', 'AVGDetachedPricePerYear').toPandas().describe() print(p_df) #axs = scatter_matrix(p_df, figsize=(10, 10)) # Describe returns a DF where count,mean, min, std,max... are values of the index y = p_df.loc[['min', 'mean', 'max']] #y = p_df.loc[['averageprice', 'flatprice']] ax = y.plot(linewidth=2, colormap='jet', marker='.', markersize=20) plt.grid(True) plt.xlabel("UK House Price Index, January 2020", fontdict=v.font) plt.ylabel("Property Prices in millions/GBP", fontdict=v.font) plt.title( f"""Property price fluctuations in {regionname} for the past 10 years """, fontdict=v.font) plt.legend(p_df.columns) plt.show() plt.close() lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') " )).collect() print("\nFinished at") uf.println(lst)
def main(): appName = "DS" spark = s.spark_session(appName) sc = s.sparkcontext() lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ") ).collect() print("\nStarted at") uf.println(lst) tmp_bucket = "tmp_storage_bucket/tmp" # Set the temporary storage location spark.conf.set("temporaryGcsBucket", v.tmp_bucket) spark.sparkContext.setLogLevel("ERROR") HadoopConf = sc._jsc.hadoopConfiguration() HadoopConf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") HadoopConf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") # needed filters start_date = "2010-01-01" end_date = "2020-01-01" spark.conf.set("GcpJsonKeyFile", v.jsonKeyFile) spark.conf.set("BigQueryProjectId", v.projectId) spark.conf.set("BigQueryDatasetLocation", v.datasetLocation) spark.conf.set("google.cloud.auth.service.account.enable", "true") spark.conf.set("fs.gs.project.id", v.projectId) spark.conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") spark.conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") spark.conf.set("temporaryGcsBucket", v.tmp_bucket) sqltext = "" from pyspark.sql.window import Window # read data from the Bigquery table in staging area print("\nreading data from " + v.projectId + ":" + v.inputTable) read_df = spark.read. \ format("bigquery"). \ option("credentialsFile",v.jsonKeyFile). \ option("project", v.projectId). \ option("parentProject", v.projectId). \ option("dataset", v.targetDataset). \ option("table", v.targetTable). \ option("temporaryGcsBucket", v.tmp_bucket). \ load() summary_df == read_df.filter( (col("Year").between(f'{start_date}', f'{end_date}')) & (lower(col("regionname")) == f'{regionname}'.lower())) summary_df.printSchema() rows = summary_df.count() print("Total number of rows for Kensington and Chelsea is ", rows) wSpecY = Window().partitionBy(F.date_format('date', "yyyy")) df2 = summary_df. \ select( \ F.date_format(F.col("date"),'yyyy').alias('Year') \ , F.round(F.avg(F.col("averageprice")).over(wSpecY)).alias('AVGPricePerYear') \ , F.round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear') \ , F.round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear') \ , F.round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear') \ , F.round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')). \ distinct().orderBy('date', asending=True) df2.show(10, False) # Save the result set to a BigQuery table. Table is created if it does not exist print(f"""\nsaving data to {v.DSDB}.yearlyhouseprices""") df2. \ write. \ format("bigquery"). \ option("temporaryGcsBucket", v.tmp_bucket).\ mode("overwrite"). \ option("table", "DS.yearlyhouseprices"). \ save() """ summary_df. \ write. \ format("bigquery"). \ mode("overwrite"). \ option("table", v.fullyQualifiedoutputTableId). \ option("temporaryGcsBucket", v.tmp_bucket). \ save() """ lst = (spark.sql( "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ") ).collect() print("\nFinished at") uf.println(lst)