def SendToBigQuery(df, batchId):
    if len(df.take(1)) > 0:
        # df.printSchema()
        df.persist()
        spark_session = s.spark_session(config['common']['appName'])
        spark_session = s.setSparkConfBQ(spark_session)
        # read the reference data from BigQuery (a Redis read is kept below as a commented-out alternative)
        read_df = s.loadTableFromBQ(spark_session,
                                    config['MDVariables']['targetDataset'],
                                    config['MDVariables']['targetTable'])
        # read_df = s.loadTableFromRedis(spark_session, config['RedisVariables']['targetTable'], config['RedisVariables']['keyColumn'])
        # look for high-value tickers and write them to config['MDVariables']['targetTable'] in BigQuery
        for row in df.rdd.collect():
            rowkey = row.rowkey
            ticker = row.ticker
            price = row.price
            values = bigQueryAverages(ticker, price, read_df)
            Average = values["average"]
            standardDeviation = values["standardDeviation"]
            lower = values["lower"]
            upper = values["upper"]
            if lower is not None and upper is not None:
                hvTicker = priceComparison(ticker, price, lower, upper)
                if hvTicker == 1:
                    writeHighValueData(df, rowkey)
        df.unpersist()
    else:
        print("DataFrame is empty")
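# `bigQueryAverages`, `priceComparison` and `writeHighValueData` are defined
# elsewhere and not shown in this section. A minimal sketch of the first two,
# assuming read_df carries 'ticker' and 'price' columns and that a "high value"
# price is one falling outside a two-standard-deviation band around the
# historical mean (the band width is an assumption):
from pyspark.sql import functions as F

def bigQueryAverages(ticker, price, read_df):
    # aggregate the historical mean and stddev of price for this ticker
    stats = read_df.filter(F.col("ticker") == ticker) \
                   .agg(F.avg("price").alias("average"),
                        F.stddev("price").alias("standardDeviation")) \
                   .collect()[0]
    average, sd = stats["average"], stats["standardDeviation"]
    if average is None or sd is None:  # no history for this ticker yet
        return {"average": None, "standardDeviation": None,
                "lower": None, "upper": None}
    return {"average": average, "standardDeviation": sd,
            "lower": average - 2 * sd, "upper": average + 2 * sd}

def priceComparison(ticker, price, lower, upper):
    # flag the ticker when the current price falls outside the band
    return 1 if (price < lower or price > upper) else 0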
def main():
    print(f"""Getting average yearly prices per region for all""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at")
    uf.println(lst)
    wSpecY = Window().partitionBy(F.date_format('Date', "yyyy"), 'regionname')
    house_df = s.loadTableFromBQ(spark,
                                 config['GCPVariables']['sourceDataset'],
                                 config['GCPVariables']['sourceTable'])
    house_df.printSchema()
    house_df.show(2, False)
    print(f"""\nAnnual house prices per region in GBP""")
    # work out the yearly average prices
    df2 = house_df.select(
          F.date_format('Date', 'yyyy').cast("Integer").alias('year')
        , 'regionname'
        , round(F.avg('averageprice').over(wSpecY)).alias('AVGPricePerYear')
        , round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear')
        , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear')
        , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear')
        , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')
    ).distinct().orderBy('year', ascending=True)  # 'Date' is aliased to 'year' above, so order by 'year'
    df2.show(20, False)
    s.writeTableToBQ(df2, "overwrite",
                     config['GCPVariables']['targetDataset'],
                     config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
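# `s.writeTableToBQ` and `s.loadTableFromBQ` are thin wrappers in the shared
# module and are not shown in this section. A plausible sketch of the write
# side, assuming the spark-bigquery connector and a staging bucket held in
# config (the 'tmp_bucket' key is an assumption):
def writeTableToBQ(dataFrame, mode, targetDataset, targetTable):
    # the indirect write path stages the data in a GCS bucket before loading it
    dataFrame.write.format("bigquery") \
        .mode(mode) \
        .option("temporaryGcsBucket", config['GCPVariables']['tmp_bucket']) \
        .option("table", f"{targetDataset}.{targetTable}") \
        .save()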
def loadIntoBQTable(self, df2):
    # write to the BigQuery table
    s.writeTableToBQ(df2, "overwrite",
                     config['GCPVariables']['targetDataset'],
                     config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    # read the data back to ensure it all loaded OK
    read_df = s.loadTableFromBQ(self.spark,
                                config['GCPVariables']['targetDataset'],
                                config['GCPVariables']['yearlyAveragePricesAllTable'])
    # check that all rows are there
    if df2.subtract(read_df).count() == 0:
        print("Data has been loaded OK to BQ table")
    else:
        print("Data could not be loaded to BQ table, quitting")
        sys.exit(1)
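# The matching read-side wrapper might look like the sketch below. Note that
# df2.subtract(read_df) above is a set difference on rows, so the check
# tolerates row reordering but catches missing or altered values.
def loadTableFromBQ(spark, dataset, tableName):
    # read a BigQuery table back into a DataFrame via the connector
    return spark.read.format("bigquery") \
        .option("table", f"{dataset}.{tableName}") \
        .load()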
def main():
    regionname = sys.argv[1]  # parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Creating yearly percentage tables for {regionname}""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    # get data from the BigQuery table
    start_date = "2010"
    end_date = "2020"
    yearTable = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at")
    uf.println(lst)
    spark = s.setSparkConfBQ(spark)
    read_df = s.loadTableFromBQ(spark,
                                config['GCPVariables']['targetDataset'],
                                config['GCPVariables']['yearlyAveragePricesAllTable'])
    house_df = read_df.filter(
        (col("Year").between(start_date, end_date)) &
        (lower(col("regionname")) == regionname.lower()))
    wSpecPY = Window().orderBy('regionname', 'Year')
    df_lagY = house_df.withColumn(
        "prev_year_value",
        F.lag(house_df['AVGPricePerYear']).over(wSpecPY))
    resultY = df_lagY.withColumn(
        'percent_change',
        F.when(F.isnull(house_df.AVGPricePerYear - df_lagY.prev_year_value), 0)
         .otherwise(F.round(((house_df.AVGPricePerYear - df_lagY.prev_year_value) * 100.)
                            / df_lagY.prev_year_value, 1)))
    print(f"""\nYearly house price changes in {regionname} in GBP""")
    rsY = resultY.select('Year', 'AVGPricePerYear', 'prev_year_value', 'percent_change')
    rsY.show(36, False)
    s.writeTableToBQ(rsY, "overwrite",
                     config['GCPVariables']['targetDataset'], yearTable)
    print(f"""Created {yearTable}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
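# The F.lag window pattern above is the heart of the percent-change
# calculation. A self-contained toy example with synthetic, illustrative
# numbers showing the same construction:
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.appName("lagDemo").getOrCreate()
toy = spark.createDataFrame(
    [("Kensington", 2018, 1200000.0),
     ("Kensington", 2019, 1260000.0),
     ("Kensington", 2020, 1197000.0)],
    ["regionname", "Year", "AVGPricePerYear"])
w = Window.orderBy('regionname', 'Year')
lagged = toy.withColumn("prev_year_value",
                        F.lag("AVGPricePerYear").over(w))
result = lagged.withColumn(
    "percent_change",
    F.when(F.isnull(F.col("AVGPricePerYear") - F.col("prev_year_value")), 0)
     .otherwise(F.round((F.col("AVGPricePerYear") - F.col("prev_year_value")) * 100.0
                        / F.col("prev_year_value"), 1)))
result.show()  # the first year has no predecessor, so 0; then +5.0 and -5.0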
def main():
    regionname = sys.argv[1]  # parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Getting yearly percentage tables for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    # get data from the BigQuery table
    tableName = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    spark = s.setSparkConfBQ(spark)
    print("\nStarted at")
    uf.println(lst)
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['targetDataset'], tableName)
    summary_df = read_df.select(
        col("Year"), col("percent_change").alias("PercentYearlyChange"))
    p_df = summary_df.toPandas()
    print(p_df)
    p_df.plot(kind='bar', stacked=False, x='Year', y=['PercentYearlyChange'])
    plt.xlabel("Year", fontdict=config['plot_fonts']['font'])
    plt.ylabel("Annual Percent Property Price change",
               fontdict=config['plot_fonts']['font'])
    plt.title(f"""Property price fluctuations in {regionname} for the past 10 years """,
              fontdict=config['plot_fonts']['font'])
    plt.margins(0.15)
    plt.subplots_adjust(bottom=0.25)
    plt.show()
    plt.close()
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
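# config['plot_fonts']['font'] is read from the project config file. Its exact
# contents are not shown here; a plausible shape, since matplotlib's fontdict
# accepts these keys (the values below are illustrative assumptions):
plot_fonts_example = {
    'font': {
        'family': 'serif',
        'color': 'darkred',
        'weight': 'normal',
        'size': 14,
    }
}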
def main():
    regionname = sys.argv[1]  # parameter passed
    short = regionname.replace(" ", "").lower()
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    spark = s.setSparkConfBQ(spark)
    # get data from the BigQuery table
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at")
    uf.println(lst)
    # model predictions
    read_df = s.loadTableFromBQ(spark,
                                config['GCPVariables']['sourceDataset'],
                                config['GCPVariables']['sourceTable'])
    df_10 = read_df.filter(
        F.date_format('Date', "yyyyMM").cast("Integer").between(start_date, end_date) &
        (lower(col("regionname")) == regionname.lower())
    ).select(
          F.date_format('Date', "yyyyMM").cast("Integer").alias("Date")
        , round(col("flatprice")).alias("flatprice")
        , round(col("terracedprice")).alias("terracedprice")
        , round(col("semidetachedprice")).alias("semidetachedprice")
        , round(col("detachedprice")).alias("detachedprice"))
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # convert the Spark DF to a Pandas DF
    # Non-Linear Least-Squares Minimization and Curve Fitting.
    # Define the model to be Lorentzian and deploy it.
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if p_dfm.columns[i] != 'Date':  # yyyyMM, as an integer, is the x axis
            # loop through the price columns, plot each average curve in turn,
            # then print a fit report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
            params = model.guess(p_dfm[vcolumn], x=p_dfm['Date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['Date'])
            # plot the data points, the initial fit and the best fit
            plt.plot(p_dfm['Date'], p_dfm[vcolumn], 'bo', label='data')
            plt.plot(p_dfm['Date'], result.init_fit, 'k--', label='initial fit')
            plt.plot(p_dfm['Date'], result.best_fit, 'r-', label='best fit')
            plt.legend(loc='upper left')
            plt.xlabel("Year/Month", fontdict=config['plot_fonts']['font'])
            plt.text(0.35, 0.55,
                     "Fit Based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey", fontsize=9)
            if vcolumn == "flatprice": property = "Flat"
            if vcolumn == "terracedprice": property = "Terraced"
            if vcolumn == "semidetachedprice": property = "Semi-detached"
            if vcolumn == "detachedprice": property = "Detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=config['plot_fonts']['font'])
            plt.title(f"""Monthly {property} price fluctuations in {regionname}""",
                      fontdict=config['plot_fonts']['font'])
            plt.xlim(200901, 202101)
            print(result.fit_report())
            plt.show()
            plt.close()
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
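# LorentzianModel comes from lmfit (lmfit.models). Stripped of the Spark and
# plotting scaffolding, the guess/fit/report cycle used above looks like this
# on synthetic data:
import numpy as np
from lmfit.models import LorentzianModel

x = np.linspace(-10, 10, 201)
# a noisy Lorentzian peak centred at 1.5
y = 3.0 / (1 + ((x - 1.5) / 2.0) ** 2) + np.random.normal(0, 0.05, x.size)

model = LorentzianModel()
params = model.guess(y, x=x)        # initial parameter estimates from the data
result = model.fit(y, params, x=x)  # non-linear least-squares fit
print(result.fit_report())          # amplitude, center, sigma, fit statistics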