Example #1
def SendToBigQuery(df, batchId):

    if len(df.take(1)) > 0:
        #df.printSchema()
        df.persist()
        # get a Spark session configured for BigQuery access
        spark_session = s.spark_session(config['common']['appName'])
        spark_session = s.setSparkConfBQ(spark_session)
        # read the reference data from BigQuery
        read_df = s.loadTableFromBQ(spark_session,
                                    config['MDVariables']['targetDataset'],
                                    config['MDVariables']['targetTable'])
        #read_df = s.loadTableFromRedis(spark_session, config['RedisVariables']['targetTable'], config['RedisVariables']['keyColumn'])
        # look for high value tickers and write them to config['MDVariables']['targetTable'] in BigQuery
        for row in df.rdd.collect():
            rowkey = row.rowkey
            ticker = row.ticker
            price = row.price
            values = bigQueryAverages(ticker, price, read_df)
            Average = values["average"]
            standardDeviation = values["standardDeviation"]
            lower = values["lower"]
            upper = values["upper"]
            if lower is not None and upper is not None:
                hvTicker = priceComparison(ticker, price, lower, upper)
                if hvTicker == 1:
                    writeHighValueData(df, rowkey)
        df.unpersist()
    else:
        print("DataFrame is empty")
Example #2
def main():
    print (f"""Getting average yearly prices per region for all""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at");uf.println(lst)
    wSpecY = Window().partitionBy(F.date_format('Date',"yyyy"), 'regionname')
    house_df = s.loadTableFromBQ(spark,config['GCPVariables']['sourceDataset'],config['GCPVariables']['sourceTable'])
    house_df.printSchema()
    house_df.show(2, False)

    print(f"""\nAnnual House prices per regions in GBP""")
    # Work out yearly average prices
    df2 = house_df. \
                    select( \
                          F.date_format('Date', 'yyyy').cast("Integer").alias('year') \
                        , 'regionname' \
                        , round(F.avg('averageprice').over(wSpecY)).alias('AVGPricePerYear') \
                        , round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear') \
                        , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear') \
                        , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear') \
                        , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')). \
                    distinct().orderBy('year', ascending=True)
    df2.show(20,False)
    s.writeTableToBQ(df2,"overwrite",config['GCPVariables']['targetDataset'],config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at");uf.println(lst)
Example #3
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Creating Yearly percentage tables for {regionname}""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    # Get data from BigQuery table
    tableName = "yearlyaveragepricesAllTable"
    start_date = "2010"
    end_date = "2020"
    yearTable = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    spark = s.setSparkConfBQ(spark)
    read_df = s.loadTableFromBQ(
        spark, config['GCPVariables']['targetDataset'],
        config['GCPVariables']['yearlyAveragePricesAllTable'])
    house_df = read_df.filter(
        (col("Year").between(f'{start_date}', f'{end_date}'))
        & (lower(col("regionname")) == f'{regionname}'.lower()))
    wSpecPY = Window().orderBy('regionname', 'Year')
    df_lagY = house_df.withColumn(
        "prev_year_value",
        F.lag(house_df['AVGPricePerYear']).over(wSpecPY))
    resultY = df_lagY.withColumn(
        'percent_change',
        F.when(F.isnull(df_lagY.AVGPricePerYear - df_lagY.prev_year_value), 0)
         .otherwise(F.round((df_lagY.AVGPricePerYear - df_lagY.prev_year_value) * 100. / df_lagY.prev_year_value, 1)))
    print(f"""\nYear House price changes in {regionname} in GBP""")
    rsY = resultY.select('Year', 'AVGPricePerYear', 'prev_year_value',
                         'percent_change')
    rsY.show(36, False)
    s.writeTableToBQ(rsY, "overwrite", config['GCPVariables']['targetDataset'],
                     yearTable)
    print(f"""Created {yearTable}""")
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
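The lag/percent-change pattern above is easier to see on a toy DataFrame. A self-contained sketch with made-up prices (the column names mirror the ones used above):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("pct_change_demo").getOrCreate()
toy_df = spark.createDataFrame(
    [("Kensington", 2018, 100000.0),
     ("Kensington", 2019, 110000.0),
     ("Kensington", 2020, 99000.0)],
    ["regionname", "Year", "AVGPricePerYear"])
w = Window.partitionBy("regionname").orderBy("Year")
lagged = toy_df.withColumn("prev_year_value", F.lag("AVGPricePerYear").over(w))
result = lagged.withColumn(
    "percent_change",
    F.when(F.isnull(lagged.AVGPricePerYear - lagged.prev_year_value), 0)
     .otherwise(F.round((lagged.AVGPricePerYear - lagged.prev_year_value) * 100.0 / lagged.prev_year_value, 1)))
result.show()   # 2019 shows +10.0, 2020 shows -10.0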
Example #4
def main():
    print(f"""Getting average yearly prices per region for all""")
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Getting Yearly percentages tables for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    # Get data from BigQuery table
    tableName = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    spark = s.setSparkConfBQ(spark)
    print("\nStarted at")
    uf.println(lst)
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['targetDataset'],
                                tableName)
    summary_df = read_df.select(
        col("Year"),
        col("percent_change").alias("PercentYearlyChange"))
    p_df = summary_df.toPandas()
    print(p_df)
    p_df.plot(kind='bar', stacked=False, x='Year', y=['PercentYearlyChange'])
    plt.xlabel("Year", fontdict=config['plot_fonts']['font'])
    plt.ylabel("Annual Percent Property Price change",
               fontdict=config['plot_fonts']['font'])
    plt.title(
        f"""Property price fluctuations in {regionname} for the past 10 years """,
        fontdict=config['plot_fonts']['font'])
    plt.margins(0.15)
    plt.subplots_adjust(bottom=0.25)
    plt.show()
    plt.close()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Example #5
                     foreachBatch(sendToSink). \
                     trigger(processingTime='30 seconds'). \
                     option('checkpointLocation', checkpoint_path). \
                     queryName(config['MDVariables']['topic']). \
                     start()
            print(result)

        except Exception as e:
            print(f"""{e}, quitting""")
            sys.exit(1)

        #print(result.status)
        #print(result.recentProgress)
        #print(result.lastProgress)

        self.spark.streams.awaitAnyTermination()
        result.awaitTermination()
        #newtopicResult.awaitTermination()


if __name__ == "__main__":

    #appName = config['common']['appName']
    appName = "batch"
    spark_session = s.spark_session(appName)
    spark_session = s.setSparkConfStreaming(spark_session)
    spark_session = s.setSparkConfBQ(spark_session)
    spark_context = s.sparkcontext()
    mdstreaming = MDStreaming(spark_session, spark_context)
    streamingDataFrame = mdstreaming.fetch_data()
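The example above starts in the middle of a writeStream chain inside a try block. For context, a complete foreachBatch pipeline of the same shape looks roughly like the self-contained sketch below, using Spark's built-in rate source and a stand-in sink; the query name, checkpoint path and sink body are placeholders, not the author's values:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("foreachBatch_demo").getOrCreate()

def sendToSink(df, batchId):
    # Stand-in for SendToBigQuery-style logic: just count the rows in each micro-batch.
    print(f"batch {batchId}: {df.count()} rows")

streamingDataFrame = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

result = (streamingDataFrame.writeStream
          .foreachBatch(sendToSink)
          .trigger(processingTime='30 seconds')
          .option('checkpointLocation', '/tmp/foreachBatch_demo_checkpoint')  # placeholder path
          .queryName('rate_demo')
          .start())
result.awaitTermination()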
Example #6
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    spark = s.setSparkConfBQ(spark)
    # Get data from BigQuery table
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    # Model predictions
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['sourceDataset'],
                                config['GCPVariables']['sourceTable'])
    df_10 = read_df.filter(F.date_format('Date',"yyyyMM").cast("Integer").between(f'{start_date}', f'{end_date}') & (lower(col("regionname")) == f'{regionname}'.lower())). \
            select(F.date_format('Date',"yyyyMM").cast("Integer").alias("Date") \
                 , round(col("flatprice")).alias("flatprice") \
                 , round(col("terracedprice")).alias("terracedprice") \
                 , round(col("semidetachedprice")).alias("semidetachedprice") \
                 , round(col("detachedprice")).alias("detachedprice"))
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF

    # Non-Linear Least-Squares Minimization and Curve Fitting
    # Define the model to be Lorentzian and deploy it
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if (p_dfm.columns[i] != 'Date'):  # yyyyMM is x axis in integer
            # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
            params = model.guess(p_dfm[vcolumn], x=p_dfm['Date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['Date'])
            # plot the data points, initial fit and the best fit
            plt.plot(p_dfm['Date'], p_dfm[vcolumn], 'bo', label='data')
            plt.plot(p_dfm['Date'],
                     result.init_fit,
                     'k--',
                     label='initial fit')
            plt.plot(p_dfm['Date'], result.best_fit, 'r-', label='best fit')
            plt.legend(loc='upper left')
            plt.xlabel("Year/Month", fontdict=config['plot_fonts']['font'])
            plt.text(0.35,
                     0.55,
                     "Fit Based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey",
                     fontsize=9)
            if vcolumn == "flatprice": property = "Flat"
            if vcolumn == "terracedprice": property = "Terraced"
            if vcolumn == "semidetachedprice": property = "semi-detached"
            if vcolumn == "detachedprice": property = "detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=config['plot_fonts']['font'])
            plt.title(
                f"""Monthly {property} price fluctuations in {regionname}""",
                fontdict=config['plot_fonts']['font'])
            plt.xlim(200901, 202101)
            print(result.fit_report())
            plt.show()
            plt.close()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
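For readers unfamiliar with lmfit, the guess/fit/fit_report workflow used above can be exercised on synthetic data. A self-contained sketch (the data values are invented for illustration):

import numpy as np
import matplotlib.pyplot as plt
from lmfit.models import LorentzianModel

x = np.linspace(-10, 10, 201)
y = 3.0 / (1.0 + ((x - 1.0) / 2.0) ** 2) + np.random.normal(0, 0.05, x.size)  # noisy Lorentzian-shaped data

model = LorentzianModel()
params = model.guess(y, x=x)        # initial parameter estimates derived from the data
result = model.fit(y, params, x=x)  # non-linear least-squares fit

print(result.fit_report())
plt.plot(x, y, 'bo', label='data')
plt.plot(x, result.init_fit, 'k--', label='initial fit')
plt.plot(x, result.best_fit, 'r-', label='best fit')
plt.legend(loc='upper left')
plt.show()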
Example #7
def main():
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    print(
        f"""Reading from parquet file {config['ParquetVariables']['sourceSmall']}"""
    )
    # read from the source file
    currentSnapshot = spark.read.load(
        config['ParquetVariables']['sourceSmall'])
    currentSnapshot.printSchema()
    #currentSnapshot.show()
    print(f"""\nRows in source file is""", currentSnapshot.count())
    print(currentSnapshot.rdd.getStorageLevel())
    currentSnapshot = currentSnapshot.repartition(5)
    print(currentSnapshot.rdd.getStorageLevel())
    # read from delta files
    deltaFile = "gs://etcbucket/randomdata/staging/randomdatapy_208150201_208150210"
    newAddedDeltaFiles = spark.read.load(deltaFile)
    # check missing records with source file
    # find out IDs that do not exist in source
    newAddedDeltaFiles.createOrReplaceTempView("newAddedDeltaFiles")
    currentSnapshot.createOrReplaceTempView("currentSnapshot")
    sqltext = """SELECT
                     newAddedDeltaFiles.ID
                   , newAddedDeltaFiles.CLUSTERED
                   , newAddedDeltaFiles.SCATTERED
                   , newAddedDeltaFiles.RANDOMISED
                   , newAddedDeltaFiles.RANDOM_STRING
                   , newAddedDeltaFiles.SMALL_VC
                   , newAddedDeltaFiles.PADDING 
                 FROM newAddedDeltaFiles 
                 LEFT OUTER JOIN currentSnapshot ON newAddedDeltaFiles.ID = currentSnapshot.ID
                 WHERE currentSnapshot.ID IS NULL ORDER BY newAddedDeltaFiles.ID"""
    print(f"""\nRows in deltafiles that do not exist in source file""",
          currentSnapshot.count())
    missingRows = spark.sql(sqltext)
    newSnapshot = currentSnapshot.union(missingRows)
    newSnapshot.orderBy(col("ID")).show(10000)
    sys.exit()  # early exit; the write below is currently unreachable
    #spark.sql(sqltext).write.mode(saveMode)
    print(
        f"""Writing to parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    newSnapshot.write.mode(config['ParquetVariables']['overwrite']).parquet(
        config['ParquetVariables']['targetLocation'])
    df3 = spark.read.load(config['ParquetVariables']['targetLocation'])
    print(
        f"""Reading from parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    print(f"""\nRows in target table is""", df3.count())
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
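The SQL above (IDs in the delta files that are absent from the snapshot, followed by a union) can also be written as a DataFrame anti-join. A short equivalent sketch reusing the two DataFrames from this example, assuming ID is the join key and both frames share the same schema:

# Rows present in the delta files but missing from the current snapshot.
missingRows = newAddedDeltaFiles.join(currentSnapshot, on="ID", how="left_anti")
newSnapshot = currentSnapshot.unionByName(missingRows)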