Code Example #1
def main():
    print (f"""Getting average yearly prices per region for all""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at");uf.println(lst)
    wSpecY = Window().partitionBy(F.date_format('Date',"yyyy"), 'regionname')
    house_df = s.loadTableFromBQ(spark,config['GCPVariables']['sourceDataset'],config['GCPVariables']['sourceTable'])
    house_df.printSchema()
    house_df.show(2, False)

    print(f"""\nAnnual House prices per regions in GBP""")
    # Work out yearly average prices
    df2 = house_df. \
                    select( \
                          F.date_format('Date', 'yyyy').cast("Integer").alias('year') \
                        , 'regionname' \
                        , round(F.avg('averageprice').over(wSpecY)).alias('AVGPricePerYear') \
                        , round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear') \
                        , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear') \
                        , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear') \
                        , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')). \
                    distinct().orderBy('year', ascending=True)
    df2.show(20,False)
    s.writeTableToBQ(df2,"overwrite",config['GCPVariables']['targetDataset'],config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at");uf.println(lst)
Code Example #2
def main():
    print(f"""Getting average yearly prices per region for all""")
    # read data through jdbc from Oracle

    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    wSpecY = Window().partitionBy(F.date_format('datetaken', "yyyy"),
                                  'regionname')
    tableName = config['OracleVariables']['sourceTable']
    fullyQualifiedTableName = config['OracleVariables'][
        'dbschema'] + '.' + tableName
    print("reading from Oracle table")
    house_df = s.loadTableFromOracleJDBC(spark, fullyQualifiedTableName)
    house_df.printSchema()
    house_df.show(5, False)
    print(f"""\nAnnual House prices per regions in GBP""")
    # Work out yearly average prices
    df2 = house_df. \
                    select( \
                          F.date_format('datetaken','yyyy').cast("Integer").alias('YEAR') \
                        , 'REGIONNAME' \
                        , round(F.avg('averageprice').over(wSpecY)).alias('AVGPRICEPERYEAR') \
                        , round(F.avg('flatprice').over(wSpecY)).alias('AVGFLATPRICEPERYEAR') \
                        , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTERRACEDPRICEPERYEAR') \
                        , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSDPRICEPRICEPERYEAR') \
                        , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDETACHEDPRICEPERYEAR')). \
                    distinct().orderBy('YEAR', ascending=True)
    df2.printSchema()
    df2.show(20, False)
    # write to Oracle table, all uppercase not mixed case and column names <= 30 characters in version 12.1
    s.writeTableToOracle(
        df2, "overwrite", config['OracleVariables']['dbschema'],
        config['OracleVariables']['yearlyAveragePricesAllTable'])
    print(
        f"""created {config['OracleVariables']['yearlyAveragePricesAllTable']}"""
    )
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Code Example #3
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Creating Yearly percentage tables for {regionname}""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    # Get data from BigQuery table
    tableName = "yearlyaveragepricesAllTable"
    start_date = "2010"
    end_date = "2020"
    yearTable = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    spark = s.setSparkConfBQ(spark)
    read_df = s.loadTableFromBQ(
        spark, config['GCPVariables']['targetDataset'],
        config['GCPVariables']['yearlyAveragePricesAllTable'])
    house_df = read_df.filter(
        (col("Year").between(f'{start_date}', f'{end_date}'))
        & (lower(col("regionname")) == f'{regionname}'.lower()))
    wSpecPY = Window().orderBy('regionname', 'Year')
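    # F.lag() over the year-ordered window pulls the previous year's average into each
    # row so the year-on-year percentage change can be derived below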
    df_lagY = house_df.withColumn(
        "prev_year_value",
        F.lag(house_df['AVGPricePerYear']).over(wSpecPY))
    resultY = df_lagY.withColumn(
        'percent_change',
        F.when(F.isnull(house_df.AVGPricePerYear - df_lagY.prev_year_value),
               0).otherwise(
                   F.round(((house_df.AVGPricePerYear - df_lagY.prev_year_value)
                            * 100.) / df_lagY.prev_year_value, 1)))
    print(f"""\nYear House price changes in {regionname} in GBP""")
    rsY = resultY.select('Year', 'AVGPricePerYear', 'prev_year_value',
                         'percent_change')
    rsY.show(36, False)
    s.writeTableToBQ(rsY, "overwrite", config['GCPVariables']['targetDataset'],
                     yearTable)
    print(f"""Created {yearTable}""")
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Code Example #4
def main():
    print(f"""Getting average yearly prices per region for all""")
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Getting Yearly percentages tables for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    # Get data from BigQuery table
    tableName = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    spark = s.setSparkConfBQ(spark)
    print("\nStarted at")
    uf.println(lst)
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['targetDataset'],
                                tableName)
    summary_df = read_df.select(
        col("Year"),
        col("percent_change").alias("PercentYearlyChange"))
    p_df = summary_df.toPandas()
    print(p_df)
    p_df.plot(kind='bar', stacked=False, x='Year', y=['PercentYearlyChange'])
    plt.xlabel("Year", fontdict=config['plot_fonts']['font'])
    plt.ylabel("Annual Percent Property Price change",
               fontdict=config['plot_fonts']['font'])
    plt.title(
        f"""Property price fluctuations in {regionname} for the past 10 years """,
        fontdict=config['plot_fonts']['font'])
    plt.margins(0.15)
    plt.subplots_adjust(bottom=0.25)
    plt.show()
    plt.close()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Code Example #5
from conf import parameters as v

appName = "ukhouseprices"
spark = s.spark_session(appName)
spark.sparkContext._conf.setAll(v.settings)
sc = s.sparkcontext()
#
# Get data from Hive table
regionname = "Kensington and Chelsea"
tableName="ukhouseprices"
fullyQualifiedTableName = v.DSDB+'.'+tableName
summaryTableName = v.DSDB+'.'+'summary'
start_date = "2010-01-01"
end_date = "2020-01-01"
lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
print("\nStarted at");uf.println(lst)
if (spark.sql(f"""SHOW TABLES IN {v.DSDB} like '{tableName}'""").count() == 1):
    spark.sql(f"""ANALYZE TABLE {fullyQualifiedTableName} compute statistics""")
    rows = spark.sql(f"""SELECT COUNT(1) FROM {fullyQualifiedTableName}""").collect()[0][0]
    print("Total number of rows is ",rows)
else:
    print(f"""No such table {fullyQualifiedTableName}""")
    sys.exit(1)
f"""
https://stackoverflow.com/questions/59278835/pyspark-how-to-write-dataframe-partition-by-year-month-day-hour-sub-directory
"""
wSpecY = Window().partitionBy(F.date_format('datetaken',"yyyy"))

house_df = spark.sql(f"""select * from {fullyQualifiedTableName} where regionname = '{regionname}'""")
rows = spark.sql(f"""SELECT COUNT(1) FROM {fullyQualifiedTableName} where regionname = '{regionname}'""").collect()[0][0]
print(f"Total number of rows for {regionname} is ", rows)
Code Example #6
spark = s.spark_session(appName)
spark.sparkContext._conf.setAll(v.settings)
sc = s.sparkcontext()
#
# Get data from Hive table
regionname = "Kensington and Chelsea"
tableName = "ukhouseprices"
fullyQualifiedTableName = v.DSDB + '.' + tableName
summaryTableName = v.DSDB + '.' + 'summary'
start_date = "201001"
end_date = "202001"
lst = (spark.sql(
    "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
       ).collect()
print("\nStarted at")
uf.println(lst)
# Model predictions
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
summary_df = spark.sql(
    f"""SELECT cast(date_format(datetaken, "yyyyMM") as int) as datetaken, flatprice, terracedprice, semidetachedprice, detachedprice FROM {summaryTableName}"""
)
df_10 = summary_df.filter(
    col("datetaken").between(f'{start_date}', f'{end_date}'))
print(df_10.toPandas().columns.tolist())
p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF

# Non-Linear Least-Squares Minimization and Curve Fitting

# Define model to be Lorentzian and deploy it
model = LorentzianModel()
n = len(p_dfm.columns)
Code Example #7
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Getting plots for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    #
    # Get data from BigQuery table
    summaryTableName = v.fullyQualifiedoutputTableId
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    # Model predictions
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # read data from the Bigquery table summary
    print("\nreading data from " + v.fullyQualifiedoutputTableId)

    summary_df = spark.read. \
                  format("bigquery"). \
                  option("credentialsFile",v.jsonKeyFile). \
                  option("project", v.projectId). \
                  option("parentProject", v.projectId). \
                  option("dataset", v.targetDataset). \
                  option("table", v.targetTable). \
        load()
    df_10 = summary_df.filter(F.col("Date").between(f'{start_date}', f'{end_date}')). \
        select(F.date_format('Date',"yyyyMM").cast("Integer").alias("date"), 'flatprice', 'terracedprice', 'semidetachedprice', 'detachedprice')
    df_10.printSchema()
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF
    # Non-Linear Least-Squares Minimization and Curve Fitting

    # Define model to be Lorentzian and deploy it
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if p_dfm.columns[i] != "date":  # yyyyMM is x axis in integer
            # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
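            # lmfit: guess() estimates starting parameters from the data, fit() runs the
            # non-linear least-squares fit and returns a ModelResult used for the plot and report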
            params = model.guess(p_dfm[vcolumn], x=p_dfm['date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['date'])
            result.plot_fit()
            plt.margins(0.15)
            plt.subplots_adjust(bottom=0.25)
            plt.xticks(rotation=90)
            plt.xlabel("year/month", fontdict=v.font)
            plt.text(0.35,
                     0.45,
                     "Best-fit based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey",
                     fontsize=9)
            plt.xlim(left=200900)
            plt.xlim(right=202100)
            if vcolumn == "flatprice": property = "Flat"
            if vcolumn == "terracedprice": property = "Terraced"
            if vcolumn == "semidetachedprice": property = "semi-detached"
            if vcolumn == "detachedprice": property = "detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=v.font)
            plt.title(
                f"""Monthly {property} prices fluctuations in {regionname}""",
                fontdict=v.font)
            print(result.fit_report())
            plt.show()
            plt.close()

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Code Example #8
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    spark = s.setSparkConfBQ(spark)
    # Get data from BigQuery table
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    # Model predictions
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['sourceDataset'],
                                config['GCPVariables']['sourceTable'])
    df_10 = read_df.filter(F.date_format('Date',"yyyyMM").cast("Integer").between(f'{start_date}', f'{end_date}') & (lower(col("regionname"))== f'{regionname}'.lower())). \
            select(F.date_format('Date',"yyyyMM").cast("Integer").alias("Date") \
                 , round(col("flatprice")).alias("flatprice") \
                 , round(col("terracedprice")).alias("terracedprice")
                 , round(col("semidetachedprice")).alias("semidetachedprice")
                 , round(col("detachedprice").alias("detachedprice")))
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF

    # Non-Linear Least-Squares Minimization and Curve Fitting
    # Define model to be Lorentzian and deploy it
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if (p_dfm.columns[i] != 'Date'):  # yyyyMM is x axis in integer
            # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
            params = model.guess(p_dfm[vcolumn], x=p_dfm['Date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['Date'])
            # plot the data points, initial fit and the best fit
            plt.plot(p_dfm['Date'], p_dfm[vcolumn], 'bo', label='data')
            plt.plot(p_dfm['Date'],
                     result.init_fit,
                     'k--',
                     label='initial fit')
            plt.plot(p_dfm['Date'], result.best_fit, 'r-', label='best fit')
            plt.legend(loc='upper left')
            plt.xlabel("Year/Month", fontdict=config['plot_fonts']['font'])
            plt.text(0.35,
                     0.55,
                     "Fit Based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey",
                     fontsize=9)
            if vcolumn == "flatprice": property = "Flat"
            if vcolumn == "terracedprice": property = "Terraced"
            if vcolumn == "semidetachedprice": property = "semi-detached"
            if vcolumn == "detachedprice": property = "detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=config['plot_fonts']['font'])
            plt.title(
                f"""Monthly {property} price fluctuations in {regionname}""",
                fontdict=config['plot_fonts']['font'])
            plt.xlim(200901, 202101)
            print(result.fit_report())
            plt.show()
            plt.close()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Code Example #9
def main():
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    print(
        f"""Reading from parquet file {config['ParquetVariables']['sourceSmall']}"""
    )
    # read from the source file
    currentSnapshot = spark.read.load(
        config['ParquetVariables']['sourceSmall'])
    currentSnapshot.printSchema()
    #currentSnapshot.show()
    print(f"""\nRows in source file is""", currentSnapshot.count())
    print(currentSnapshot.rdd.getStorageLevel())
    currentSnapshot = currentSnapshot.repartition(5)
    print(currentSnapshot.rdd.getStorageLevel())
    # read from delta files
    deltaFile = "gs://etcbucket/randomdata/staging/randomdatapy_208150201_208150210"
    newAddedDeltaFiles = spark.read.load(deltaFile)
    # check missing records with source file
    # find out IDs that do not exist in source
    newAddedDeltaFiles.createOrReplaceTempView("newAddedDeltaFiles")
    currentSnapshot.createOrReplaceTempView("currentSnapshot")
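    # the LEFT OUTER JOIN with "WHERE currentSnapshot.ID IS NULL" acts as an anti-join:
    # it keeps only delta rows whose ID is missing from the current snapshot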
    sqltext = """SELECT
                     newAddedDeltaFiles.ID
                   , newAddedDeltaFiles.CLUSTERED
                   , newAddedDeltaFiles.SCATTERED
                   , newAddedDeltaFiles.RANDOMISED
                   , newAddedDeltaFiles.RANDOM_STRING
                   , newAddedDeltaFiles.SMALL_VC
                   , newAddedDeltaFiles.PADDING 
                 FROM newAddedDeltaFiles 
                 LEFT OUTER JOIN currentSnapshot ON newAddedDeltaFiles.ID = currentSnapshot.ID
                 WHERE currentSnapshot.ID IS NULL ORDER BY newAddedDeltaFiles.ID"""
    print(f"""\nRows in deltafiles that do not exist in source file""",
          currentSnapshot.count())
    missingRows = spark.sql(sqltext)
    newSnapshot = currentSnapshot.union(missingRows)
    newSnapshot.orderBy(col("ID")).show(10000)
    sys.exit()
    #spark.sql(sqltext).write.mode(saveMode)
    print(
        f"""Writing to parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    newSnapshot.write.mode(config['ParquetVariables']['overwrite']).parquet(
        config['ParquetVariables']['targetLocation'])
    df3 = spark.read.load(config['ParquetVariables']['targetLocation'])
    print(
        f"""Reading from parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    print(f"""\nRows in target table is""", df3.count())
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Code Example #10
    def main():
        appName = "ukhouseprices"
        spark = s.spark_session(appName)
        spark.sparkContext._conf.setAll(v.settings)
        sc = s.sparkcontext()
        #
        # Get data from Hive table
        regionname = "Kensington and Chelsea"
        tableName = "ukhouseprices"
        fullyQualifiedTableName = v.DSDB + "." + tableName
        summaryTableName = v.DSDB + "." + "summary"
        start_date = "2010"
        end_date = "2020"
        lst = (spark.sql(
            "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
        )).collect()
        print("\nStarted at")
        uf.println(lst)
        # Model predictions
        spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        #summary_df = spark.sql(f"""SELECT cast(date_format(datetaken, "yyyyMM") as int) as datetaken, flatprice, terracedprice, semidetachedprice, detachedprice FROM {summaryTableName}""")
        summary_df = spark.sql(
            f"""SELECT cast(Year as int) as year, AVGFlatPricePerYear, AVGTerracedPricePerYear, AVGSemiDetachedPricePerYear, AVGDetachedPricePerYear FROM {v.DSDB}.yearlyhouseprices"""
        )
        df_10 = summary_df.filter(
            col("year").between(f'{start_date}', f'{end_date}'))
        print(df_10.toPandas().columns.tolist())

        # show pandas column list ['Year', 'AVGPricePerYear', 'AVGFlatPricePerYear', 'AVGTerracedPricePerYear', 'AVGSemiDetachedPricePerYear', 'AVGDetachedPricePerYear']
        p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF
        data = p_dfm.values

        # Non-Linear Least-Squares Minimization and Curve Fitting
        model = LorentzianModel()
        n = len(p_dfm.columns)
        for i in range(n):
            if p_dfm.columns[i] != 'year':  # year is x axis in integer
                # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
                vcolumn = p_dfm.columns[i]
                print(vcolumn)
                params = model.guess(p_dfm[vcolumn], x=p_dfm['year'])
                result = model.fit(p_dfm[vcolumn], params, x=p_dfm['year'])
                result.plot_fit()

                # do linear regression here
                # Prepare data for Machine Learning. We need two columns only: features and label (p_dfm.columns[i])
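                # VectorAssembler packs the input column(s) into a single 'features'
                # vector column, the input format Spark ML estimators expect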
                inputCols = ['year']
                vectorAssembler = VectorAssembler(inputCols=inputCols,
                                                  outputCol='features')
                vhouse_df = vectorAssembler.transform(df_10)
                vhouse_df = vhouse_df.select(
                    ['features', 'AVGFlatPricePerYear'])
                vhouse_df.show(20)
                if vcolumn == "AVGFlatPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("Flat house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""Flat price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.xlim(left=2009)
                    plt.xlim(right=2022)
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGTerracedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("Terraced house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""Terraced house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGSemiDetachedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("semi-detached house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""semi-detached house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGDetachedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("detached house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""detached house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()

        p_df = df_10.select('AVGFlatPricePerYear', 'AVGTerracedPricePerYear',
                            'AVGSemiDetachedPricePerYear',
                            'AVGDetachedPricePerYear').toPandas().describe()
        print(p_df)
        #axs = scatter_matrix(p_df, figsize=(10, 10))
        # describe() returns a DataFrame where count, mean, min, std, max ... are values of the index
        y = p_df.loc[['min', 'mean', 'max']]
        #y = p_df.loc[['averageprice', 'flatprice']]
        ax = y.plot(linewidth=2, colormap='jet', marker='.', markersize=20)
        plt.grid(True)
        plt.xlabel("UK House Price Index, January 2020", fontdict=v.font)
        plt.ylabel("Property Prices in millions/GBP", fontdict=v.font)
        plt.title(
            f"""Property price fluctuations in {regionname} for the past 10 years """,
            fontdict=v.font)
        plt.legend(p_df.columns)
        plt.show()
        plt.close()
        lst = (spark.sql(
            "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
        )).collect()
        print("\nFinished at")
        uf.println(lst)
Code Example #11
def main():
    appName = "DS"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)

    tmp_bucket = "tmp_storage_bucket/tmp"

    # Set the temporary storage location
    spark.conf.set("temporaryGcsBucket", v.tmp_bucket)
    spark.sparkContext.setLogLevel("ERROR")

    HadoopConf = sc._jsc.hadoopConfiguration()
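    # these Hadoop settings register the GCS connector filesystem classes so that
    # Spark can address gs:// paths through the Hadoop FileSystem API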
    HadoopConf.set("fs.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    HadoopConf.set("fs.AbstractFileSystem.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

    # needed filters

    start_date = "2010-01-01"
    end_date = "2020-01-01"

    spark.conf.set("GcpJsonKeyFile", v.jsonKeyFile)
    spark.conf.set("BigQueryProjectId", v.projectId)
    spark.conf.set("BigQueryDatasetLocation", v.datasetLocation)
    spark.conf.set("google.cloud.auth.service.account.enable", "true")
    spark.conf.set("fs.gs.project.id", v.projectId)
    spark.conf.set("fs.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    spark.conf.set("fs.AbstractFileSystem.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    spark.conf.set("temporaryGcsBucket", v.tmp_bucket)

    sqltext = ""
    from pyspark.sql.window import Window

    # read data from the Bigquery table in staging area
    print("\nreading data from " + v.projectId + ":" + v.inputTable)

    read_df = spark.read. \
                  format("bigquery"). \
                  option("credentialsFile",v.jsonKeyFile). \
                  option("project", v.projectId). \
                  option("parentProject", v.projectId). \
                  option("dataset", v.targetDataset). \
                  option("table", v.targetTable). \
                  option("temporaryGcsBucket", v.tmp_bucket). \
        load()
    regionname = "Kensington and Chelsea"  # region reported on below
    summary_df = read_df.filter(
        (col("Year").between(f'{start_date}', f'{end_date}'))
        & (lower(col("regionname")) == f'{regionname}'.lower()))
    summary_df.printSchema()
    rows = summary_df.count()
    print("Total number of rows for Kensington and Chelsea is ", rows)
    wSpecY = Window().partitionBy(F.date_format('date', "yyyy"))
    df2 = summary_df. \
                    select( \
                          F.date_format(F.col("date"),'yyyy').alias('Year') \
                        , F.round(F.avg(F.col("averageprice")).over(wSpecY)).alias('AVGPricePerYear') \
                        , F.round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear') \
                        , F.round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear') \
                        , F.round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear') \
                        , F.round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')). \
                    distinct().orderBy('Year', ascending=True)
    df2.show(10, False)
    # Save the result set to a BigQuery table. Table is created if it does not exist
    print(f"""\nsaving data to {v.DSDB}.yearlyhouseprices""")
    df2. \
        write. \
        format("bigquery"). \
        option("temporaryGcsBucket", v.tmp_bucket).\
        mode("overwrite"). \
        option("table", "DS.yearlyhouseprices"). \
        save()
    """
    summary_df. \
    write. \
    format("bigquery"). \
    mode("overwrite"). \
    option("table", v.fullyQualifiedoutputTableId). \
    option("temporaryGcsBucket", v.tmp_bucket). \
    save()
    """

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)