def SendToBigQuery(df, batchId):
    if len(df.take(1)) > 0:
        # df.printSchema()
        df.persist()
        spark_session = s.spark_session(config['common']['appName'])
        spark_session = s.setSparkConfBQ(spark_session)
        # read the reference data from BigQuery (a Redis read is kept below as a commented-out alternative)
        read_df = s.loadTableFromBQ(spark_session,
                                    config['MDVariables']['targetDataset'],
                                    config['MDVariables']['targetTable'])
        # read_df = s.loadTableFromRedis(spark_session, config['RedisVariables']['targetTable'], config['RedisVariables']['keyColumn'])
        # look for high-value tickers and write them to config['MDVariables']['targetTable'] in BigQuery
        for row in df.rdd.collect():
            rowkey = row.rowkey
            ticker = row.ticker
            price = row.price
            values = bigQueryAverages(ticker, price, read_df)
            Average = values["average"]
            standardDeviation = values["standardDeviation"]
            lower = values["lower"]
            upper = values["upper"]
            if lower is not None and upper is not None:
                hvTicker = priceComparison(ticker, price, lower, upper)
                if hvTicker == 1:
                    writeHighValueData(df, rowkey)
        df.unpersist()
    else:
        print("DataFrame is empty")
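# `bigQueryAverages`, `priceComparison` and `writeHighValueData` are defined
# elsewhere and not shown in this section. A minimal sketch of the first two,
# assuming read_df carries 'ticker' and 'price' columns and that a "high value"
# price is one falling outside a two-standard-deviation band around the
# historical mean (the band width is an assumption):
from pyspark.sql import functions as F

def bigQueryAverages(ticker, price, read_df):
    # aggregate the historical mean and stddev of price for this ticker
    stats = read_df.filter(F.col("ticker") == ticker) \
                   .agg(F.avg("price").alias("average"),
                        F.stddev("price").alias("standardDeviation")) \
                   .collect()[0]
    average, sd = stats["average"], stats["standardDeviation"]
    if average is None or sd is None:  # no history for this ticker yet
        return {"average": None, "standardDeviation": None,
                "lower": None, "upper": None}
    return {"average": average, "standardDeviation": sd,
            "lower": average - 2 * sd, "upper": average + 2 * sd}

def priceComparison(ticker, price, lower, upper):
    # flag the ticker when the current price falls outside the band
    return 1 if (price < lower or price > upper) else 0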
def main():
    print(f"""Getting average yearly prices per region for all""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at")
    uf.println(lst)
    wSpecY = Window().partitionBy(F.date_format('Date', "yyyy"), 'regionname')
    house_df = s.loadTableFromBQ(spark,
                                 config['GCPVariables']['sourceDataset'],
                                 config['GCPVariables']['sourceTable'])
    house_df.printSchema()
    house_df.show(2, False)
    print(f"""\nAnnual house prices per region in GBP""")
    # work out the yearly average prices
    df2 = house_df.select(
          F.date_format('Date', 'yyyy').cast("Integer").alias('year')
        , 'regionname'
        , round(F.avg('averageprice').over(wSpecY)).alias('AVGPricePerYear')
        , round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear')
        , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear')
        , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear')
        , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')
    ).distinct().orderBy('year', ascending=True)  # 'Date' is aliased to 'year' above, so order by 'year'
    df2.show(20, False)
    s.writeTableToBQ(df2, "overwrite",
                     config['GCPVariables']['targetDataset'],
                     config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
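# `s.writeTableToBQ` and `s.loadTableFromBQ` are thin wrappers in the shared
# module and are not shown in this section. A plausible sketch of the write
# side, assuming the spark-bigquery connector and a staging bucket held in
# config (the 'tmp_bucket' key is an assumption):
def writeTableToBQ(dataFrame, mode, targetDataset, targetTable):
    # the indirect write path stages the data in a GCS bucket before loading it
    dataFrame.write.format("bigquery") \
        .mode(mode) \
        .option("temporaryGcsBucket", config['GCPVariables']['tmp_bucket']) \
        .option("table", f"{targetDataset}.{targetTable}") \
        .save()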
def loadIntoBQTable(self, df2):
    # write to the BigQuery table
    s.writeTableToBQ(df2, "overwrite",
                     config['GCPVariables']['targetDataset'],
                     config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    # read the data back to ensure it all loaded OK
    read_df = s.loadTableFromBQ(self.spark,
                                config['GCPVariables']['targetDataset'],
                                config['GCPVariables']['yearlyAveragePricesAllTable'])
    # check that all rows are there
    if df2.subtract(read_df).count() == 0:
        print("Data has been loaded OK to BQ table")
    else:
        print("Data could not be loaded to BQ table, quitting")
        sys.exit(1)
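# The matching read-side wrapper might look like the sketch below. Note that
# df2.subtract(read_df) above is a set difference on rows, so the check
# tolerates row reordering but catches missing or altered values.
def loadTableFromBQ(spark, dataset, tableName):
    # read a BigQuery table back into a DataFrame via the connector
    return spark.read.format("bigquery") \
        .option("table", f"{dataset}.{tableName}") \
        .load()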
def main():
    regionname = sys.argv[1]  # parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Creating yearly percentage tables for {regionname}""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    # get data from the BigQuery table
    start_date = "2010"
    end_date = "2020"
    yearTable = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at")
    uf.println(lst)
    spark = s.setSparkConfBQ(spark)
    read_df = s.loadTableFromBQ(spark,
                                config['GCPVariables']['targetDataset'],
                                config['GCPVariables']['yearlyAveragePricesAllTable'])
    house_df = read_df.filter(
        (col("Year").between(start_date, end_date)) &
        (lower(col("regionname")) == regionname.lower()))
    wSpecPY = Window().orderBy('regionname', 'Year')
    df_lagY = house_df.withColumn(
        "prev_year_value",
        F.lag(house_df['AVGPricePerYear']).over(wSpecPY))
    resultY = df_lagY.withColumn(
        'percent_change',
        F.when(F.isnull(house_df.AVGPricePerYear - df_lagY.prev_year_value), 0)
         .otherwise(F.round(((house_df.AVGPricePerYear - df_lagY.prev_year_value) * 100.)
                            / df_lagY.prev_year_value, 1)))
    print(f"""\nYearly house price changes in {regionname} in GBP""")
    rsY = resultY.select('Year', 'AVGPricePerYear', 'prev_year_value', 'percent_change')
    rsY.show(36, False)
    s.writeTableToBQ(rsY, "overwrite",
                     config['GCPVariables']['targetDataset'], yearTable)
    print(f"""Created {yearTable}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
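# The F.lag window pattern above is the heart of the percent-change
# calculation. A self-contained toy example with synthetic, illustrative
# numbers showing the same construction:
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.appName("lagDemo").getOrCreate()
toy = spark.createDataFrame(
    [("Kensington", 2018, 1200000.0),
     ("Kensington", 2019, 1260000.0),
     ("Kensington", 2020, 1197000.0)],
    ["regionname", "Year", "AVGPricePerYear"])
w = Window.orderBy('regionname', 'Year')
lagged = toy.withColumn("prev_year_value",
                        F.lag("AVGPricePerYear").over(w))
result = lagged.withColumn(
    "percent_change",
    F.when(F.isnull(F.col("AVGPricePerYear") - F.col("prev_year_value")), 0)
     .otherwise(F.round((F.col("AVGPricePerYear") - F.col("prev_year_value")) * 100.0
                        / F.col("prev_year_value"), 1)))
result.show()  # the first year has no predecessor, so 0; then +5.0 and -5.0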
def main():
    regionname = sys.argv[1]  # parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Getting yearly percentage tables for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    # get data from the BigQuery table
    tableName = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    spark = s.setSparkConfBQ(spark)
    print("\nStarted at")
    uf.println(lst)
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['targetDataset'], tableName)
    summary_df = read_df.select(
        col("Year"), col("percent_change").alias("PercentYearlyChange"))
    p_df = summary_df.toPandas()
    print(p_df)
    p_df.plot(kind='bar', stacked=False, x='Year', y=['PercentYearlyChange'])
    plt.xlabel("Year", fontdict=config['plot_fonts']['font'])
    plt.ylabel("Annual Percent Property Price change",
               fontdict=config['plot_fonts']['font'])
    plt.title(f"""Property price fluctuations in {regionname} for the past 10 years """,
              fontdict=config['plot_fonts']['font'])
    plt.margins(0.15)
    plt.subplots_adjust(bottom=0.25)
    plt.show()
    plt.close()
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
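# config['plot_fonts']['font'] is read from the project config file. Its exact
# contents are not shown here; a plausible shape, since matplotlib's fontdict
# accepts these keys (the values below are illustrative assumptions):
plot_fonts_example = {
    'font': {
        'family': 'serif',
        'color': 'darkred',
        'weight': 'normal',
        'size': 14,
    }
}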
def main():
    regionname = sys.argv[1]  # parameter passed
    short = regionname.replace(" ", "").lower()
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    spark = s.setSparkConfBQ(spark)
    # get data from the BigQuery table
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at")
    uf.println(lst)
    # model predictions
    read_df = s.loadTableFromBQ(spark,
                                config['GCPVariables']['sourceDataset'],
                                config['GCPVariables']['sourceTable'])
    df_10 = read_df.filter(
        F.date_format('Date', "yyyyMM").cast("Integer").between(start_date, end_date) &
        (lower(col("regionname")) == regionname.lower())
    ).select(
          F.date_format('Date', "yyyyMM").cast("Integer").alias("Date")
        , round(col("flatprice")).alias("flatprice")
        , round(col("terracedprice")).alias("terracedprice")
        , round(col("semidetachedprice")).alias("semidetachedprice")
        , round(col("detachedprice")).alias("detachedprice"))
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # convert the Spark DF to a Pandas DF
    # Non-Linear Least-Squares Minimization and Curve Fitting.
    # Define the model to be Lorentzian and deploy it.
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if p_dfm.columns[i] != 'Date':  # yyyyMM, as an integer, is the x axis
            # loop through the price columns, plot each average curve in turn,
            # then print a fit report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
            params = model.guess(p_dfm[vcolumn], x=p_dfm['Date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['Date'])
            # plot the data points, the initial fit and the best fit
            plt.plot(p_dfm['Date'], p_dfm[vcolumn], 'bo', label='data')
            plt.plot(p_dfm['Date'], result.init_fit, 'k--', label='initial fit')
            plt.plot(p_dfm['Date'], result.best_fit, 'r-', label='best fit')
            plt.legend(loc='upper left')
            plt.xlabel("Year/Month", fontdict=config['plot_fonts']['font'])
            plt.text(0.35, 0.55,
                     "Fit Based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey", fontsize=9)
            if vcolumn == "flatprice": property = "Flat"
            if vcolumn == "terracedprice": property = "Terraced"
            if vcolumn == "semidetachedprice": property = "Semi-detached"
            if vcolumn == "detachedprice": property = "Detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=config['plot_fonts']['font'])
            plt.title(f"""Monthly {property} price fluctuations in {regionname}""",
                      fontdict=config['plot_fonts']['font'])
            plt.xlim(200901, 202101)
            print(result.fit_report())
            plt.show()
            plt.close()
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at")
    uf.println(lst)
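# LorentzianModel comes from lmfit (lmfit.models). Stripped of the Spark and
# plotting scaffolding, the guess/fit/report cycle used above looks like this
# on synthetic data:
import numpy as np
from lmfit.models import LorentzianModel

x = np.linspace(-10, 10, 201)
# a noisy Lorentzian peak centred at 1.5
y = 3.0 / (1 + ((x - 1.5) / 2.0) ** 2) + np.random.normal(0, 0.05, x.size)

model = LorentzianModel()
params = model.guess(y, x=x)        # initial parameter estimates from the data
result = model.fit(y, params, x=x)  # non-linear least-squares fit
print(result.fit_report())          # amplitude, center, sigma, fit statistics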