def hasTable(self, sqlc):
    # Check if the table exists
    # There must be a better way than this
    pg = postgres.PostgresConnector()
    qr = "(SELECT 1 FROM pg_tables WHERE tablename='" + self.tableName + "') AS wtf"
    df = pg.read(sqlc, table=qr)
    return df.count() == 1
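# A hedged alternative to the check above (a sketch, not this repo's code):
# Postgres can answer the existence question directly with to_regclass(),
# which returns NULL when the relation does not exist, so no scan of
# pg_tables is needed. The call below assumes the same
# pg.read(sqlc, table=...) API used throughout these snippets.
def hasTableViaRegclass(self, sqlc):
    pg = postgres.PostgresConnector()
    qr = ("(SELECT to_regclass('" + self.tableName + "') IS NOT NULL"
          " AS table_exists) AS t")
    df = pg.read(sqlc, table=qr)
    return df.first().table_exists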
def writeResults(self, prefix):
    pgTableName = prefix + self.year + '_' + self.month + '_' + 'st' + str(self.station)
    conn = postgres.PostgresConnector()
    conn.write(self.hr, pgTableName, glb('pgWriteMode'), db='taxi_aggregates')
def readTable(self, sqlc):
    # To speed things up, define partition scheme here:
    mStart = self.year + '-' + self.month + '-01 00:00:00'
    # First day of the following month; integer division (//) keeps the year
    # an int under Python 3 and only rolls it over for December
    mEnd = str(int(self.year) + int(self.month) // 12) + '-' + \
        str((int(self.month) + 1) % 12).zfill(2).replace('00', '12') + '-01 00:00:00'
    lb = dtt.strToTimeStamp(mStart) - 3600 * 24  # from 1 day prior
    ub = dtt.strToTimeStamp(mEnd) + 3600 * 24    # up to 1 day after
    pg = postgres.PostgresConnector()
    qr = "(SELECT * FROM " + self.tableName + ") AS wtf"
    self.df = pg.read(sqlc, table=qr, numPartitions=32, column="pUTimeStamp",
                      lowerBound=lb, upperBound=ub)
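# For reference, a sketch of the plain Spark call a partitioned read like the
# one above presumably boils down to. The JDBC URL and credentials here are
# placeholders, not values from this code. Note that lowerBound/upperBound
# only shape the partition strides over `column`; they do not filter rows.
df = sqlc.read.jdbc(
    url="jdbc:postgresql://localhost:5432/taxi",
    table="(SELECT * FROM some_table) AS t",
    column="pUTimeStamp",
    lowerBound=lb, upperBound=ub,
    numPartitions=32,
    properties={"user": "postgres", "password": "postgres"})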
def writeToPostgres(self, prefix):
    # There are two different kinds of schemas:
    # Prior to 2017, use coordinates; after, location ID
    if 'pULocId' in self.ylwTaxi.columns:
        keepCols = glb('pgKeepCols1')
    else:
        keepCols = glb('pgKeepCols2')
    dropCols = [clm for clm in self.ylwTaxi.columns if clm not in keepCols]
    for clm in dropCols:
        self.ylwTaxi = self.ylwTaxi.drop(clm)
    self.ylwTaxi = self.ylwTaxi.select(keepCols)
    self.pgTableName = prefix + '_' + self.year + '_' + self.month
    connector = postgres.PostgresConnector()
    connector.write(self.ylwTaxi, self.pgTableName, glb('pgWriteMode'))
def WriteTables(self, processed_df):
    # Get list of libraries from S3 for which you want activity trends
    libinfo_df = self.spark.read.csv(
        "s3a://gauravdatabeamdata/LibraryInfo.csv", header=True, multiLine=True)
    libraries_list = libinfo_df.select(libinfo_df.Libraries).collect()
    liblist = []
    for row in libraries_list:
        liblist.append(str(row.Libraries))
    collocatedlibs = GetCollocatedLibraries()
    libs_indandcoll = collocatedlibs.GetLibraryPairs(liblist)
    print("Getting Postgres connector..............................")
    connector = postgres.PostgresConnector()
    for item in libs_indandcoll:
        print(item + '....................................................')
    for lib_ind_pair in libs_indandcoll:
        # Pick out libraries which exist in the processed dataframe
        lib_df = processed_df.where(
            processed_df.library == lib_ind_pair).select(
            "datetime", "lib_counts")
        # Save datetime (year-month), lib_counts (users) in a table per library
        if len(lib_df.head(1)) > 0:
            print("Saving table %s into Postgres........................"
                  % lib_ind_pair)
            self.write_to_postgres(lib_df, lib_ind_pair, connector)
        else:
            continue
def process_stream(self, rdd):
    if rdd.isEmpty():
        print("no incoming data")
    else:
        # Convert the data from RDD to a dataframe
        data_frame = rdd.toDF().cache()
        # UDF: mean of an array
        array_mean = udf(lambda x: float(np.mean(x)), FloatType())
        # UDF: square of each element in an array
        def square_list(array_list):
            return [float(val) ** 2 for val in array_list]
        square_list_udf = udf(lambda y: square_list(y), ArrayType(FloatType()))
        # Add the squared-axis and per-axis mean columns to the dataframe
        df_square = data_frame.select('*',
                                      square_list_udf('x').alias("sq_x"),
                                      square_list_udf('y').alias("sq_y"),
                                      square_list_udf('z').alias("sq_z"))
        df_average = df_square.select("*",
                                      array_mean("sq_x").alias("avg_x"),
                                      array_mean("sq_y").alias("avg_y"),
                                      array_mean("sq_z").alias("avg_z"))
        # Calculate the gal value used for earthquake prediction
        final_df = df_average.select("*", pow(col("avg_x") + col("avg_y") +
                                              col("avg_z"), 0.5).alias("gal"))
        # Write the data to Postgres
        try:
            connector = postgres.PostgresConnector(
                "ec2-18-232-24-132.compute-1.amazonaws.com",
                "earthquake", "postgres", "nidheesh")
            connector.write(final_df, "Ereadings", "append")
        except Exception as error:
            print(error)
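# A hedged aside (requires Spark 3.1+, not this repo's code): the two Python
# UDFs above can be replaced by Spark's built-in higher-order aggregate(),
# which keeps the mean-of-squares computation inside the JVM and avoids the
# serialization cost of Python UDFs. Column names match the snippet above.
from pyspark.sql import functions as F

def mean_of_squares(name):
    c = F.col(name)
    return F.aggregate(c, F.lit(0.0),
                       lambda acc, v: acc + v * v,
                       lambda acc: acc / F.size(c))

df_average = data_frame.select("*",
                               mean_of_squares("x").alias("avg_x"),
                               mean_of_squares("y").alias("avg_y"),
                               mean_of_squares("z").alias("avg_z"))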
def write_events_to_db(self, df):
    table = 'airflow_events'
    mode = 'append'
    connector = postgres.PostgresConnector()
    connector.write_to_db(df, table, mode)
def write_events_to_db(self, df):
    table = 'safety_score'
    mode = 'append'
    connector = postgres.PostgresConnector()
    connector.write_to_db(df, table, mode)
def write_to_postgres(out_df, table=table_name):
    """Write output dataframe to Postgres (defaults to the module-level table_name)."""
    mode = "append"
    connector = postgres.PostgresConnector()
    connector.write(out_df, table, mode)
def __init__(self):
    self.pgres_connector = postgres.PostgresConnector()
    self.spark = SparkSession \
        .builder \
        .appName("plops_streaming") \
        .getOrCreate()
def write_to_postgres(self, out_df): table = "spark_out_hist_occupancy" mode = "append" connector = postgres.PostgresConnector() connector.write(out_df, table, mode)