# Imports required by this snippet (the module-level constants INFLUX_URL,
# S3_BUCKET_FOR_BOOKMARK, BOOKMARK_FILE, and TABLE1/TABLE2/TABLE3 are assumed
# to be defined elsewhere in the script):
import datetime
import json

import boto3
import requests
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from pyspark.sql.functions import date_format
from pyspark.sql.types import StringType, StructField, StructType


def get_influx_dataframe(table, influx_url=INFLUX_URL,
                         s3_bucket_for_bookmark=S3_BUCKET_FOR_BOOKMARK,
                         bookmark_file=BOOKMARK_FILE):
    """
    Reads a table from the Influx URL and returns it as a PySpark DataFrame.

    Arguments
    ---------
        table (str): Table to be retrieved from the Influx database
        influx_url (str): URL of the Influx database
        s3_bucket_for_bookmark (str): Name of the S3 bucket containing the bookmark
        bookmark_file (str): Location of the bookmark file in the S3 bucket

    Returns
    -------
        new_df (pyspark.sql.DataFrame): Influx table retrieved as a
            pyspark.sql.DataFrame with `time` cast to a timestamp
    """
    # Read the bookmark (the timestamp of the last processed insert) from S3
    s3 = boto3.resource('s3')
    obj = s3.Object(s3_bucket_for_bookmark, bookmark_file)
    insert_into_timestamp = obj.get()['Body'].read().decode('utf-8')
    print("Inserted into timestamp: " + insert_into_timestamp)

    # Convert insert_into_timestamp into datetime format.
    # NOTE: Python's `datetime` module only supports microsecond precision, so
    # insert_into_timestamp is trimmed from nanoseconds to microseconds before
    # being parsed by datetime.strptime()
    temp = insert_into_timestamp[:-4]
    datetime_insert_timestamp = datetime.datetime.strptime(temp, '%Y-%m-%dT%H:%M:%S.%f')

    glueContext = GlueContext(SparkContext.getOrCreate())

    # TABLE1 and TABLE2 share the same event filter; TABLE3 is pulled unfiltered
    if table in (TABLE1, TABLE2):
        query = (
            f"SELECT * FROM {table} WHERE event =~ "
            "/card_viewed|card_created|card_marked_as_complete|"
            "channel_followed|group_user_added|card_assigned/ "
            "and time > now() - 3d and time < now() - 1d"
        )
    elif table == TABLE3:
        query = f"SELECT * FROM {table} WHERE time > now() - 3d and time < now() - 1d"

    params = {"pretty": "false", "q": query}
    # params = {"pretty": "false", "q": "SELECT * FROM "+influxTable+" WHERE time >'"+last_processed_timestamp+"' order by time desc"}
    r = requests.get(influx_url, params=params)
    data = json.loads(r.text)

    # Retrieving data column names and values from the JSON response
    values = data["results"][0]["series"][0]["values"]
    columns = data["results"][0]["series"][0]["columns"]

    # Data is inserted after the timestamp of the first value row
    inserted_into_timestamp = values[0][0]

    # Defining the schema for the data
    column_structfields = [StructField(column, StringType(), True) for column in columns]
    schema = StructType(column_structfields)

    # Creating a new DataFrame with the above-defined schema
    df = glueContext.createDataFrame(values, schema)

    # Casting 'time' in 'df' to 'timestamp' format
    new_df = df.withColumn("time", df["time"].cast("timestamp"))
    # 'HH' is the 24-hour clock; 'hh' (12-hour) would be ambiguous without an
    # AM/PM marker
    new_df = new_df.withColumn("time_string", date_format(new_df.time, "yyyy-MM-dd HH:mm:ss"))
    return new_df
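
# Example invocation (a sketch): TABLE1 and the S3/Influx constants referenced
# by the default arguments are assumed to be defined at the top of the script.
events_df = get_influx_dataframe(TABLE1)
events_df.printSchema()
events_df.show(5, truncate=False)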
job.init(args['JOB_NAME'], args)

### CODE STARTS HERE ###

import json

# Load the contents of the JSON file taken from "Referenced files path":
with open('areas.json', 'r') as inputfile:
    filejson = json.load(inputfile)

# Create a Spark DataFrame from the first ten (id, name) records in the JSON file:
df = glueContext.createDataFrame(
    [(record["id"], record["name"]) for record in filejson[:10]],
    ['id', 'name']
)

# Check 'df' contents:
df.printSchema()
df.show()

# Convert 'df' to a Glue DynamicFrame:
dyf = DynamicFrame.fromDF(df, glueContext, "dftodyf")

# Repartition to 12 to achieve maximum parallelization based on my DPU config:
dyf_rep = dyf.repartition(12)
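
# Optional sanity check (a sketch): convert the DynamicFrame back to a Spark
# DataFrame and confirm the repartition took effect.
print(dyf_rep.toDF().rdd.getNumPartitions())  # expected: 12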
StructField("steve_ross", LongType()), StructField("structure", LongType()), StructField("sun", LongType()), StructField("tomb_frame", LongType()), StructField("tree", LongType()), StructField("trees", LongType()), StructField("triple_frame", LongType()), StructField("waterfall", LongType()), StructField("waves", LongType()), StructField("windmill", LongType()), StructField("window_frame", LongType()), StructField("winter", LongType()), StructField("wood_framed", LongType()) ]) paintings_data_frame = glueContext.createDataFrame(paintings_source, schema = paintings_schema) paintings = DynamicFrame.fromDF(paintings_data_frame, glueContext, 'dyf') # Cast all "bit" fields (LongTypes) into booleans # It's easier to use a list of non-bit fields as the majority of fields imported are bit fields non_bit_fields = ["episode", "title"] bit_fields_specs = [ (field.name, "cast:boolean") for field in paintings.schema() if field.name not in non_bit_fields and field.dataType.typeName() == 'long' # Type-check to provide accidentally casting a non-bit column if not in "non_bit_fields" ] paintings_with_bool_fields = ResolveChoice.apply(paintings, specs = bit_fields_specs) # Parse and clean up the season, episode, and episode text fields def normalize_episode_fields(record): # Parse the season and episode numbers