def execute(self, context):
    logging.info('Executing: ' + str(self.sql))
    src_pg = PostgresHook(postgres_conn_id=self.src_postgres_conn_id)
    dest_pg = PostgresHook(postgres_conn_id=self.dest_postgres_conn_id)
    logging.info(
        "Transferring Postgres query results into other Postgres database.")
    conn = src_pg.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql, self.parameters)
    if self.pg_preoperator:
        logging.info("Running Postgres preoperator")
        dest_pg.run(self.pg_preoperator)
    logging.info("Inserting rows into Postgres")
    dest_pg.insert_rows(table=self.pg_table, rows=cursor)
    if self.pg_postoperator:
        logging.info("Running Postgres postoperator")
        dest_pg.run(self.pg_postoperator)
    logging.info("Done.")
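# The snippet above is the execute() method of a custom Postgres-to-Postgres
# transfer operator. A minimal sketch of how such an operator might be wired
# into a DAG follows; the class name PostgresToPostgresOperator, the DAG id,
# the connection ids, and the table/SQL values are assumptions for
# illustration, not taken from the original source.
from datetime import datetime

from airflow import DAG

with DAG(dag_id="pg_to_pg_transfer", start_date=datetime(2024, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:
    transfer = PostgresToPostgresOperator(  # hypothetical operator class
        task_id="transfer_orders",
        sql="SELECT * FROM orders WHERE updated_at >= %(since)s",
        parameters={"since": "{{ ds }}"},
        src_postgres_conn_id="postgres_src",    # assumed connection id
        dest_postgres_conn_id="postgres_dest",  # assumed connection id
        pg_table="orders_replica",
        pg_preoperator="TRUNCATE orders_replica",
        pg_postoperator="ANALYZE orders_replica",
    )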
def execute(self, context):
    # AWS hook (credentials currently unused in this path)
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()

    # Redshift hook
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # Get the current number of records in the table
    records = redshift.get_records(
        f"SELECT COUNT(*) FROM {self.table_name}")
    data_rows = redshift.get_records(self.sql)

    if self.load_mode == "clean":
        # Clear existing data
        self.log.info(f"Clearing data from {self.table_name} table")
        redshift.run("DELETE FROM {}".format(self.table_name))
        self.log.info(
            f"Deleted {records[0][0]} records from {self.table_name}")

    # Populate the table
    self.log.info("Populating data to {} table".format(self.table_name))
    redshift.insert_rows(table=self.table_name,
                         rows=data_rows,
                         commit_every=1000,
                         replace=False)
    self.log.info("Inserted {} records to {}".format(
        len(data_rows), self.table_name))
def add_execution_in_database(
        table, data=None, connection_id="postgres_report_connection"):
    """Dynamically registers information in a PostgreSQL database."""
    if not data:
        logging.info(
            "Cannot insert `empty data` into the database. "
            "Please verify your data attributes.")
        return
    data = dict(data)
    hook = PostgresHook(postgres_conn_id=connection_id)
    try:
        hook.get_conn()
    except AirflowException:
        logging.info("Cannot insert data. Connection '%s' is not configured.",
                     connection_id)
        return
    if data.get("payload"):
        data["payload"] = json.dumps(data["payload"])
    columns = list(data.keys())
    values = list(data.values())
    try:
        hook.insert_rows(table, [values], target_fields=columns)
    except (AirflowException, ProgrammingError) as exc:
        logging.error(exc)
    else:
        logging.info("Registering `%s` into '%s' table.", data, table)
def predictions(filepath, table_name, test=True, **kwargs):
    df = preprocess(filepath)
    y_test = df["churn"]
    features = kwargs["ti"].xcom_pull(key="features", task_ids="train")
    with open(f"scaler_{kwargs.get('ds')}.pickle", 'rb') as f:
        scaler = load(f)
    df[['monthly_charges', 'extra_charges']] = scaler.transform(
        df[['monthly_charges', 'extra_charges']])
    with open(f"log_model_{kwargs.get('ds')}.pickle", 'rb') as f:
        log = load(f)
    pred = log.predict(df[features])
    hook = PostgresHook(postgres_conn_id=POSTGRES_CONN)
    if test:
        report = classification_report(y_test, pred, output_dict=True)
        hook.insert_rows(table_name, rows=[(
            datetime.now(),
            report["accuracy"],
            report["macro avg"]["recall"],
            report["macro avg"]["precision"],
            report["macro avg"]["f1-score"],
        )])
    else:
        rows = df[["customer_id", "churn"]]
        rows["churn"] = pred.astype(bool)
        hook.insert_rows(table_name, rows=rows.values.tolist())
def copy_data_in_batch(target_base_table, fetch_iterator):
    masterdb_hook = PostgresHook()
    items = []

    # Set up the SQL insert values to be executed
    for row in fetch_iterator:
        items.append([column_value for column_value in row])

    result = len(items)

    # Execute the SQL insert statement.
    # The Airflow PostgresHook.insert_rows instance method is used to
    # "executemany" the SQL query, which also serializes the data to a
    # safe SQL format.
    if result:
        try:
            masterdb_hook.insert_rows(target_base_table,
                                      items,
                                      target_fields=None,
                                      commit_every=1000,
                                      replace=False)
        except Exception as e:
            raise Exception("Failed to insert batch data: {}".format(
                str(e)[0:150]))

    return result
def extract_airtable_shows():
    '''
    Load shows from Airtable into a Postgres table.

    Airtable shows are identified by their IMDB ID. This task stores
    the IDs and ratings only.
    '''
    hook = PostgresHook(postgres_conn_id='postgres_movies')
    base = airtable.Airtable(base_id, api_key)
    ratings = []
    for record in base.iterate(shows_table):
        fields = record['fields']
        rating = fields.get('Mat Rating')
        imdb_url = fields.get('IMDB')
        if rating is not None and validate_imdb_url(imdb_url):
            ratings.append((imdb_url, rating))
    logger.info('Rated %d movies', len(ratings))
    hook.insert_rows('my_ratings', rows=ratings,
                     target_fields=('imdb_url', 'stars'))
def upload_to_postgres(*args, **kwargs):
    ti = kwargs['ti']
    file_date, data = ti.xcom_pull(task_ids="get_data_from_api")
    file_date = file_date.strftime("%Y-%m-%d %H:%M:%S%z")
    data = [extract_content_id(i) for i in list(data)]
    # DbApiHook.insert_rows
    pg_hook = PostgresHook(postgres_conn_id='postgres_conn',
                           schema='seoulbike')
    rows = [
        [st_data['stationName'], st_data['rackTotCnt'],
         st_data['parkingBikeTotCnt'], file_date]
        for st_data in data
    ]
    pg_hook.insert_rows(table='bike_realtime_log_tz', rows=rows)  # autocommit
    pg_hook.get_conn().close()
def execute(self, context):
    pg_hook = PostgresHook(postgres_conn_id=self.connection_id)
    api_hook = Covid19Hook(http_conn_id='covid19-api', method='GET')
    pg_hook.run("DELETE FROM covid19 WHERE day = %s",
                parameters=[context['ds']])
    rows = map(
        lambda x: (json.dumps(x[0]), context['ds']),
        filter(
            lambda x: len(x),
            api_hook.get_data(start_date=context['ds'],
                              end_date=context['ds'])))
    pg_hook.insert_rows("covid19", rows, ["data", "day"])
def move_mysql_to_redshift(tablename):
    mysql_hook = MySqlHook(mysql_conn_id='mysql_baseball')
    redshift_hook = PostgresHook(postgres_conn_id='redshift_host')
    sql = "select league_id, name, abbr from " + tablename
    cur = mysql_hook.get_records(sql)
    redshift_hook.insert_rows(tablename, cur)
    return cur
def _insert_file_to_sql(**context):
    df_result = pd.read_csv(f"{STORE_DIR}/output.csv",
                            names=["word", "count"])
    df_result["date"] = context["ds"]
    if not df_result.empty:
        for c in df_result.columns:
            if c == 'count':
                df_result[c] = df_result[c].astype(float)
        df_result = df_result.squeeze()  # squeeze the single-row dataframe
        df_tuple = [(df_result["date"], df_result["word"],
                     df_result["count"])]
        hook = PostgresHook(postgres_conn_id=CONNECTION_ID)
        hook.insert_rows(SQL_TABLE, df_tuple)
def execute(self, context):
    self.log.info('LoadFactOperator for {}'.format(self.table))
    # Get records using sql_queries.songplay_table_insert
    pg_hook = PostgresHook(self.redshift_conn_id)
    records = pg_hook.get_records(SqlQueries.songplay_table_insert)
    self.log.info('get {} records from {}'.format(len(records), self.table))
    self.log.info('{}'.format(records[0]))
    # Insert records into songplays
    pg_hook.insert_rows(self.table, records)
    self.log.info('inserted into {}'.format(self.table))
def copy_table(self, mysql_conn_id, postgres_conn_id):
    print("### fetching records from MySQL table ###")
    mysqlserver = MySqlHook(mysql_conn_id)
    sql_query = "SELECT * FROM clean_store_transactions"
    data = mysqlserver.get_records(sql_query)
    print("### inserting records into Postgres table ###")
    postgresserver = PostgresHook(postgres_conn_id)
    # insert_rows generates the INSERT statement itself, so no hand-written
    # "INSERT INTO ... VALUES (...)" query is needed here.
    postgresserver.insert_rows(table='clean_store_transactions', rows=data)
    return True
def copy_table(src_conn_id, dst_conn_id, table):
    src = PostgresHook(postgres_conn_id=src_conn_id)
    src_conn = src.get_conn()
    src_cursor = src_conn.cursor()
    dst = PostgresHook(postgres_conn_id=dst_conn_id)
    dst_conn = dst.get_conn()
    dst_cursor = dst_conn.cursor()
    # A table name cannot be passed as a bound parameter (%s): psycopg2
    # would quote it as a string literal. Interpolate the table name and
    # keep the id as a real parameter.
    dst_cursor.execute(f'SELECT MAX(id) FROM {table};')
    last_migrated_id = dst_cursor.fetchone()[0]
    if last_migrated_id is None:
        last_migrated_id = 0
    src_cursor.execute(f'SELECT * FROM {table} WHERE id > %s;',
                       [last_migrated_id])
    dst.insert_rows(table=table, rows=src_cursor)
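# If the table name comes from untrusted input, plain string interpolation
# is unsafe. A minimal sketch of the safer alternative, assuming psycopg2
# (a dependency of the Postgres hook) is available; max_id is a hypothetical
# helper, not part of the original code:
from psycopg2 import sql

def max_id(cursor, table):
    # sql.Identifier quotes the name as an identifier, not a string literal
    query = sql.SQL('SELECT MAX(id) FROM {};').format(sql.Identifier(table))
    cursor.execute(query)
    return cursor.fetchone()[0]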
def insert_data_into_postgres(**kwargs):
    '''Insert data from NFS into the destination DB.'''
    ti = kwargs['ti']
    fetched_val = ti.xcom_pull(task_ids='read_configuration_dest')
    print(fetched_val)
    postgres_hook = PostgresHook(postgres_conn_id=fetched_val['conn_id'],
                                 schema=fetched_val['schema'])
    lst = []
    # Drop the first column of each CSV row
    with open("testfile.csv", "r") as f:
        for line in f.readlines():
            print(line.replace('\n', '').split(',')[1:])
            lst.append(tuple(line.replace('\n', '').split(',')[1:]))
    postgres_hook.insert_rows(table=fetched_val['tablename'],
                              rows=lst,
                              target_fields=fetched_val['colname'].split(','))
    print("------- Data stored successfully in Destination DB ------------")
def read_and_insert(**kwargs):
    filenames = glob("dags/source_files/*.json")
    rows_to_insert = []
    ti = kwargs['ti']
    table_name = ti.xcom_pull(task_ids='ddl', key='table_name')
    for fn in filenames:
        with open(fn) as f:
            lines = f.readlines()
        for line in lines:
            line_dict = json.loads(line)
            rows_to_insert.append(line_dict.values())
    pg_hook = PostgresHook('postgres_default')
    pg_hook.insert_rows(table_name, rows_to_insert)
    rowcount = int(pg_hook.get_first(
        "SELECT COUNT(1) FROM {table_name}".format(
            table_name=table_name))[0])
    kwargs['ti'].xcom_push(key='rowcount', value=rowcount)
def execute(self, context):
    pg = PostgresHook(postgres_conn_id=self.pg_conn_id)
    with open(self.source_file_name, 'r') as data:
        lines = data.read().split('\n')
    timestamp = datetime.now().isoformat()  # assumption: per-load timestamp
    tuples = []
    for line in lines:
        if len(line) > 0:
            line_with_default = (timestamp + '|' + self.source_file_name
                                 + '|' + line)
            vals = line_with_default.split('|')
            str_vals = [str(x) for x in vals]
            tuples.append(str_vals)
    self.log.info("Inserting %s records to %s" %
                  (str(len(tuples)), self.target_table))
    pg.insert_rows(self.target_table, tuples, commit_every=1000)
def execute(self, context):
    self.log.info('LoadDimensionOperator for {}'.format(self.table))
    # Load records using sql_queries
    lookup = {
        'users': SqlQueries.user_table_insert,
        'songs': SqlQueries.song_table_insert,
        'artists': SqlQueries.artist_table_insert,
        'time': SqlQueries.time_table_insert,
    }
    pg_hook = PostgresHook('redshift')
    records = pg_hook.get_records(lookup[self.table])
    self.log.info('get {} records from {}'.format(len(records), self.table))
    # Insert them into the corresponding table
    if self.replace:
        pg_hook.run(f"DELETE FROM {self.table}")
    pg_hook.insert_rows(self.table, records)
    self.log.info('inserted into {}'.format(self.table))
def execute(self, context):
    # Read JSON data from S3
    s3 = S3Hook(aws_conn_id=self.aws_conn_id)
    s3_obj = s3.get_key(key=self.filename, bucket_name=self.bucket_name)
    bytes_buffer = io.BytesIO()
    s3_obj.download_fileobj(bytes_buffer)
    data_str = bytes_buffer.getvalue().decode('utf-8')
    data = json.loads(data_str)

    cols = ['id', 'publishedAt', 'title', 'description', 'categoryId',
            'duration', 'definition', 'viewCount', 'likeCount',
            'dislikeCount', 'favoriteCount', 'commentCount', 'embedHtml']

    # Prevent duplicates in the playlist
    rows = set([tuple(d[k] for k in cols) for d in data])

    # Insert into the target table
    postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    postgres.insert_rows(table=self.table, rows=rows)
def waze_jams_to_db():
    """Waze jams feed to PostGIS."""
    pg_hook = PostgresHook(postgres_conn_id='waze')
    tempfile = conf['temp_data_dir'] + '/waze_temp.csv'
    temp_df = pd.read_csv(tempfile, header=None, encoding='utf-8')
    rows_db = temp_df.values
    cols = ['uuid', 'waze_timestamp', 'street', 'start_node', 'end_node',
            'city', 'length', 'delay', 'speed', 'level', 'road_type', 'geom']
    logging.info('Pushing Waze data to Postgis.')
    pg_hook.insert_rows('public.waze_jams', rows_db,
                        target_fields=cols, commit_every=0)
    return 'Successfully pushed data to PostGIS.'
def execute(self, context):
    pg_hook = PostgresHook(postgres_conn_id=self.connection_id)
    pg_hook_analytics = PostgresHook(postgres_conn_id=self.connection_id)
    day = context['ds']
    pg_hook_analytics.run("DELETE FROM covid19_stats where day = %s",
                          parameters=[day])
    sql_read = """
        SELECT data #>> '{Country}' as country,
               day,
               sum((data #>> '{Confirmed}')::int) as confirmed,
               sum((data #>> '{Deaths}')::int) as deaths,
               sum((data #>> '{Recovered}')::int) as recovered,
               sum((data #>> '{Active}')::int) as active
        FROM covid19
        WHERE day = %s
        GROUP BY country, day"""
    rows = pg_hook.get_records(sql_read, parameters=[day])
    pg_hook_analytics.insert_rows(
        'covid19_stats', rows=rows,
        target_fields=['country', 'day', 'confirmed', 'deaths',
                       'recovered', 'active'])
def execute(self, context):
    http = HttpHook(self.method, http_conn_id=self.http_conn_id)
    self.log.info("Calling HTTP method")
    response = http.run(self.endpoint)
    json_resp = json.loads(response.content)
    self.log.info("Response received, saving results to Postgres")
    pg = PostgresHook(postgres_conn_id=self.pg_conn_id)
    tuples = []
    for res in json_resp:
        vals = list(res.values())
        str_vals = [str(x) for x in vals]
        tuples.append(str_vals)
    self.log.info("Inserting " + str(len(tuples)) + " records to Postgres")
    pg.insert_rows(self.destination_table, tuples)
def execute(self, context):
    # AWS hook (credentials currently unused in this path)
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()

    # Redshift hook
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # Get the current number of records in the table
    records = redshift.get_records(
        f"SELECT COUNT(*) FROM {self.table_name}")

    # Fields and data
    df = redshift.get_pandas_df(self.sql)
    fields = list(df.columns.values)
    data_rows = redshift.get_records(self.sql)

    if self.load_mode == "clean":
        # Clear existing data
        self.log.info(f"Clearing data from {self.table_name} table")
        redshift.run("DELETE FROM {}".format(self.table_name))
        self.log.info(
            f"Deleted {records[0][0]} records from {self.table_name}")
    else:
        # Incremental load: keep only rows inside the execution window
        job_execution_ts = self.filter_key[0].format(**context)
        next_job_execution_ts = self.filter_key[1].format(**context)
        filtered_df = df[(df['start_time'] >= job_execution_ts)
                         & (df['start_time'] < next_job_execution_ts)]
        data_rows = [tuple(x) for x in filtered_df.values]

    # Populate the table
    self.log.info("Populating data to {} table".format(self.table_name))
    redshift.insert_rows(table=self.table_name,
                         rows=data_rows,
                         target_fields=fields,
                         commit_every=1000,
                         replace=False)
    self.log.info("Inserted {} records to {}".format(
        len(data_rows), self.table_name))
def fetch_and_save_features(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    def get_rock_content_item(originId):
        params = {
            "loadAttributes": "expanded",
            "$orderby": "ModifiedDateTime desc",
            "attributeKeys": "childrenHaveComments",
        }
        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/ContentChannelItems/{originId}",
            params=params,
            headers=headers)
        return r.json()

    # Rock transports { foo: 'bar', baz: 'bat' } as 'foo^bar|baz^bat'
    def parse_key_value_attribute(value):
        if value == '':
            return []
        entries = value.split('|')

        def string_to_dict(s):
            k, v = s.split('^')
            return {"key": k, "value": unquote(v)}

        return map(string_to_dict, entries)

    def get_features(content):
        features = []
        postgres_id, parent_rock_id = pg_hook.get_first(
            'select "contentItems".id, "parentItem"."originId" '
            'from "contentItems" '
            'left outer join "contentItems" as "parentItem" '
            'on "parentItem".id = "contentItems"."parentId" '
            f'where "contentItems"."originId" = \'{content["Id"]}\'')

        # Add features from a key value list.
        # This is now the only way to add scripture and text features.
        # We previously supported a textFeature and
        key_value_features = parse_key_value_attribute(
            safeget_no_case(content, 'AttributeValues', 'Features', 'Value')
            or '')

        for kv in key_value_features:
            feature_type = kv["key"]
            if feature_type == "scripture":
                features.append({
                    "type": "Scripture",
                    "data": {"reference": kv["value"]},
                    "parent_id": postgres_id
                })
            if feature_type == "text":
                features.append({
                    "type": "Text",
                    "data": {"text": kv["value"]},
                    "parent_id": postgres_id
                })

        button_link_feature = safeget_no_case(content, 'AttributeValues',
                                              'ButtonText', 'Value')
        if button_link_feature:
            features.append({
                "type": "ButtonLink",
                "data": {
                    "title": safeget_no_case(content, 'AttributeValues',
                                             'ButtonText', 'Value'),
                    "url": safeget_no_case(content, 'AttributeValues',
                                           'ButtonLink', 'Value'),
                    "action": "OPEN_AUTHENTICATED_URL",
                },
                "parent_id": postgres_id
            })

        comment_feature = safeget_no_case(content, 'AttributeValues',
                                          'comments', 'value') or 'False'
        if comment_feature == 'True':
            features.append({
                "type": "AddComment",
                "data": {
                    "initialPrompt": "Write Something...",
                    "addPrompt": "What stands out to you?"
                },
                "parent_id": postgres_id
            })
            features.append({
                "type": "CommentList",
                "data": {
                    "initialPrompt": "Write Something...",
                    "addPrompt": "What stands out to you?"
                },
                "parent_id": postgres_id
            })
        elif parent_rock_id:
            parent_item = get_rock_content_item(parent_rock_id)
            parent_comment_feature = safeget_no_case(
                parent_item, 'AttributeValues', 'childrenHaveComments',
                'value') or 'False'
            if parent_comment_feature == 'True':
                features.append({
                    "type": "AddComment",
                    "data": {
                        "initialPrompt": "Write Something...",
                        "addPrompt": "What stands out to you?"
                    },
                    "parent_id": postgres_id
                })
                features.append({
                    "type": "CommentList",
                    "data": {
                        "initialPrompt": "Write Something...",
                        "addPrompt": "What stands out to you?"
                    },
                    "parent_id": postgres_id
                })

        return features

    fetched_all = False
    skip = 0
    top = 10000

    while not fetched_all:
        # Fetch content item records from Rock.
        params = {
            "$top": top,
            "$skip": skip,
            # "$select": "Id,Content",
            "loadAttributes": "expanded",
            "$orderby": "ModifiedDateTime desc",
            "attributeKeys": "features, comments, buttontext, buttonlink",
        }
        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/ContentChannelItems",
            params=params,
            headers=headers)
        rock_objects = r.json()

        def fix_casing(col):
            return "\"{}\"".format(col)

        features_by_item = list(map(get_features, rock_objects))
        print(features_by_item)
        flat_features_list = [
            item for items in features_by_item for item in items
        ]

        # "createdAt", "updatedAt", "apollosType", "data", "type",
        # "parentId", "parentType"
        def feature_attributes(obj):
            return (kwargs['execution_date'], kwargs['execution_date'],
                    obj["type"] + "Feature", Json(obj['data']), obj["type"],
                    obj['parent_id'], "ContentItem")

        insert_features = list(map(feature_attributes, flat_features_list))
        columns = list(
            map(fix_casing, ("createdAt", "updatedAt", "apollosType", "data",
                             "type", "parentId", "parentType")))

        pg_hook.insert_rows('"features"',
                            insert_features,
                            columns,
                            0,
                            True,
                            replace_index=('"parentId"', '"type"', '"data"'))

        add_apollos_ids = """
            UPDATE "features"
            SET "apollosId" = "apollosType" || ':' || id::varchar
            WHERE "apollosId" IS NULL
        """
        pg_hook.run(add_apollos_ids)

        skip += top
        fetched_all = len(r.json()) < top
def fetch_and_save_people(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    fetched_all = False
    skip = 0
    top = 10000

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    # Rock stores gender as an integer
    gender_map = ('UNKNOWN', 'MALE', 'FEMALE')

    # Fetch the available campuses.
    campuses = pg_hook.get_records("""
        SELECT "originId", id
        FROM campuses
        WHERE "originType" = 'rock'
    """)
    campus_map = dict(campuses)
    campus_map['None'] = None

    while not fetched_all:
        # Fetch people records from Rock.
        params = {
            "$top": top,
            "$skip": skip,
            "$expand": "Photo",
            "$select": "Id,NickName,LastName,Gender,BirthDate,PrimaryCampusId,Email,Photo/Path",
            "$orderby": "ModifiedDateTime desc",
        }
        if not kwargs['do_backfill']:
            ts = kwargs['execution_date'].strftime('%Y-%m-%dT00:00')
            params['$filter'] = (f"ModifiedDateTime ge datetime'{ts}'"
                                 " or ModifiedDateTime eq null")

        print(params)

        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/People",
            params=params,
            headers=headers)
        rock_objects = r.json()

        if not isinstance(rock_objects, list):
            print(rock_objects)
            print("uh oh, we might have made a bad request")
            print(f"top: {top}")
            print(f"skip: {skip}")
            skip += top
            continue

        skip += top
        fetched_all = len(rock_objects) < top

        def photo_url(path):
            if path is None:
                return None
            elif path.startswith("~"):
                rock_host = (Variable.get(kwargs['client'] +
                                          '_rock_api')).split("/api")[0]
                return path.replace("~", rock_host)
            else:
                return path

        # "createdAt", "updatedAt", "originId", "originType", "apollosType",
        # "firstName", "lastName", "gender", "birthDate", "campusId",
        # "email", "profileImageUrl"
        def update_people(obj):
            return (kwargs['execution_date'], kwargs['execution_date'],
                    obj['Id'], 'rock', 'Person',
                    clean_string(obj['NickName']),
                    clean_string(obj['LastName']),
                    gender_map[obj['Gender']], obj['BirthDate'],
                    campus_map[str(obj["PrimaryCampusId"])],
                    clean_string(obj['Email']),
                    photo_url(safeget(obj, 'Photo', 'Path')))

        def fix_casing(col):
            return "\"{}\"".format(col)

        people_to_insert = list(map(update_people, rock_objects))
        columns = list(
            map(fix_casing,
                ("createdAt", "updatedAt", "originId", "originType",
                 "apollosType", "firstName", "lastName", "gender",
                 "birthDate", "campusId", "email", "profileImageUrl")))

        pg_hook.insert_rows('people',
                            people_to_insert,
                            columns,
                            0,
                            True,
                            replace_index=('"originId"', '"originType"'))

        add_apollos_ids = """
            UPDATE people
            SET "apollosId" = 'Person:' || id::varchar
            WHERE "originType" = 'rock' and "apollosId" IS NULL
        """
        pg_hook.run(add_apollos_ids)
def fetch_and_save_persona_tags(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    fetched_all = False
    skip = 0
    top = 10000

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    person_entity_id = requests.get(
        f"{Variable.get(kwargs['client'] + '_rock_api')}/EntityTypes",
        params={"$filter": "Name eq 'Rock.Model.Person'"},
        headers=headers).json()[0]['Id']

    while not fetched_all:
        params = {
            "$top": top,
            "$skip": skip,
            "$filter": f"EntityTypeId eq {person_entity_id} and CategoryId eq {186}",
            "$select": "Id,Name,Guid",
            "$orderby": "ModifiedDateTime desc",
        }
        if not kwargs['do_backfill']:
            ts = kwargs['execution_date'].strftime('%Y-%m-%dT00:00')
            params['$filter'] += (
                f" and (ModifiedDateTime ge datetime'{ts}'"
                f" or ModifiedDateTime eq null"
                f" or PersistedLastRefreshDateTime ge datetime'{ts}')")

        print(params)

        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/DataViews",
            params=params,
            headers=headers)
        rock_objects = r.json()

        if not isinstance(rock_objects, list):
            print(rock_objects)
            print("uh oh, we might have made a bad request")
            print(f"top: {top}")
            print(f"skip: {skip}")
            skip += top
            continue

        skip += top
        fetched_all = len(rock_objects) < top

        # "createdAt", "updatedAt", "originId", "originType", "apollosType",
        # "name", "data", "type"
        def update_tags(obj):
            return (kwargs['execution_date'], kwargs['execution_date'],
                    obj['Id'], 'rock', 'Tag', obj['Name'],
                    Json({"guid": obj["Guid"]}), "Persona")

        def fix_casing(col):
            return "\"{}\"".format(col)

        tags_to_insert = list(map(update_tags, rock_objects))
        columns = list(
            map(fix_casing,
                ("createdAt", "updatedAt", "originId", "originType",
                 "apollosType", "name", "data", "type")))

        pg_hook.insert_rows('tags',
                            tags_to_insert,
                            columns,
                            0,
                            True,
                            replace_index=('"originId"', '"originType"'))

        add_apollos_ids = """
            UPDATE "tags"
            SET "apollosId" = "apollosType" || ':' || id::varchar
            WHERE "originType" = 'rock' and "apollosId" IS NULL
        """
        pg_hook.run(add_apollos_ids)
def load_data(table_name, filepath, **kwargs):
    hook = PostgresHook(postgres_conn_id=POSTGRES_CONN)
    df = pd.read_csv(filepath, dtype=TYPES)
    hook.insert_rows(table_name, rows=df.values.tolist())
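# A minimal sketch of how a loader callable like load_data above might be
# wired into a DAG with PythonOperator, assuming load_data is defined in the
# same DAG file; the DAG id, table name, and file path are assumptions for
# illustration.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id="load_csv_to_postgres", start_date=datetime(2024, 1, 1),
         schedule_interval=None, catchup=False) as dag:
    load_task = PythonOperator(
        task_id="load_data",
        python_callable=load_data,
        op_kwargs={"table_name": "churn_raw",       # assumed table
                   "filepath": "/data/churn.csv"},  # assumed path
    )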
def fetch_and_save_content_items(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    fetched_all = False
    skip = 0
    top = 10000

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    while not fetched_all:
        # Fetch content item records from Rock.
        params = {
            "$top": top,
            "$skip": skip,
            # "$expand": "Photo",
            # "$select": "Id,Content",
            "loadAttributes": "expanded",
            "attributeKeys": "Summary",
            "$orderby": "ModifiedDateTime desc",
        }
        if not kwargs['do_backfill']:
            ts = kwargs['execution_date'].strftime('%Y-%m-%dT00:00')
            params['$filter'] = (f"ModifiedDateTime ge datetime'{ts}'"
                                 " or ModifiedDateTime eq null")

        print(params)

        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/ContentChannelItems",
            params=params,
            headers=headers)
        rock_objects = r.json()

        if not isinstance(rock_objects, list):
            print(rock_objects)
            print("uh oh, we might have made a bad request")
            print(f"top: {top}")
            print(f"skip: {skip}")
            skip += top
            continue

        skip += top
        fetched_all = len(rock_objects) < top

        config = Variable.get(kwargs['client'] + "_rock_config",
                              deserialize_json=True)

        # "createdAt", "updatedAt", "originId", "originType", "apollosType",
        # "summary", "htmlContent", "title", "publishAt"
        def update_content(obj):
            return (
                kwargs['execution_date'],
                kwargs['execution_date'],
                obj['Id'],
                'rock',
                get_typename(obj, config),
                create_summary(obj),
                create_html_content(obj),
                obj['Title'],
                obj['StartDateTime'],
            )

        def fix_casing(col):
            return "\"{}\"".format(col)

        content_to_insert = list(map(update_content, rock_objects))
        columns = list(
            map(fix_casing,
                ("createdAt", "updatedAt", "originId", "originType",
                 "apollosType", "summary", "htmlContent", "title",
                 "publishAt")))

        pg_hook.insert_rows('"contentItems"',
                            content_to_insert,
                            columns,
                            0,
                            True,
                            replace_index=('"originId"', '"originType"'))

        add_apollos_ids = """
            UPDATE "contentItems"
            SET "apollosId" = "apollosType" || ':' || id::varchar
            WHERE "originType" = 'rock' and "apollosId" IS NULL
        """
        pg_hook.run(add_apollos_ids)
def fetch_and_save_media(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    def get_content_item_id(rockOriginId):
        return pg_hook.get_first(
            f'SELECT id FROM "contentItems" WHERE "originId"::Integer = {rockOriginId}'
        )[0]

    def mapContentItems(contentItem):
        nodeId = get_content_item_id(contentItem['Id'])

        def filter_media_attributes(attribute):
            return (is_media_image(attribute, contentItem)
                    or is_media_video(attribute, contentItem)
                    or is_media_audio(attribute, contentItem))

        def get_media_type(attribute):
            if is_media_image(attribute, contentItem):
                return 'IMAGE'
            elif is_media_video(attribute, contentItem):
                return 'VIDEO'
            elif is_media_audio(attribute, contentItem):
                return 'AUDIO'
            else:
                return 'UNKNOWN_MEDIA_TYPE'

        def get_media_value(attribute):
            mediaType = get_media_type(attribute)
            attributeKey = attribute['Key']
            attributeValue = contentItem['AttributeValues'][attributeKey]['Value']
            if mediaType == 'IMAGE':
                rock_host = (Variable.get(kwargs['client'] +
                                          '_rock_api')).split("/api")[0]
                return (rock_host + "/GetImage.ashx?guid=" + attributeValue
                        if len(attributeValue) > 0 else '')
            else:
                return attributeValue

        def map_attributes(attribute):
            attributeKey = attribute['Key']
            attributeFieldType = attribute['FieldTypeId']
            attributeValue = contentItem['AttributeValues'][attributeKey]['Value']
            attributeValueId = str(contentItem['Id']) + "/" + str(attribute['Id'])
            mediaType = get_media_type(attribute)
            mediaValue = get_media_value(attribute)
            metadata = {}
            if mediaType == 'IMAGE' and mediaValue:
                imageDimensions = getsizes(mediaValue)
                metadata['width'] = imageDimensions[0]
                metadata['height'] = imageDimensions[1]
            if mediaValue:
                return ('Media', kwargs['execution_date'],
                        kwargs['execution_date'], nodeId, 'ContentItem',
                        mediaType, mediaValue, attributeValueId, 'rock',
                        json.dumps(metadata))
            return None

        filteredAttributes = filter(filter_media_attributes,
                                    contentItem['Attributes'].values())
        mappedAttributes = map(map_attributes, filteredAttributes)
        return list(filter(lambda media: bool(media), mappedAttributes))

    fetched_all = False
    skip = 0
    top = 10000

    while not fetched_all:
        # Fetch content item records from Rock.
        params = {
            "$top": top,
            "$skip": skip,
            # "$select": "Id,Content",
            "loadAttributes": "expanded",
            "$orderby": "ModifiedDateTime desc",
        }
        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/ContentChannelItems",
            params=params,
            headers=headers)

        def fix_casing(col):
            return "\"{}\"".format(col)

        mediaAttributeLists = list(map(mapContentItems, r.json()))
        mediaAttributes = [
            mediaAttribute for sublist in mediaAttributeLists
            for mediaAttribute in sublist
        ]

        columns = list(
            map(fix_casing,
                ('apollosType', 'createdAt', 'updatedAt', 'nodeId',
                 'nodeType', 'type', 'url', 'originId', 'originType',
                 'metadata')))

        print('Media Items Added: ')
        print(len(mediaAttributes))

        pg_hook.insert_rows('"media"',
                            mediaAttributes,
                            columns,
                            0,
                            True,
                            replace_index=('"originId"', '"originType"'))

        add_apollos_ids = """
            UPDATE "media"
            SET "apollosId" = "apollosType" || ':' || id::varchar
            WHERE "originType" = 'rock' and "apollosId" IS NULL
        """
        pg_hook.run(add_apollos_ids)

        skip += top
        fetched_all = len(r.json()) < top
def insertRow(tableName, **context):
    userName = context['ti'].xcom_pull(key=None, task_ids=push_current_user)
    hook = PostgresHook()
    hook.insert_rows(table=tableName,
                     rows=[(random.randint(0, 10000000), userName)],
                     target_fields=["id", "userName"])
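# Common to all of the examples above is DbApiHook.insert_rows, which batches
# rows into INSERT statements. A minimal sketch of the pattern, assuming a
# "postgres_default" connection and a hypothetical events(id, name) table
# with a unique index on id; the upsert kwargs mirror the PostgresHook usage
# in the Rock examples above:
from airflow.providers.postgres.hooks.postgres import PostgresHook

hook = PostgresHook(postgres_conn_id="postgres_default")
hook.insert_rows(
    table="events",
    rows=[(1, "signup"), (2, "login")],
    target_fields=["id", "name"],  # column order of each tuple
    commit_every=1000,             # commit in batches of 1000 rows
    replace=True,                  # upsert via INSERT ... ON CONFLICT
    replace_index="id",            # conflict target (PostgresHook-specific)
)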