def drop_trigger_type_notification(db, schema, table, name, proc):
    """
    Drops a trigger from a table if the trigger exists.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string
    table : string
    name : string
         : name of the trigger
    proc : string
         : not used by this function

    Returns
    -------
    -
    """
    statement = "DROP TRIGGER IF EXISTS %s ON %s.%s CASCADE" % (
        name, schema, table)
    try:
        logger.info("[TABLE] Dropping trigger '%s'" % name)
        db.execute_cmd(statement)
    except Exception as err:
        # Failures are only logged; nothing is raised to the caller.
        logger.error(
            "[TABLE] Dropping trigger '%s' failed: %s" % (name, err))
def add_pk(db, schema, table, attr):
    """
    Adds a primary key constraint to a PostgreSQL table.

    The table name is lowercased before it is placed in the statement.
    Errors raised while executing the statement are logged, not raised.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string
    table : string
    attr : string
         : column which becomes the primary key

    Returns
    -------
    -

    Example
    -------
    add_pk(pg, 'public', 'employee', 'id')
    """
    target = '%s.%s' % (schema, table.lower())
    statement = 'ALTER TABLE %s ADD PRIMARY KEY (%s)' % (target, attr)
    try:
        db.execute_cmd(statement)
    except Exception as err:
        message = (
            " Failed to add primary key to table %s. Details: %s "
            % (table, err)
        )
        logger.error(message, CURR_FILE)
def add_multiple_columns(db, schema, table, attrs, types):
    """
    Adds one or more columns to a table with a single ALTER TABLE.

    Columns are paired positionally (attrs[k] with types[k]); surplus
    entries of the longer list are ignored. The table name is lowercased.
    When there is nothing to add, no statement is sent because an
    ALTER TABLE without any ADD COLUMN clause is not valid SQL.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string
    table : string
    attrs : list
          : column names
    types : list
          : column types, in the same order as attrs

    Returns
    -------
    -

    Example
    -------
    add_multiple_columns(
        db, 'public', 'employee', ['nyanya', 'some_integer'],
        ['text', 'integer'])
    """
    columns = list(zip(attrs, types))
    if not columns:
        return

    table_lower = table.lower()
    clauses = ', '.join(
        'ADD COLUMN IF NOT EXISTS %s %s' % (attr, attr_type)
        for attr, attr_type in columns
    )
    cmd = "ALTER TABLE IF EXISTS %s.%s %s;" % (schema, table_lower, clauses)

    for attr, attr_type in columns:
        logger.warn("Adding column %s (%s) to company %s." % (
            attr, attr_type, table_lower))

    try:
        db.execute_cmd(cmd)
    except Exception as ex:
        # Failures are only logged; nothing is raised to the caller.
        logger.error('[TABLE] %s when executing command %s.' % (ex, cmd))
def exists(db, schema, table):
    """
    Checks if a table is listed in information_schema.tables.

    The table name is lowercased before the lookup.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd_with_fetch
    schema : string
    table : string

    Returns
    -------
    True: the query returned a non-empty result
    False: the query returned an empty result
    None: the query raised an exception (the error is logged)
    """
    cmd = (
        " SELECT table_name FROM information_schema.tables"
        " WHERE table_schema='%s' AND table_name='%s'; "
    ) % (schema, table.lower())
    try:
        found = db.execute_cmd_with_fetch(cmd)
        return bool(found)
    except Exception as err:
        logger.error('[TABLE] %s when executing command %s.' % (err, cmd))
def log_rows(db, schema, values):
    """
    Inserts a batch of rows into the purr_oplog table.

    Parameters
    ----------
    db : obj
       : connection object handed to row.insert_bulk
    schema : string
    values : list
           : rows to insert; each row matches the attributes of
             purr_oplog without the leading id

    Returns
    -------
    -

    Example
    -------
    log_rows(pg, 'purr', rows)
    """
    oplog_table = "purr_oplog"
    # id is SERIAL type, we can skip it when inserting rows:
    insert_attrs = table_desc[oplog_table]["attrs"][1:]
    try:
        row.insert_bulk(db, schema, oplog_table, insert_attrs, values)
    except Exception as err:
        logger.error(
            "[TRANSFER_INFO] Failed to insert logs into table %s: %s"
            % (oplog_table, err))
def create_log_error_table(db, schema='public'):
    """
    Creates the purr_error table.

    Attribute names and types are read from table_desc["purr_error"].

    Parameters
    ----------
    db: connection obj
    schema: name of the schema in Postgres

    Returns
    -------
    -

    Example
    -------
    create_log_error_table(pg, 'purr')
    """
    table_name = "purr_error"
    attrs = table_desc[table_name]["attrs"]
    types = table_desc[table_name]["types"]
    try:
        table.create(db, schema, table_name, attrs, types)
        logger.info("[TRANSFER INFO] Created table %s." % (table_name))
    except Exception as ex:
        # Failures are only logged; nothing is raised to the caller.
        logger.error(
            "[TRANSFER_INFO] Failed to create table %s: %s"
            % (table_name, ex))
def create_transfer_stats_table(db, schema='public'):
    """
    Creates the purr_transfer_stats table.

    Attribute names and types are read from
    table_desc["purr_transfer_stats"].

    Parameters
    ----------
    db: connection obj
    schema: name of the schema in Postgres

    Returns
    -------
    -

    Example
    -------
    create_transfer_stats_table(pg, 'purr')
    """
    table_name = "purr_transfer_stats"
    attrs = table_desc[table_name]["attrs"]
    types = table_desc[table_name]["types"]
    try:
        table.create(db, schema, table_name, attrs, types)
        logger.info("[TRANSFER INFO] Created table %s." % (table_name))
    except Exception as ex:
        # Failures are only logged; nothing is raised to the caller.
        logger.error(
            "[TRANSFER_INFO] Failed to create table %s: %s"
            % (table_name, ex))
def drop_type_notification(db, name):
    """
    Drops a function (taking no arguments) if it exists.

    Parameters
    ----------
    db : obj
       : Postgres connection object
    name : string
         : name of the function

    Returns
    -------
    -

    Example
    -------
    drop_type_notification(db, function_name)
    """
    statement = "DROP FUNCTION IF EXISTS %s();" % name
    try:
        db.execute_cmd(statement)
        # The info line is written only after the statement went through.
        logger.info("Dropping procedure: %s" % name, CURR_FILE)
    except Exception as err:
        logger.error("Dropping procedure failed: %s" % err, CURR_FILE)
def create_type_notification(db, name):
    """
    Creates (or replaces) a trigger function which sends the payload
    'type_change' to channel 'purr' through pg_notify.

    Parameters
    ----------
    db : obj
       : Postgres connection object
    name : string
         : name of the function

    Returns
    -------
    -

    Example
    -------
    create_type_notification(db, name_function)
    """
    cmd = (
        "CREATE OR REPLACE FUNCTION %s() RETURNS TRIGGER AS $$ "
        "BEGIN PERFORM pg_notify('purr', 'type_change'); "
        "RETURN NULL; END; $$ LANGUAGE plpgsql; "
    ) % name
    try:
        logger.info("Creating procedure: %s" % name, CURR_FILE)
        db.execute_cmd(cmd)
    except Exception as ex:
        # The message used to read "Insert failed", which does not
        # describe what this function does.
        logger.error("Creating procedure failed: %s" % ex, CURR_FILE)
def create_stat_table(db, schema='public'):
    """
    Creates the purr_info table and seeds it with one row when the
    table holds no timestamp yet.

    The seeded row is [0, None, <current unix time as int>].

    Parameters
    ----------
    db: connection obj
    schema: name of the schema in Postgres

    Returns
    -------
    -

    Example
    -------
    create_stat_table(pg, 'purr')
    """
    table_name = "purr_info"
    desc = table_desc[table_name]
    attrs, types = desc["attrs"], desc["types"]
    values = [0, None, int(time.time())]
    try:
        table.create(db, schema, table_name, attrs, types)
        # NOTE(review): if get_latest_successful_ts returns None, len()
        # raises here and the failure is reported by the handler below.
        if len(get_latest_successful_ts(db, schema)) == 0:
            row.insert(db, schema, table_name, attrs, values)
        logger.info("[TRANSFER INFO] Created table %s." % (table_name))
    except Exception as err:
        logger.error(
            "[TRANSFER_INFO] Failed to create table %s: %s"
            % (table_name, err))
def log_error(db, values, schema='public'):
    """
    Inserts one row into the purr_error table.

    Parameters
    ----------
    db : obj
       : connection object handed to row.insert
    values : list
           : values matching the attributes of purr_error without the
             leading id
    schema : string

    Returns
    -------
    -

    Example
    -------
    log_error(pg, values, 'purr')
    """
    error_table = "purr_error"
    # id is SERIAL type, we can skip it when inserting rows:
    insert_attrs = table_desc[error_table]["attrs"][1:]
    try:
        row.insert(db, schema, error_table, insert_attrs, values)
    except Exception as err:
        logger.error(
            "[TRANSFER_INFO] Failed to insert logs into table %s: %s"
            % (error_table, err))
def insert_multiple(self, docs, r, coll):
    '''
    Transfers multiple documents with different fields
    (not whole collections). Used by [TAILER]

    Parameters
    ----------
    docs : documents to transfer
    r : Relation
        relation in PG
    coll : string
         : collection name

    Returns
    -------
    -
    '''
    config = cp.config_fields(self.coll_def, coll)
    att_new, att_orig, types, name_rel, type_x_props_pg = config

    # TODO: check if this is necessary:
    if types == []:
        return

    # The extra-properties attribute is not part of the original
    # document: anything that is not defined in collection.yml ends up
    # in it. prepare_attr_details builds the description of every
    # attribute before and after the conversion.
    self.attr_details = self.prepare_attr_details(
        att_new, att_orig, types, type_x_props_pg)

    # TODO remove this stuff with the extra props
    try:
        with_extra = self.include_extra_props
        if with_extra is True:
            bulk_insert = r.insert_bulk
        else:
            bulk_insert = r.insert_bulk_no_extra_props_tailed
        bulk_insert(docs, self.attr_details, self.include_extra_props)
    except Exception as err:
        logger.error(
            " %s Transferring to %s was unsuccessful. Exception: %s "
            % (CURR_FILE, r.relation_name, err))
        logger.error("%s\n" % docs)
def get_latest_successful_ts(db, schema='public'):
    """
    Reads the latest_successful_ts column of the purr_info table.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd_with_fetch
    schema : string

    Returns
    -------
    Whatever db.execute_cmd_with_fetch returns for the query, or None
    when the query raised an exception (the error is logged).

    Example
    -------
    get_latest_successful_ts(pg, 'purr')
    """
    query = "SELECT latest_successful_ts FROM %s.%s;" % (schema, 'purr_info')
    try:
        return db.execute_cmd_with_fetch(query)
    except Exception as err:
        logger.error(
            "[TRANSFER_INFO] Failed to get the timestamp "
            "of the latest successful transfer: %s" % err
        )
def drop(db, schema, tables):
    """
    Drops one or more tables in the PG database.

    Table names are lowercased and qualified with the schema. When the
    list is empty no statement is sent, because DROP TABLE without a
    table name is not valid SQL.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string
    tables : list

    Returns
    -------
    -

    Example
    -------
    drop(pg, 'public', ['my_table'])

    Todo
    ----
    - first check if all tables in the list exist
    """
    qualified = ['%s.%s' % (schema, t.lower()) for t in tables]
    if not qualified:
        return

    tables_cmd = ', '.join(qualified)
    cmd = "DROP TABLE IF EXISTS %s" % (tables_cmd)
    try:
        db.execute_cmd(cmd)
        logger.info('[TABLE] Dropping table(s) %s.' % (tables_cmd))
    except Exception as ex:
        # Failures are only logged; nothing is raised to the caller.
        logger.error('[TABLE] %s when executing command %s.' % (ex, cmd))
def create(db, schema, name, attrs, types, pks=("id",)):
    """
    Creates a table in Postgres if it does not exist yet.

    Column names and primary key names are double-quoted; the table
    name is lowercased. Errors raised while executing the statement are
    logged, not raised.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string
    name : string
         : table name
    attrs : list
          : column names
    types : list
          : column types; types[k] belongs to attrs[k] and must hold at
            least as many entries as attrs
    pks : sequence
        : primary key columns; when empty, no PRIMARY KEY clause is
          emitted

    Returns
    -------
    -

    Example
    -------
    create(pg, 'public', 'employee', ['id', 'name'], ['integer', 'text'])
    """
    # Index into types on purpose: a types list that is too short raises
    # IndexError instead of silently creating a table with fewer columns.
    columns = ['"%s" %s' % (attr, types[i]) for i, attr in enumerate(attrs)]
    if pks:
        quoted_pks = ",".join('"%s"' % pk for pk in pks)
        columns.append("PRIMARY KEY (%s)" % quoted_pks)

    cmd = "CREATE TABLE IF NOT EXISTS %s.%s(%s);" % (
        schema, name.lower(), ", ".join(columns))
    try:
        db.execute_cmd(cmd)
    except Exception as ex:
        logger.error('[TABLE] %s when executing command %s.' % (ex, cmd))
def create_oplog_table(db, schema='public'):
    """
    Drops the purr_oplog table and creates it again.

    Attribute names, types and primary keys are read from
    table_desc["purr_oplog"].

    Parameters
    ----------
    db: connection obj
    schema: name of the schema in Postgres

    Returns
    -------
    -

    Example
    -------
    create_oplog_table(pg, 'purr')
    """
    table_name = "purr_oplog"
    attrs = table_desc[table_name]["attrs"]
    types = table_desc[table_name]["types"]
    pks = table_desc[table_name]["pks"]
    try:
        table.drop(db, schema, [table_name])
        table.create(db, schema, table_name, attrs, types, pks)
        logger.info("[TRANSFER INFO] Created table %s." % (table_name))
    except Exception as ex:
        # Failures are only logged; nothing is raised to the caller.
        logger.error(
            "[TRANSFER_INFO] Failed to create table %s: %s"
            % (table_name, ex))
def vacuum(db, schema, table):
    """
    Runs VACUUM FULL ANALYZE on a table.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string
    table : string

    Returns
    -------
    -
    """
    qualified = "%s.%s" % (schema, table)
    statement = "VACUUM FULL ANALYZE %s;" % qualified
    try:
        logger.info("[TABLE] Vacuuming table '%s'" % qualified)
        db.execute_cmd(statement)
    except Exception as err:
        logger.error(
            "[TABLE] Vacuuming table '%s' failed: %s" % (qualified, err))
def get_doc_by_id(db, name, id):
    """
    Loads a single document from a collection by its _id.

    Parameters
    ----------
    db : obj
       : database object indexable by collection name
    name : string
         : name of the collection
    id : value handed to ObjectId

    Returns
    -------
    The result of find_one, or None when an exception was raised
    (the error is logged).
    """
    try:
        return db[name].find_one({"_id": ObjectId(id)})
    except Exception as err:
        logger.error(
            " [COLLECTION] Loading document from collection %s failed."
            " Details: %s" % (name, err))
def start(extractor, coll_config):
    """
    Starts the transfer of the configured collections.

    Parameters
    ----------
    extractor : obj
              : object exposing transfer(collections)
    coll_config : collection configuration handed to
                  cp.config_collection_names

    Returns
    -------
    -
    """
    names = cp.config_collection_names(coll_config)
    if names is not None:
        extractor.transfer(names)
        return
    # Nothing to transfer: report it and stop.
    logger.error(
        " [TRANSFER] No collections found."
        " Check your collection names in the setup file. ")
def create_trigger_type_notification(db, schema, table, name, proc):
    """
    Creates a row-level trigger which executes a procedure after every
    INSERT, UPDATE or DELETE on a table.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string
    table : string
    name : string
         : name of the trigger
    proc : string
         : name of the procedure the trigger executes

    Returns
    -------
    -
    """
    statement = (
        " CREATE TRIGGER %s AFTER INSERT OR UPDATE OR DELETE ON %s.%s"
        " FOR EACH ROW EXECUTE PROCEDURE %s() "
    ) % (name, schema, table, proc)
    try:
        logger.info("[TABLE] Creating trigger '%s'" % name)
        db.execute_cmd(statement)
    except Exception as err:
        logger.error(
            "[TABLE] Creating trigger '%s' failed: %s" % (name, err))
def __init__(self, settings):
    """
    Sets up the MongoDB client and the database handle.

    Both attributes always exist after construction: self.client and
    self.conn stay None when the corresponding step failed. Failures
    are logged, not raised.

    Parameters
    ----------
    settings : dict
             : must contain 'db_name' and 'connection'
    """
    db_name = settings['db_name']
    # Define the attributes up front so a failed setup leaves them as
    # None instead of missing.
    self.client = None
    self.conn = None
    try:
        self.client = pymongo.MongoClient(settings['connection'])
    except Exception as ex:
        logger.error("Could not initialize MongoDB client: %s" % ex)
        # Without a client there is nothing to select a database from.
        return
    try:
        self.conn = self.client[db_name]
    except Exception as ex:
        logger.error("Could not create connection to MongoDB: %s" % ex)
def update_latest_successful_ts(db, schema, dt):
    """
    Sets latest_successful_ts in the purr_info table.

    The statement has no WHERE clause, so every row of the table is
    updated. Errors are logged, not raised.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string
    dt : value stored as the timestamp (converted with str)

    Returns
    -------
    -
    """
    ts_text = str(dt)
    statement = "UPDATE %s.purr_info SET latest_successful_ts='%s';" % (
        schema, ts_text)
    try:
        db.execute_cmd(statement)
    except Exception as err:
        logger.error(
            "[TRANSFER_INFO] Failed to update the timestamp "
            "of the latest successful transfer: %s" % err
        )
def get_table(db, schema='public'):
    """
    Reads the collection map from the purr_collection_map table.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd_with_fetch
    schema : string

    Returns
    -------
    The rows (id, collection_name, relation_name, types) ordered by id
    as returned by db.execute_cmd_with_fetch, or None when the query
    raised an exception (the error is logged).
    """
    cmd = (
        "SELECT id, collection_name, relation_name, types "
        "FROM %s.purr_collection_map ORDER BY id"
    ) % (schema)
    try:
        coll_map = db.execute_cmd_with_fetch(cmd)
        logger.info("Getting schema from DB.", CURR_FILE)
        return coll_map
    except Exception as ex:
        # The message needs a placeholder: formatting a string without
        # one raises TypeError inside this handler.
        logger.error(
            "[TRANSFER_INFO] Failed to get collection map table: %s" % (ex))
def reset(db, schema='public'):
    """
    Drops the schema (with CASCADE) if it exists and creates it again.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string

    Returns
    -------
    -
    """
    statements = (
        'DROP SCHEMA IF EXISTS %s CASCADE;' % schema,
        'CREATE SCHEMA %s;' % schema,
    )
    try:
        for statement in statements:
            db.execute_cmd(statement)
        logger.info("[SCHEMA] Schema %s is reset." % schema)
    except Exception as err:
        logger.error("[SCHEMA] Schema reset failed. %s" % err)
def create(db, schema='public'):
    """
    Creates the schema if it does not exist.

    Errors raised while executing the statement are logged, not raised.

    Parameters
    ----------
    db : obj
       : connection object exposing execute_cmd
    schema : string

    Returns
    -------
    -
    """
    statement = 'CREATE SCHEMA IF NOT EXISTS %s;' % schema
    try:
        db.execute_cmd(statement)
    except Exception as err:
        logger.error(
            " [SCHEMA] Creating schema with name %s failed. Details: %s "
            % (schema, err))
def create_file(coll_map):
    """
    Writes the collection map to collections.yml in block style YAML.

    An existing file is overwritten. Errors are logged, not raised.

    Parameters
    ----------
    coll_map : object handed to yaml.dump

    Returns
    -------
    -
    """
    target = "collections.yml"
    try:
        logger.info("%s Creating collection map file..." % CURR_FILE)
        with open(target, "w") as handle:
            yaml.dump(coll_map, handle, default_flow_style=False)
        logger.info("Collection map file created: %s" % target, CURR_FILE)
    except Exception as err:
        logger.error(
            "Failed to create collection map file. Details: %s" % err,
            CURR_FILE)
def log_tailed_docs(pg, schema, docs_useful, ids_log, table_name, oper,
                    merged):
    """
    Builds one log entry per tailed document and hands the batch to
    transfer_info.log_rows.

    Each entry is the tuple
    (oper, table_name, id, timestamp, merged, document text).
    The document text is str(document), "Doc is NULL" when the document
    is None or the operation is DELETE, and "no entry" when converting
    the document failed.

    Parameters
    ----------
    pg : obj
       : connection object handed to transfer_info.log_rows
    schema : string
    docs_useful : list
                : documents; ids_log[k] belongs to docs_useful[k]
    ids_log : list
    table_name : string
    oper : operation, compared against DELETE
    merged : value stored as-is in every entry

    Returns
    -------
    -
    """
    entries = []
    ts = time.time()
    logger.info("IDs: %s" % ids_log)

    if len(ids_log) != len(docs_useful) and oper != DELETE:
        logger.error("n(ids)=%s; n(docs_useful)=%s" % (
            len(ids_log), len(docs_useful)))

    for idx, entry in enumerate(docs_useful):
        doc_id = ids_log[idx]
        text = "no entry"
        try:
            text = (
                str(entry)
                if (entry is not None and oper != DELETE)
                else "Doc is NULL"
            )
        except Exception as err:
            logger.error(
                "%s Converting log entry failed. Details: %s\n Document: "
                % (CURR_FILE, err))
            logger.error(entry)
        entries.append((oper, table_name, doc_id, ts, merged, text))

    try:
        transfer_info.log_rows(pg, schema, entries)
    except Exception as err:
        logger.error("%s Logging failed. Details: %s" % (CURR_FILE, err))
def coll_in_map(self, name):
    '''
    Checks if a collection is a key of self.coll_settings.

    Parameters
    ----------
    name : string
         : name of collection as 'name_db.name_coll',
           e.g. 'cat_db.Breeds'. Everything after the first dot is
           taken as the collection name, so names which contain dots
           themselves are kept whole.

    Returns
    -------
    True: the collection is a key of self.coll_settings
    False: otherwise, including when name holds no dot or the lookup
           fails (the error is logged)
    '''
    try:
        # maxsplit=1 keeps dotted collection names in one piece; the
        # split sits inside the try so a name without a dot is logged
        # and answered with False.
        coll = name.split(".", 1)[1]
        return coll in self.coll_settings
    except Exception as ex:
        logger.error("%s Details %s" % (CURR_FILE, ex))
        return False
def check(db, colls_requested):
    """
    Filters the requested collections down to those which exist in the
    database.

    Collection names are read with db.collection_names (system
    collections excluded). A requested name which is not among them is
    reported with a warning and left out.

    Parameters
    ----------
    db : pymongo.database.Database
      Database connection and name
    colls_requested : list
      Contains the list of requested collection names.

    Returns
    -------
    colls_existing : list
      Contains only existing collection names, in the requested order.

    Example
    -------
    check(db, ['Car'])
    []
    check(db, ['Region', 'Customer'])
    ['Region', 'Customer']
    """
    # NOTE(review): collection_names may be unavailable in newer pymongo
    # releases - confirm against the installed version.
    known = db.collection_names(include_system_collections=False)
    colls_existing = []
    logger.info('[COLLECTION] Checking collection names...')
    try:
        for requested in colls_requested:
            if requested in known:
                colls_existing.append(requested)
                continue
            logger.warn(
                " [COLLECTION] '%s' is not in the Mongo database."
                " Skipping data transfer" % requested)
    except Exception as err:
        logger.error(
            "[COLLECTION] Checking collection names failed: %s" % err)
    return colls_existing
def get_by_name_reduced(db, name, fields, size=20000):
    """
    Gets the documents of a collection restricted to specific fields,
    sorted by '$natural' descending and fetched in batches.

    Parameters
    ----------
    db : pymongo.database.Database
      Database connection and name
    name : string
      Name of collection.
    fields : list
      Names of fields to include in the query.
    size : int
      Batch size handed to the cursor.

    Returns
    -------
    docs : pymongo.cursor.Cursor
      An empty list when loading failed (the error is logged).

    Example
    -------
    get_by_name_reduced(db, 'Car', ['_id', 'type', 'nfOfSeats'])

    TODO
    ----
    - let the user decide batch size
    """
    docs = []
    try:
        logger.info('[COLLECTION] Loading data from collection %s...' % name)
        collection = db[name]
        # the projection tells MongoDB which fields to return
        projection = {field: 1 for field in fields}
        cursor = collection.find({}, projection)
        cursor = cursor.sort('$natural', pymongo.DESCENDING)
        docs = cursor.batch_size(size)
    except Exception as err:
        logger.error(
            "[COLLECTION] Loading data from collection %s failed."
            " Details: %s" % (name, err))
    return docs