def table_track(self, coll_map_cur, coll_map_new):
    """
    Starts tracking (data transfer) for every collection that appears in
    the new collection map but not in the current one, updating the
    extractor object's collection definitions along the way.

    Parameters
    ----------
    coll_map_cur : list : current collection map
    coll_map_new : list : new collection map

    TODO: take care of extra props type (JSONB)
    """
    logger.info("%s Adding new collection" % CURR_FILE)
    names_tracked = {entry[1] for entry in coll_map_cur}
    for entry in coll_map_new:
        name_coll = entry[1]
        if name_coll in names_tracked:
            continue
        # entry layout: [.., collection_name, relation_name, columns, ..]
        self.coll_def[name_coll] = {
            ':columns': entry[3],
            ':meta': {
                ':table': entry[2],
                ':extra_props': 'JSONB',
            },
        }
        self.transfer_coll(name_coll)
def drop_type_notification(db, name):
    """
    Removes the notification-emitting function from Postgres.

    The dropped function is the one that sends a message to channel
    'purr' signalizing a type change.

    Parameters
    ----------
    db : obj : Postgres connection object
    name : sting : name of the function

    Returns
    -------
    -

    Example
    -------
    drop_notification(db, function_name)
    """
    query = "DROP FUNCTION IF EXISTS %s();" % name
    try:
        db.execute_cmd(query)
        logger.info("Dropping procedure: %s" % name, CURR_FILE)
    except Exception as ex:
        logger.error("Dropping procedure failed: %s" % ex, CURR_FILE)
def log_tailed_docs(pg, schema, docs_useful, ids_log, table_name, oper, merged):
    """
    Builds one log row per tailed document and writes the batch through
    transfer_info.log_rows.

    Parameters
    ----------
    pg : obj : Postgres connection object
    schema : string : name of the schema in Postgres
    docs_useful : list : prepared documents (entries may be None)
    ids_log : list : object ids corresponding to the documents
    table_name : string : target relation name
    oper : string : operation type
    merged : bool : whether similar documents were merged
    """
    log_entries = []
    ts = time.time()
    logger.info("IDs: %s" % ids_log)
    if len(ids_log) != len(docs_useful) and oper != DELETE:
        logger.error("n(ids)=%s; n(docs_useful)=%s" %
                     (len(ids_log), len(docs_useful)))
    # NOTE(review): when ids_log is shorter than docs_useful the lookup
    # below raises an uncaught IndexError — confirm callers guarantee
    # len(ids_log) >= len(docs_useful).
    for i in range(len(docs_useful)):
        doc_id = ids_log[i]
        doc = "no entry"
        try:
            if oper != DELETE and docs_useful[i] is not None:
                doc = str(docs_useful[i])
            else:
                doc = "Doc is NULL"
        except Exception as ex:
            logger.error(
                "%s Converting log entry failed. Details: %s\n Document: "
                % (CURR_FILE, ex))
            logger.error(docs_useful[i])
        log_entries.append((oper, table_name, doc_id, ts, merged, doc))
    try:
        transfer_info.log_rows(pg, schema, log_entries)
    except Exception as ex:
        logger.error("%s Logging failed. Details: %s" % (CURR_FILE, ex))
def delete(db, schema, table_name, ids):
    """
    Deletes rows in a specific table of the PG database.

    Parameters
    ----------
    table_name : string
    ids : list of hex encoded ObjectId strings
        (obtained with str(object_id))

    Returns
    -------
    -

    Example
    -------
    delete(db, 'public', 'employee', "5acf593eed101e0c1266e32b")
    """
    # NOTE(review): the statement is assembled via string interpolation;
    # an id containing a quote would break or inject SQL. Consider a
    # parameterized query if ids can come from untrusted input.
    joined_ids = "','".join(ids)
    cmd = "DELETE FROM %s.%s WHERE id IN ('%s');" % (
        schema, table_name.lower(), joined_ids)
    logger.info("[ROW] %s" % cmd)
    db.execute_cmd(cmd)
def create_oplog_table(db, schema='public'):
    """
    Creates the table that logs the operation, relation name, object id
    and timestamp for each entry of the oplog.

    Parameters
    ----------
    db: connection obj
    schema: name of the schema in Postgres

    Returns
    -------
    -

    Example
    -------
    create_oplog_table(pg, 'purr')
    """
    table_name = "purr_oplog"
    attrs = table_desc[table_name]["attrs"]
    types = table_desc[table_name]["types"]
    pks = table_desc[table_name]["pks"]
    # (removed an unused `values` local: nothing was ever inserted here)
    try:
        # recreate from scratch: drop first, then create with primary keys
        table.drop(db, schema, [table_name])
        table.create(db, schema, table_name, attrs, types, pks)
        logger.info("[TRANSFER INFO] Created table %s." % (table_name))
    except Exception as ex:
        logger.error(
            "[TRANSFER_INFO] Failed to create table %s: %s" % (table_name, ex))
def transfer(self, coll_names_in_config):
    """
    Transfers documents or whole collections if the number of fields is
    less than 30 000 (batch_size).

    Types of attributes are determined using the collections.yml file.

    Parameters
    ----------
    coll_names_in_config : list : list of collection names

    Returns
    -------
    -
    """
    coll_names = collection.check(self.mdb, coll_names_in_config)
    if not coll_names:
        logger.info('%s No collections to transfer.' % CURR_FILE)
        return
    # tables in Postgres are named after the snake_cased collection names
    relation_names = [tc.snake_case(coll) for coll in coll_names]
    if self.drop:
        table.drop(self.pg, self.schema, relation_names)
    elif self.truncate:
        # BUG FIX: this branch previously passed the raw collection names;
        # the tables were created under the snake_cased relation names,
        # so truncation must target those (same as the drop branch).
        table.truncate(self.pg, self.schema, relation_names)
    schema.create(self.pg, self.schema)
    for coll in coll_names:
        self.transfer_coll(coll)
def create_transfer_stats_table(db, schema='public'):
    """
    Creates the table that logs the number, relation name and timestamp
    for each collection transfer.

    Parameters
    ----------
    db: connection obj
    schema: name of the schema in Postgres

    Returns
    -------
    -

    Example
    -------
    create_transfer_stats_table(pg, 'purr')
    """
    table_name = "purr_transfer_stats"
    attrs = table_desc[table_name]["attrs"]
    types = table_desc[table_name]["types"]
    # (removed an unused `values` local: nothing was ever inserted here)
    try:
        table.create(db, schema, table_name, attrs, types)
        logger.info("[TRANSFER INFO] Created table %s." % (table_name))
    except Exception as ex:
        logger.error(
            "[TRANSFER_INFO] Failed to create table %s: %s" % (table_name, ex))
def create_stat_table(db, schema='public'):
    """
    Creates a table that holds the timestamp of the latest successfully
    inserted item, seeding it with an initial row when it is empty.

    Parameters
    ----------
    db: connection obj
    schema: name of the schema in Postgres

    Returns
    -------
    -

    Example
    -------
    create_stat_table(pg, 'purr')
    """
    table_name = "purr_info"
    attrs = table_desc[table_name]["attrs"]
    types = table_desc[table_name]["types"]
    seed_row = [0, None, int(time.time())]
    try:
        table.create(db, schema, table_name, attrs, types)
        # only seed when no successful timestamp has been recorded yet
        ts = get_latest_successful_ts(db, schema)
        if not len(ts):
            row.insert(db, schema, table_name, attrs, seed_row)
        logger.info("[TRANSFER INFO] Created table %s." % (table_name))
    except Exception as ex:
        logger.error(
            "[TRANSFER_INFO] Failed to create table %s: %s" % (table_name, ex))
def create_log_error_table(db, schema='public'):
    """
    Creates the table that logs an error's location, message and
    timestamp when an error occurs.

    Parameters
    ----------
    db: connection obj
    schema: name of the schema in Postgres

    Returns
    -------
    -

    Example
    -------
    create_log_error_table(pg, 'purr')
    """
    table_name = "purr_error"
    attrs = table_desc[table_name]["attrs"]
    types = table_desc[table_name]["types"]
    # (removed an unused `values` local: nothing was ever inserted here)
    try:
        table.create(db, schema, table_name, attrs, types)
        logger.info("[TRANSFER INFO] Created table %s." % (table_name))
    except Exception as ex:
        logger.error(
            "[TRANSFER_INFO] Failed to create table %s: %s" % (table_name, ex))
def drop(db, schema, tables):
    """
    Drops one or more tables in the PG database.

    Parameters
    ----------
    schema : string
    tables : list

    Example
    -------
    drop(pg, 'public', ['my_table'])

    Todo
    ----
    - first check if all tables in the list exist
    """
    tables_cmd = ', '.join('%s.%s' % (schema, t.lower()) for t in tables)
    cmd = "DROP TABLE IF EXISTS %s" % (tables_cmd)
    try:
        db.execute_cmd(cmd)
        logger.info('[TABLE] Dropping table(s) %s.' % (tables_cmd))
    except Exception as ex:
        logger.error('[TABLE] %s when executing command %s.' % (ex, cmd))
def drop_trigger_type_notification(db, schema, table, name, proc):
    """
    Drops the trigger signaling type changes on the given table.

    `proc` is accepted for signature symmetry with the create call;
    the DROP statement itself does not reference it.
    """
    cmd = "DROP TRIGGER IF EXISTS %s ON %s.%s CASCADE" % (name, schema, table)
    try:
        logger.info("[TABLE] Dropping trigger '%s'" % name)
        db.execute_cmd(cmd)
    except Exception as ex:
        logger.error("[TABLE] Dropping trigger '%s' failed: %s" % (name, ex))
def create_type_notification(db, name):
    """
    Creates a Postgres trigger function that notifies channel 'purr'
    about type changes.

    Parameters
    ----------
    db : obj : Postgres connection object
    name: sting : name of the function

    Returns
    -------
    -

    Example
    -------
    create_notification(db, name_function)
    """
    cmd = """CREATE OR REPLACE FUNCTION %s()
    RETURNS TRIGGER AS $$
    BEGIN
      PERFORM pg_notify('purr', 'type_change');
      RETURN NULL;
    END;
    $$ LANGUAGE plpgsql;
    """ % name
    try:
        logger.info("Creating procedure: %s" % name, CURR_FILE)
        db.execute_cmd(cmd)
    except Exception as ex:
        logger.error("Insert failed: %s" % ex, CURR_FILE)
def vacuum(db, schema, table):
    """
    Runs VACUUM FULL ANALYZE on schema.table to reclaim space and
    refresh planner statistics.
    """
    target = "%s.%s" % (schema, table)
    cmd = "VACUUM FULL ANALYZE %s;" % target
    try:
        logger.info("[TABLE] Vacuuming table '%s'" % target)
        db.execute_cmd(cmd)
    except Exception as ex:
        logger.error("[TABLE] Vacuuming table '%s' failed: %s" % (target, ex))
def create_trigger_type_notification(db, schema, table, name, proc):
    """
    Attaches a row-level trigger to schema.table that executes procedure
    `proc` after every INSERT, UPDATE or DELETE.
    """
    cmd = """
    CREATE TRIGGER %s
    AFTER INSERT OR UPDATE OR DELETE
    ON %s.%s
    FOR EACH ROW
    EXECUTE PROCEDURE %s()
    """ % (name, schema, table, proc)
    try:
        logger.info("[TABLE] Creating trigger '%s'" % name)
        db.execute_cmd(cmd)
    except Exception as ex:
        logger.error("[TABLE] Creating trigger '%s' failed: %s" % (name, ex))
def prepare_docs_for_update(coll_settings, docs):
    """
    Normalizes a batch of oplog update documents into flat dicts ready
    for a single Postgres UPDATE statement.

    Parameters
    ----------
    coll_settings : dict : collection definition; only the ":columns"
        list (with ":source" field names) is read here
    docs : iterable : raw oplog entries; each is expected to have "o"
        (the update payload) and usually "o2" (the document selector
        with "_id") — assumed oplog shape, TODO confirm against tailer

    Returns
    -------
    (docs_useful, merge_similar) : tuple
        docs_useful : list of flattened dicts, one per distinct _id;
            fields to clear carry the sentinel value '$unset'
        merge_similar : bool : value from the LAST processed document
            only — NOTE(review): callers should confirm this is the
            intended contract, as earlier merges are not reflected
    """
    docs_useful = []
    docs_id = []
    for doc in docs:
        # It is possible that multiple versions of one document
        # exist among these documents. they must be merged so they
        # can be sent Postgres together as one entry.
        merge_similar = False
        unset = {}
        doc_useful = {}
        temp = doc["o"]
        if "o2" in doc.keys():
            if "_id" in doc["o2"].keys():
                doc_useful["_id"] = str(doc["o2"]["_id"])
                # seen this _id before in the batch -> merge instead of append
                if (doc_useful["_id"] in docs_id):
                    merge_similar = True
                else:
                    docs_id.append(str(doc_useful["_id"]))
        if "$set" in temp.keys():
            doc_useful.update(temp["$set"])
            # a $set to None is treated the same as an $unset
            for k, v in temp["$set"].items():
                if v is None:
                    unset[k] = "$unset"
        if "$unset" in temp.keys():
            for k, v in temp["$unset"].items():
                unset[k] = '$unset'
        if "$set" not in temp.keys() and "$unset" not in temp.keys():
            # case when the document was not updated
            # using a query, but the IDE e.g. Studio3T:
            logger.info("Direct update:")
            doc_useful.update(temp)
            # every configured column missing from the full replacement
            # document must be cleared in Postgres
            fields = [x[":source"] for x in coll_settings[":columns"]]
            for k in fields:
                if k == '_id':
                    # ObjectId must be stringified before it reaches PG
                    temp[k] = str(temp[k])
                    doc_useful.update(temp)
                if k not in temp.keys():
                    unset[k] = '$unset'
        for k, v in temp.items():
            if v is None:
                unset[k] = '$unset'
        # the unset sentinels override any values gathered above
        doc_useful.update(unset)
        # merging values with the same ID because there cannot be
        # multiple updates of the same row in one statement
        if merge_similar is True:
            for i in range(0, len(docs_useful)):
                if docs_useful[i]["_id"] == doc_useful["_id"]:
                    # later fields win over the earlier version's fields
                    docs_useful[i] = dict(docs_useful[i], **doc_useful)
                    break
        else:
            docs_useful.append(doc_useful)
    return docs_useful, merge_similar
def get_table(db, schema='public'):
    """
    Fetches the stored collection map from <schema>.purr_collection_map.

    Parameters
    ----------
    db : obj : Postgres connection object
    schema : string : name of the schema in Postgres

    Returns
    -------
    coll_map : list : rows (id, collection_name, relation_name, types)
        ordered by id, or None when the query fails
    """
    cmd = """SELECT id, collection_name, relation_name, types
    FROM %s.purr_collection_map ORDER BY id""" % (schema)
    try:
        coll_map = db.execute_cmd_with_fetch(cmd)
        logger.info("Getting schema from DB.", CURR_FILE)
        return coll_map
    except Exception as ex:
        # BUG FIX: the original format string had no %s placeholder, so
        # applying % (ex) raised a TypeError inside this handler
        logger.error(
            "[TRANSFER_INFO] Failed to get collection map table: %s" % (ex))
def reset(db, schema='public'):
    """
    Resets an existing schema (drop with CASCADE, then recreate), or
    creates a new one if it does not exist yet.
    """
    cmds = (
        'DROP SCHEMA IF EXISTS %s CASCADE;' % schema,
        'CREATE SCHEMA %s;' % schema,
    )
    try:
        for cmd in cmds:
            db.execute_cmd(cmd)
        logger.info("[SCHEMA] Schema %s is reset." % schema)
    except Exception as ex:
        logger.error("[SCHEMA] Schema reset failed. %s" % ex)
def generate_collection_map(settings_mdb):
    """
    Connects to MongoDB, derives the collection map for the configured
    database and writes it to the collection map file.

    Parameters
    ----------
    settings_mdb : dict : MongoDB connection settings; must contain
        "db_name"

    TODO:
    - add docs
    """
    logger.info("Starting Purrito v%s ... =^..^=" % get_version(), CURR_FILE)
    logger.info("PID=%s" % os.getpid(), CURR_FILE)
    mongo = mongodb.MongoConnection(settings_mdb)
    try:
        coll_map = cm.create_map(mongo.conn, settings_mdb["db_name"])
        cm.create_file(coll_map)
    finally:
        # always release the Mongo connection, even when map creation or
        # file writing fails (previously the disconnect was skipped)
        mongo.disconnect()
def create_file(coll_map):
    """
    Writes the collection map to 'collections.yml' in the working
    directory, overwriting any existing file.
    """
    path = "collections.yml"
    try:
        logger.info("%s Creating collection map file..." % CURR_FILE)
        with open(path, "w") as file_out:
            yaml.dump(coll_map, file_out, default_flow_style=False)
        logger.info("Collection map file created: %s" % path, CURR_FILE)
    except Exception as ex:
        logger.error("Failed to create collection map file. Details: %s" % ex,
                     CURR_FILE)
def table_untrack(self, coll_map_cur, coll_map_new):
    """
    Stops syncing every collection whose relation is present in the
    current collection map but absent from the new one, removing its
    definition from the extractor.
    """
    relations_remaining = {entry[2] for entry in coll_map_new}
    colls_to_remove = [entry[1] for entry in coll_map_cur
                       if entry[2] not in relations_remaining]
    logger.info("%s Stop syncing collections %s." % (
        CURR_FILE, ", ".join(colls_to_remove)))
    for coll in colls_to_remove:
        # pop with default: tolerate collections already removed
        self.coll_def.pop(coll, None)
def handle_multiple(self, docs, updated_at):
    """
    Groups change-stream events by namespace, converts each into the
    internal oplog-like dict format and loads each group into Postgres;
    periodically refreshes the latest-successful timestamp.

    Parameters
    ----------
    docs : list : raw change stream event documents
    updated_at : datetime : time of the last successful-ts refresh
        (naive UTC assumed — TODO confirm callers pass utcnow-based values)
    """
    # group by name
    docs_grouped = {}
    for doc in docs:
        collection = doc["ns"]["db"] + "." + doc["ns"]["coll"]
        if collection not in docs_grouped.keys():
            docs_grouped[collection] = []
        useful_info_update = {}
        if doc["operationType"] == UPDATE:
            if "updateDescription" in doc.keys():
                useful_info_update = doc["updateDescription"]
        elif doc["operationType"] == INSERT:
            if "fullDocument" in doc.keys():
                useful_info_update = doc["fullDocument"]
        else:
            # NOTE(review): change stream events do not normally carry an
            # "o2" field — confirm which operations reach this branch
            useful_info_update = doc["o2"]
        set_dict = {}  # NOTE(review): never read — candidate for removal
        # normalized, oplog-style representation consumed downstream
        d = {
            "op": doc["operationType"],
            "db_name": doc["ns"]["db"],
            "coll_name": doc["ns"]["coll"],
            "o": doc["documentKey"],
            "_id": doc["documentKey"]["_id"],
            "o2": useful_info_update
        }
        if doc["operationType"] == UPDATE:
            # for updates, "o" carries the $set payload plus the _id
            d["o"]["$set"] = useful_info_update['updatedFields']
            d["o"]["_id"] = doc["documentKey"]["_id"]
        if doc["operationType"] == INSERT:
            # for inserts, "o" is the full document
            d["o"] = useful_info_update
            d["o2"]["_id"] = doc["documentKey"]["_id"]
        docs_grouped[collection].append(d)
    for coll, docs_details in docs_grouped.items():
        self.transform_and_load_many(docs_details)
    # every 5 minutes update the timestamp because we need to continue
    # tailing in case of disconnecting from the PGDB
    diff = datetime.utcnow() - updated_at
    # NOTE(review): .seconds ignores whole days and the % 60 wraps past
    # one hour (e.g. 61 min -> 1 min), so the refresh can be skipped for
    # long gaps; consider diff.total_seconds() // 60 — TODO confirm intent
    minutes_between_update = (diff.seconds // 60) % 60
    if minutes_between_update > 5:
        t = int(datetime.utcnow().timestamp())
        transfer_info.update_latest_successful_ts(
            self.pg, self.schema, t)
        logger.info("%s Updated latest_successful_ts: %d" % (CURR_FILE, t))
def create_map(mongo, name_db):
    """
    Builds the collection map for a Mongo database: for every collection
    it samples documents, infers a Postgres type per field and records
    the column definitions plus relation metadata.

    Parameters
    ----------
    mongo : pymongo database connection
    name_db : string : name of the Mongo database

    Returns
    -------
    coll_map : dict : {db_name: {coll_name: {":columns": [...],
        ":meta": {":table": ..., ":extra_props": 'JSONB'}}}}
    """
    coll_map = {name_db: {}}
    colls = collection.get_all(mongo)
    for coll in sorted(colls):
        logger.info('Determining types for collection %s...' %
                    coll, CURR_FILE)
        # TODO: replace snake_case to another file e.g util
        name_relation = tc.snake_case(coll)
        coll_map[name_db][coll] = {
            ":columns": [],
            ':meta': {
                ':table': name_relation,
                ':extra_props': 'JSONB'
            }
        }
        docs = collection.get_docs_for_type_check(mongo, coll)
        logger.info("Reading samples...", CURR_FILE)
        types = get_types(docs)
        # TODO: handle None
        for field, value in types.items():
            type_chosen = "text"
            # BUG FIX: the original tested len(field) > 1 — the length of
            # the field NAME — instead of the number of candidate types.
            if len(value) > 1:
                total = docs.count()
                best_perc = 0  # renamed from `max`/`sum`: don't shadow builtins
                for k, v in value.items():
                    if k is None:
                        continue
                    curr_perc = v / total
                    if curr_perc > 0 and curr_perc > best_perc:
                        best_perc = curr_perc
                        type_chosen = k
            else:
                # there is exactly one key which will be the chosen type;
                # guard against a lone None key (fall back to "text")
                only_type = list(value.keys())[0]
                if only_type is not None:
                    type_chosen = only_type
            name_column = tc.snake_case(field)
            def_column = {
                name_column: None,
                ":source": field,
                ":type": type_chosen.upper()
            }
            coll_map[name_db][coll][":columns"].append(def_column)
    return coll_map
def __init__(self, conn_details, ttw=1): logger.info("Connecting to %s" % conn_details, CURR_FILE) # time to wait before attempt to reconnect self.ttw = ttw self.conn_details = conn_details self.cmd_latest = None self.values_latest = None self.function_latest = None self.query_failed = False if ttw == 1: self.attempt_to_reconnect = False try: self.conn = psycopg2.connect(self.conn_details) self.conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) self.cur = self.conn.cursor() logger.info("Connected to Postgres.", CURR_FILE) self.ttw = 1 if self.query_failed is True: """ the latest command should be repeated because Postgres was disconnected and it is likely that it was during a command """ self.query_failed = False if self.function_latest is FunctionLatest.EXECUTE: self.execute_cmd(self.cmd_latest, self.values_latest) elif self.function_latest is FunctionLatest.EXECUTE_WITH_FETCH: self.execute_cmd_with_fetch(self.cmd_latest, self.values_latest) elif self.function_latest is FunctionLatest.EXECUTE_MANY: self.execute_many_cmd(self.cmd_latest, self.values_latest) except Exception as ex: self.attempt_to_reconnect = True msg = """ Could not connect to Postgres. Reconnecting in %s seconds... Details: %s """ % (self.ttw, ex) self.log_error_in_pg(msg) time.sleep(self.ttw) self.__init__(self.conn_details, self.ttw * 2)
def check(db, colls_requested):
    """
    Checks if requested collections exist in the database.

    Gets all collection names from MongoDB and returns a new list
    containing only the requested names that actually exist.

    Parameters
    ----------
    db : pymongo.database.Database
        Database connection and name
    colls_requested : list
        Contains the list of requested collection names.

    Returns
    -------
    colls_existing : list
        Contains only existing collection names.

    Example
    -------
    check(db, ['Car'])
    []
    check(db, ['Region', 'Customer']
    ['Region', 'Customer']
    """
    colls_name = db.collection_names(include_system_collections=False)
    colls_existing = []
    logger.info('[COLLECTION] Checking collection names...')
    try:
        for coll in colls_requested:
            if coll in colls_name:
                colls_existing.append(coll)
            else:
                # requested collection is missing: warn and skip it
                logger.warn("""
                [COLLECTION] '%s' is not in the Mongo database.
                Skipping data transfer""" % coll)
    except Exception as ex:
        logger.error("[COLLECTION] Checking collection names failed: %s" % ex)
    return colls_existing
def get_by_name_reduced(db, name, fields, size=20000):
    """
    Gets data from a collection limited by batch size, containing only
    specific fields, in reverse natural order.

    Parameters
    ----------
    db : pymongo.database.Database
        Database connection and name
    name : string
        Name of collection.
    fields : list
        Names of fields to include in the query.
    size : int
        Batch size for the returned cursor.

    Returns
    -------
    docs : pymongo.cursor.Cursor

    Example
    -------
    get_by_name(db, 'Car', ['_id', 'type', 'nfOfSeats'])

    TODO
    ----
    - let the user decide batch size
    """
    docs = []
    try:
        logger.info('[COLLECTION] Loading data from collection %s...' % name)
        # projection document: tells MongoDB which fields to return
        projection = {field: 1 for field in fields}
        cursor = db[name].find({}, projection).sort(
            '$natural', pymongo.DESCENDING)
        docs = cursor.batch_size(size)
    except Exception as ex:
        logger.error("""[COLLECTION] Loading data from collection %s failed.
        Details: %s""" % (name, ex))
    return docs
def run_collection(conn, coll, stop):
    """
    Watches one collection's change stream and pushes every matching
    event onto the shared queue until `stop()` returns True.

    Parameters
    ----------
    conn : pymongo client
    coll : string : collection name to watch
    stop : callable : returns True when the thread should terminate
    """
    pipeline = [{
        '$match': {
            'operationType': {
                '$in': [UPDATE, INSERT, DELETE]
            }
        }
    }]
    # NOTE(review): database name "booster" is hard-coded here — confirm
    # it should not come from configuration
    cursor = conn["booster"][coll].watch(pipeline)
    for doc in cursor:
        col = doc["ns"]["db"] + "." + doc["ns"]["coll"]
        op = doc["operationType"]
        if op in [INSERT, UPDATE, DELETE]:
            DATA_QUEUE.put(doc)
        # stop flag is checked after each delivered event
        if stop():
            break
    logger.info("%s Thread %s stopped." % (CURR_FILE, coll))
    cursor.close()
def get_column_names_and_types(db, schema, table):
    """
    Gets column names and column types of a specific table from
    information_schema.

    Parameters
    ----------
    schema : str
    table : str

    Returns
    -------
    Rows of (column_name, data_type), or None when the query fails.
    """
    cmd = """
    SELECT column_name, data_type FROM information_schema.columns
    WHERE table_schema='%s' AND table_name = '%s';
    """ % (schema, table.lower())
    logger.info("[TABLE] Checking columns and types for table %s.%s" %
                (schema, table))
    try:
        return db.execute_cmd_with_fetch(cmd)
    except Exception as ex:
        logger.error('[TABLE] %s when executing command %s.' % (ex, cmd))
def convert_columns(self, name_table, source, fields_cur, fields_new):
    """
    (1) Tries to convert the column
    (2) TODO: If (1) was not successful (PG could not convert the
    column), just rename it and add the column again so Purr can take
    care of it

    Parameters
    ----------
    name_table : string : relation whose columns are converted
    source : container : source field names whose type changed
    fields_cur : list : current column definitions
    fields_new : list : new column definitions
        NOTE(review): fields_cur[i] is paired with fields_new[i] by
        position — confirm both lists are guaranteed to be aligned and
        of equal length.
    """
    for i in range(0, len(fields_new)):
        field = fields_new[i]
        if field[":source"] in source:
            # the column name is the key whose value is None in the
            # column definition dict (see create_map's def_column)
            for column, v in field.items():
                if v is None:
                    type_old = fields_cur[i][":type"]
                    type_new = field[":type"]
                    if tc.is_convertable(type_old, type_new):
                        logger.info(
                            """%s table %s, column %s:
                            Type [%s] is convertable to [%s]""" % (
                                CURR_FILE,
                                name_table,
                                column,
                                type_old,
                                type_new
                            ))
                        table.column_change_type(
                            self.pg, self.schema, name_table, column,
                            type_new)
                    else:
                        logger.error("""
                            %s In table %s, column %s:
                            Type [%s] is NOT convertable to [%s]
                            """ % (
                            CURR_FILE,
                            name_table,
                            column,
                            type_old,
                            type_new))
def get_docs_for_type_check(db, name, nr_of_docs=100):
    """
    Gets a limited sample of documents from a collection for type
    inference, newest first.

    Parameters
    ----------
    db : pymongo.database.Database
        Database connection
    name : string
        Name of collection.
    nr_of_docs : integer
        Number of documents to return

    Returns
    -------
    docs : pymongo.cursor.Cursor

    Example
    -------
    get_docs_for_type_check(db, 'Car')

    TODO
    ----
    - let the user decide batch size
    """
    docs = []
    try:
        logger.info('[COLLECTION] Loading data from collection %s...' % name)
        docs = (db[name]
                .find()
                .sort('$natural', pymongo.DESCENDING)
                .skip(0)
                .limit(nr_of_docs))
    except Exception as ex:
        logger.error("""
        [COLLECTION] Loading data from collection %s failed.
        Details: %s""" % (name, ex))
    return docs
def create_table(db, coll_map, schema='public'):
    """
    (Re)creates the purr_collection_map table, populates it from the
    collection map and wires up the type-change notification trigger.

    Parameters
    ----------
    db : connection obj
    coll_map : collection map used to populate the table
    schema : name of the schema in Postgres

    Example
    -------
    create_table(pg, 'purr')
    """
    table_name = "purr_collection_map"
    columns = [
        ("id", "integer"),
        ("collection_name", "text"),
        ("relation_name", "text"),
        ("types", "jsonb[]"),
        ("updated_at", "timestamp"),
        ("query_update", "text"),
    ]
    attrs = [col for col, _ in columns]
    types = [typ for _, typ in columns]
    try:
        # TODO: make this fucntion accept string and list
        table.drop(db, schema, [table_name])
        table.create(db, schema, table_name, attrs, types)
        logger.info("Created table %s.%s." % (schema, table_name), CURR_FILE)
    except Exception as ex:
        logger.error(
            """
            Failed to create table %s.%s: %s
            """ % (schema, table_name, ex), CURR_FILE)
    populate_table(db, coll_map, table_name, attrs, schema)
    # recreate the notification plumbing: procedure first, then trigger
    procedure_name = 'notify_type'
    procedure.drop_type_notification(db, procedure_name)
    procedure.create_type_notification(db, procedure_name)
    table.drop_trigger_type_notification(
        db, 'public', 'purr_collection_map', 'notify', procedure_name)
    table.create_trigger_type_notification(
        db, 'public', 'purr_collection_map', 'notify', procedure_name)