class Transformer:
    """Merges user records into their orders according to a strategy."""

    # Strategies: drop orders whose user_id has no matching user record,
    # or keep such orders un-enriched.
    AVAILABLE_STRATEGIES = [
        'remove_undefined_user_orders',
        'keep_undefined_user_orders',
    ]

    def __init__(self, worker_name):
        self.logger = Logger(worker_name)

    def merge_users_and_orders(self, users, orders,
                               strategy='remove_undefined_user_orders'):
        """Return order dicts enriched with 'user_'-prefixed user fields.

        Args:
            users: iterable of dicts, each with a 'user_id' key.
            orders: iterable of dicts, each with a 'user_id' key.
            strategy: one of AVAILABLE_STRATEGIES.

        Returns:
            List of new dicts: each matched order's keys plus the matching
            user's fields renamed 'user_<field>' ('user_id' itself dropped).

        Raises:
            ValueError: unknown strategy (logged before re-raising).

        Unlike the previous revision, the input dicts are never mutated.
        """
        try:
            if strategy not in self.AVAILABLE_STRATEGIES:
                # ValueError is a subclass of Exception, so callers that
                # caught the old bare Exception still work.
                raise ValueError('Undefined merging strategy')

            # Index users by id, prefixing every remaining field. Building
            # fresh dicts replaces the old `del user['user_id']`, which
            # destructively edited the caller's data.
            users_by_id = {
                user['user_id']: {
                    f'user_{field}': value
                    for field, value in user.items()
                    if field != 'user_id'
                }
                for user in users
            }

            user_orders = []
            for order in orders:
                user_data = users_by_id.get(order['user_id'])
                # `is not None` (not truthiness): a user with only a
                # user_id yields an empty dict and must still match.
                if user_data is not None:
                    # Merge into a copy so the input order is untouched.
                    user_orders.append({**order, **user_data})
                elif strategy == 'keep_undefined_user_orders':
                    user_orders.append(dict(order))
            return user_orders
        except Exception as error:
            self.logger.error(error)
            raise
class Loader:
    """Persists merged user-order rows into PostgreSQL."""

    def __init__(self, worker_name):
        self.logger = Logger(worker_name)

    def load(self, user_orders):
        """Bulk-insert *user_orders* using the project's insert statement.

        Args:
            user_orders: sequence of rows accepted by cursor.executemany
                for the SQL returned by getInsert().

        Raises:
            The underlying psycopg2 error, after logging 'PostgreSQL error'.

        The connection is always closed; the cursor is now closed even when
        executemany raises (the previous revision leaked it on error).
        """
        connection = None
        sql_command = getInsert()
        try:
            connection = psycopg2.connect(
                user=Config.POSTGRES_USER,
                password=Config.POSTGRES_PASSWORD,
                host=Config.POSTGRES_HOST,
                port=Config.POSTGRES_PORT,
                database=Config.POSTGRES_DATABASE,
            )
            # psycopg2 cursors are context managers: closed on success
            # and on failure alike.
            with connection.cursor() as cursor:
                cursor.executemany(sql_command, user_orders)
            connection.commit()
        except (Exception, psycopg2.DatabaseError) as error:
            self.logger.error('PostgreSQL error')
            # bare `raise` preserves the original traceback
            raise
        finally:
            if connection is not None:
                connection.close()
class Extractor:
    """Reads users and incrementally-updated orders from MongoDB.

    The last successful extraction time per collection is tracked in a
    small pickled registry file so that subsequent runs return only
    documents updated since the previous run.
    """

    LAST_EXECUTION_TIME_FIELD = 'last_execution_time'
    client = None
    db = None

    def __init__(self, worker_name):
        self.logger = Logger(worker_name)
        self.connect()

    def connect(self):
        """Open MongoDB client/database handles from Config settings."""
        try:
            self.client = MongoClient(Config.MONGODB_HOST, Config.MONGODB_PORT)
            self.db = self.client[Config.MONGODB_DB]
        except Exception as error:
            self.logger.error('MongoDB connection error')
            # bare `raise` preserves the original traceback
            raise

    def extract_users(self, collection_name):
        """Return a cursor over user documents (Mongo's _id excluded).

        NOTE(review): the filter excludes the literal string 'null', not a
        JSON null/None value — confirm that is how missing ids are stored.
        """
        valid_users_data_filter = {'user_id': {'$ne': 'null'}}
        collection = self.db[collection_name]
        return collection.find(valid_users_data_filter, {'_id': 0})

    def extract_orders(self, collection_name):
        """Return a cursor over orders updated since the last recorded run.

        On the first run (no registry file yet) everything up to 'now' is
        returned. When at least one document matches, the registry file is
        advanced to the current timestamp.
        """
        # Same string-'null' caveat as extract_users.
        valid_orders_data_filter = {'user_id': {'$ne': 'null'}}
        # Timestamps are stored and compared as strings; str(datetime)
        # keeps a fixed format so lexicographic order == chronological.
        current_execution_timestamp = str(datetime.now())
        collection = self.db[collection_name]
        collection_registry_path = self.get_collection_registry_path(
            collection_name)

        if self.exists_collection_registry_file(collection_registry_path):
            collection_registry = self.read_collection_registry(
                collection_registry_path)
            last_execution_timestamp = collection_registry.get(
                self.LAST_EXECUTION_TIME_FIELD)
            # Window: strictly after the last run, up to and including now.
            documents_filter = {
                '$and': [
                    {'updated_at': {'$gt': last_execution_timestamp}},
                    {'updated_at': {'$lte': current_execution_timestamp}},
                ]
            }
        else:
            documents_filter = {
                'updated_at': {'$lte': current_execution_timestamp}
            }

        search_filter = {**valid_orders_data_filter, **documents_filter}
        result_cursor = collection.find(search_filter, {'_id': 0})
        # NOTE(review): Cursor.count() was removed in PyMongo 4; on a driver
        # upgrade switch to collection.count_documents(search_filter).
        if result_cursor.count():
            # Only advance the checkpoint when something was extracted.
            self.update_last_execution_timestamp(collection_registry_path,
                                                 current_execution_timestamp)
        return result_cursor

    def get_collection_registry_path(self, collection_name):
        """Registry file path for *collection_name*."""
        return f'{Config.REGISTRY_FILES_FOLDER}/{collection_name}.dict'

    def exists_collection_registry_file(self, collection_registry_path):
        """True if a registry file already exists at the given path."""
        return os.path.isfile(collection_registry_path)

    def read_collection_registry(self, collection_registry_path):
        """Load the pickled registry dict from disk.

        NOTE(review): pickle is unsafe on untrusted data; these files are
        only ever written by write_collection_registry below.
        """
        with open(collection_registry_path, 'rb') as file:
            return pickle.load(file)

    def write_collection_registry(self, collection_registry_path,
                                  collection_registry):
        """Persist the registry dict to disk with pickle."""
        with open(collection_registry_path, 'wb') as file:
            pickle.dump(collection_registry, file)

    def update_last_execution_timestamp(self, collection_registry_path,
                                        current_execution_timestamp):
        """Record *current_execution_timestamp* as the last successful run."""
        collection_registry = {
            self.LAST_EXECUTION_TIME_FIELD: current_execution_timestamp
        }
        self.write_collection_registry(collection_registry_path,
                                       collection_registry)