import ast
import csv
import enum
import functools
import io
import logging
import os
import time
from datetime import datetime

from sqlalchemy import Boolean, JSON, MetaData, create_engine, inspect
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.schema import CreateSchema, DropSchema
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload

# Project-internal names used below (FileSystem, RemoteStorage, Storage,
# GoogleDriveUtility, AnsibleVault, AnsibleVaultError, APP, DEBUG_SQL, modules)
# are assumed to be importable from the surrounding lation packages.


def create_logger():
    lation_logger = logging.getLogger()
    lation_logger.setLevel(logging.INFO)
    fs = FileSystem()
    dir_path = fs.create_directory(['logs'])

    # file handler
    filename = '{:%Y-%m-%d %H-%M-%S.%f}.log'.format(datetime.now())
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler = logging.FileHandler(os.path.join(dir_path, filename), 'a', 'utf-8')
    file_handler.setFormatter(formatter)
    lation_logger.addHandler(file_handler)

    # console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(formatter)
    lation_logger.addHandler(console_handler)

    return lation_logger
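# Usage sketch for create_logger (an assumption about intended use, since it
# configures the root logger): call it once at process start-up and reuse the
# returned logger everywhere else via logging.getLogger().
def _example_logging_usage():
    logger = create_logger()
    logger.info('application started')  # written to logs/<timestamp>.log and to the console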
class Vault():

    @staticmethod
    def to_encrypted_name(name):
        return f'{name}.encrypted'

    @staticmethod
    def to_decrypted_name(name):
        return '.'.join(name.split('.')[:-1])

    def __init__(self, password):
        self.ansible_vault = AnsibleVault(password)
        self.fs = FileSystem()

    def encrypt(self, src, dest=None):
        srcs = self.fs.deserialize_name(src)
        assert self.fs.is_file(srcs), 'src should be a file'
        if not dest:
            dests = srcs.copy()
            dests[-1] = Vault.to_encrypted_name(srcs[-1])
            dest = self.fs.serialize_name(dests)
        with open(src, 'r') as input_file:
            raw_data = input_file.read()
        with open(dest, 'wb') as output_file:
            self.ansible_vault.dump(raw_data, output_file)

    def decrypt(self, src, dest=None):
        srcs = self.fs.deserialize_name(src)
        assert self.fs.is_file(srcs), 'src should be a file'
        if not dest:
            dests = srcs.copy()
            dests[-1] = Vault.to_decrypted_name(srcs[-1])
            dest = self.fs.serialize_name(dests)
        with open(src, 'r') as input_file:
            encrypted_data = input_file.read()
        try:
            decrypted_data = self.ansible_vault.load(encrypted_data)
        except AnsibleVaultError as e:
            raise Exception('Decrypt failed') from e
        with open(dest, 'w') as output_file:
            output_file.write(decrypted_data)
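# Usage sketch for Vault. The password and file path below are placeholder
# assumptions; encrypt() derives `<name>.encrypted` when dest is omitted, and
# decrypt() strips that suffix again.
def _example_vault_usage():
    vault = Vault('example-password')              # hypothetical password
    vault.encrypt('config/secrets.yml')            # -> config/secrets.yml.encrypted
    vault.decrypt('config/secrets.yml.encrypted')  # -> config/secrets.yml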
class Database():

    @staticmethod
    def get_metadata():
        return MetaData(schema=APP)

    def __init__(self, url=None, dialect=None, driver=None, username=None, password=None,
                 host=None, port=None, database=None, model_agnostic=False):
        from lation.core.orm import Base
        from lation.modules.base.models.lation_data import LationData
        if not url:
            url = f'{dialect}+{driver}://{username}:{password}@{host}:{port}/{database}'
        echo = bool(DEBUG_SQL)
        self.engine = create_engine(url, pool_size=1, echo=echo, logging_name='lation.engine')
        SessionFactory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False)
        self.Session = scoped_session(SessionFactory)
        self.model_agnostic = model_agnostic
        existing_metadata = Database.get_metadata()
        existing_metadata.reflect(bind=self.engine, schema=APP)
        self.existing_metadata = existing_metadata
        self.metadata = Base.metadata
        self.fs = FileSystem()
        if not echo:
            self.logger = create_logger()
        self.Base = Base
        self.LationData = LationData
        self.lation_id_map = {}

    def get_session(self):
        session = self.Session()
        # close() resets any state left over from a previous use of the
        # thread-local session; it stays usable and begins a new transaction
        # on next access
        session.close()
        return session

    @functools.lru_cache()
    def find_tablename_by_file_path(self, file_path):
        filename = os.path.basename(file_path)
        table_name = os.path.splitext(filename)[0]
        return table_name

    # def find_table_by_file_path(self, file_path):
    #     table_name = self.find_tablename_by_file_path(file_path)
    #     return self.metadata.tables[f'{APP}.{table_name}']

    @functools.lru_cache()
    def find_model_class_by_tablename(self, tablename):
        for model_class in self.Base._decl_class_registry.values():
            if hasattr(model_class, '__tablename__') and model_class.__tablename__ == tablename:
                return model_class
        return None

    def is_json_column(self, column):
        return isinstance(column.type, JSON)

    def is_json_attribute(self, attribute):
        return isinstance(attribute.columns[0].type, JSON)

    def is_boolean_attribute(self, attribute):
        return isinstance(attribute.columns[0].type, Boolean)

    def export(self, dir_path):
        self.fs.create_directory(self.fs.deserialize_name(dir_path))
        for table in self.metadata.sorted_tables:
            self.logger.info(f'EXPORT TABLE `{table.name}`...')
            file_path = os.path.join(dir_path, f'{table.name}.csv')
            with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
                column_names = [column.name for column in table.columns]
                writer = csv.DictWriter(csv_file, fieldnames=column_names)
                writer.writeheader()
                rows = self.engine.execute(table.select())
                for row in rows:
                    row_dict = dict(row)
                    writer.writerow(row_dict)
            self.logger.info(f'EXPORT TABLE `{table.name}` DONE')

    def drop_schema(self, schema_name):
        self.engine.execute(DropSchema(schema_name))
        self.logger.info(f'DELETE SCHEMA `{schema_name}`')

    def create_schema(self, schema_name):
        self.engine.execute(CreateSchema(schema_name))
        self.logger.info(f'CREATE SCHEMA `{schema_name}`')

    def drop_tables(self):
        self.existing_metadata.drop_all(self.engine)
        self.logger.info('DELETE ALL TABLES')

    # delete table contents one by one, in reverse dependency order;
    # the caller is responsible for committing
    def destruct_tables(self):
        session = self.get_session()
        for table in reversed(self.metadata.sorted_tables):
            session.execute(table.delete())

    def create_tables(self):
        self.metadata.create_all(self.engine)
        self.logger.info('ALL TABLES CREATED')

    def dispose(self):
        self.engine.dispose()

    def install_data(self, module_name):
        from lation.core.orm import SingleTableInheritanceMixin
        for parent_module in modules[module_name].parent_modules:
            self.install_data(parent_module.name)
        start_time = time.time()
        partial_csv_file_paths = modules[module_name].config.data
        if len(partial_csv_file_paths) == 0:
            self.logger.info(f'[{module_name}] NO DATA, SKIPPED')
            return
        self.logger.info(f'[{module_name}] INSTALL DATA...')
        session = self.get_session()
        for partial_csv_file_path in partial_csv_file_paths:
            current_table_lation_id_map = {}
            current_table_lation_id_unflushed_instance_map = {}
            csv_file_path = os.path.join('lation', 'modules', module_name, partial_csv_file_path)
            tablename = self.find_tablename_by_file_path(csv_file_path)
            model_class = self.find_model_class_by_tablename(tablename)
            if not model_class:
                raise Exception(f'Table `{tablename}` does not exist')
            is_single_table_inherited = SingleTableInheritanceMixin in model_class.mro()
            inspector = inspect(model_class)
            json_type_attribute_names = [attr.key for attr in inspector.mapper.column_attrs
                                         if self.is_json_attribute(attr)]
            boolean_type_attribute_names = [attr.key for attr in inspector.mapper.column_attrs
                                            if self.is_boolean_attribute(attr)]
            self.logger.info(f'[{module_name}] INTO TABLE `{tablename}` FROM PATH `{csv_file_path}`')
            with open(csv_file_path, newline='', encoding='utf-8') as csv_file:
                reader = csv.DictReader(csv_file)
                for csv_data in reader:
                    attribute_data = {}
                    # resolve lation_id
                    lation_id = csv_data.get('lation_id')
                    if not lation_id:
                        raise Exception(f'Attribute `lation_id` is required for csv file `{csv_file_path}`')
                    lation_id_parts = lation_id.split('.')
                    if len(lation_id_parts) < 2 or lation_id_parts[0] != module_name:
                        lation_id = f'{module_name}.{lation_id}'
                    attribute_data['lation_id'] = lation_id
                    del csv_data['lation_id']
                    for attribute_name in csv_data.keys():
                        csv_value = csv_data[attribute_name]
                        attribute_name_parts = attribute_name.split('/')
                        if csv_value == '':
                            continue
                        # resolve primitive data type
                        if len(attribute_name_parts) == 1:
                            if attribute_name in json_type_attribute_names:
                                attribute_data[attribute_name] = ast.literal_eval(csv_value)
                            elif attribute_name in boolean_type_attribute_names:
                                attribute_data[attribute_name] = (csv_value.lower() == 'true')
                            else:
                                attribute_data[attribute_name] = csv_value
                        # resolve foreign key
                        elif len(attribute_name_parts) == 2 and attribute_name_parts[1] == 'fk':
                            foreign_lation_id_parts = csv_value.split('.')
                            if len(foreign_lation_id_parts) < 2:
                                foreign_lation_id = f'{module_name}.{csv_value}'
                            elif len(foreign_lation_id_parts) == 2:
                                foreign_lation_id = csv_value
                            else:
                                raise NotImplementedError
                            # foreign_instance = session.query(model_class).filter(model_class.lation_id == foreign_lation_id).one_or_none()
                            foreign_instance_id = self.lation_id_map.get(foreign_lation_id)
                            if not foreign_instance_id:
                                raise Exception(f'Foreign lation_id `{foreign_lation_id}` not found')
                            attribute_data[attribute_name_parts[0]] = foreign_instance_id
                        else:
                            raise NotImplementedError
                    # correct model_class for single table inheritance
                    if is_single_table_inherited:
                        polymorphic_identity = attribute_data.get(inspector.polymorphic_on.key)
                        model_class = inspector.polymorphic_map.get(polymorphic_identity).class_manager.class_
                    # upsert instance
                    instance = session.query(model_class).filter(model_class.lation_id == lation_id).one_or_none()
                    if instance:
                        for attribute_name in attribute_data:
                            setattr(instance, attribute_name, attribute_data[attribute_name])
                        current_table_lation_id_map[lation_id] = instance.id
                        # self.lation_id_map[lation_id] = instance.id
                    else:
                        instance = model_class(**attribute_data)
                        session.add(instance)
                        current_table_lation_id_unflushed_instance_map[lation_id] = instance
            # flush to get instance ids
            session.flush()
            # refresh id
            for lation_id in current_table_lation_id_unflushed_instance_map:
                instance = current_table_lation_id_unflushed_instance_map[lation_id]
                # self.lation_id_map[lation_id] = instance.id
                current_table_lation_id_map[lation_id] = instance.id
            self.lation_id_map.update(current_table_lation_id_map)
            # rebalance lation data
            LationData = self.LationData
            lation_data = session.query(LationData).filter(LationData.model == tablename).all()
            deleted_ids = []
            for ld in lation_data:
                if not self.lation_id_map.get(ld.model_lation_id):
                    deleted_ids.append(ld.model_id)
            if len(deleted_ids) > 0:
                self.logger.info(f'[{module_name}] DELETE IDs {deleted_ids} FROM TABLE `{tablename}`...')
                instances = session.query(model_class).filter(model_class.id.in_(deleted_ids)).all()
                for instance in instances:
                    session.delete(instance)
            session.query(LationData).filter(LationData.model == tablename).delete()
            for current_table_lation_id in current_table_lation_id_map:
                lation_data = LationData(model=tablename,
                                         model_lation_id=current_table_lation_id,
                                         model_id=current_table_lation_id_map[current_table_lation_id])
                session.add(lation_data)
            # fix postgres sequence, see <https://stackoverflow.com/a/37972960/2443984>
            if session.bind.dialect.name == 'postgresql':
                for table in inspector.tables:
                    session.execute(f'SELECT setval(pg_get_serial_sequence(\'{table.fullname}\', \'id\'), '
                                    f'coalesce(max(id)+1, 1), false) FROM {table.fullname};')
        session.flush()
        self.logger.info(f'[{module_name}] FLUSHED')
        self.logger.info(f'[{module_name}] COMMIT...')
        session.commit()
        self.logger.info(f'[{module_name}] COMMITTED')
        self.logger.info(f'[{module_name}] INSTALL DATA DONE IN {time.time() - start_time}s')

    def reset(self):
        if self.engine.dialect.has_schema(self.engine, schema=APP):
            self.drop_tables()
            self.drop_schema(APP)
        self.create_schema(APP)
        self.create_tables()
        self.install_data(APP)
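# Usage sketch for Database. The connection parameters are placeholder
# assumptions, not values shipped with the module; note that reset() is
# destructive (it drops and recreates the whole schema before reinstalling
# module data).
def _example_database_usage():
    db = Database(dialect='postgresql', driver='psycopg2',
                  username='lation', password='secret',
                  host='localhost', port=5432, database='lation')
    db.reset()            # drop + recreate schema, then install_data(APP)
    db.export('backup')   # write every table to backup/<table>.csv
    db.dispose()          # release the connection pool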
class GoogleDriveStorage(RemoteStorage):

    # https://developers.google.com/drive/api/v3/mime-types
    class MIMETypeEnum(enum.Enum):
        DOCUMENT = 'application/vnd.google-apps.document'          # Google Docs
        DRAWING = 'application/vnd.google-apps.drawing'            # Google Drawing
        FILE = 'application/vnd.google-apps.file'                  # Google Drive file
        FOLDER = 'application/vnd.google-apps.folder'              # Google Drive folder
        PRESENTATION = 'application/vnd.google-apps.presentation'  # Google Slides
        SPREAD_SHEET = 'application/vnd.google-apps.spreadsheet'   # Google Sheets
        UNKNOWN = 'application/vnd.google-apps.unknown'

    # https://developers.google.com/drive/api/v3/about-auth
    scopes = [
        'https://www.googleapis.com/auth/drive',
    ]

    file_fields = ['id', 'name', 'mimeType', 'parents']

    @staticmethod
    def is_google_doc_format(mime_type):
        return mime_type in [
            GoogleDriveStorage.MIMETypeEnum.DOCUMENT.value,
            GoogleDriveStorage.MIMETypeEnum.SPREAD_SHEET.value,
            GoogleDriveStorage.MIMETypeEnum.DRAWING.value,
            GoogleDriveStorage.MIMETypeEnum.PRESENTATION.value,
        ]

    def __init__(self, credential_path=None, **kwargs):
        if not credential_path:
            raise Exception('Credential path is required')
        super().__init__(**kwargs)
        credentials = Credentials.from_service_account_file(
            credential_path, scopes=GoogleDriveStorage.scopes)
        self.service = build('drive', 'v3', credentials=credentials, cache_discovery=False)
        self.current_working_folder_id = None
        self.fs = FileSystem()

    def _list_all_items(self, query=None):
        page_token = None
        accumulated_files = []
        if query is None:
            query = {}
        query_str = GoogleDriveUtility.get_query_str(**query)
        file_fields = ','.join(GoogleDriveStorage.file_fields)
        while True:
            response = self.service.files().list(
                q=query_str,
                fields=f'nextPageToken, files({file_fields})',
                pageToken=page_token).execute()
            files = response.get('files', [])
            page_token = response.get('nextPageToken', None)
            accumulated_files.extend(files)
            if page_token is None:
                return accumulated_files

    def _create_folder(self, folder_name, parent_folder_id=None):
        file_metadata = {
            'name': folder_name,
            'mimeType': GoogleDriveStorage.MIMETypeEnum.FOLDER.value,
            'parents': parent_folder_id if isinstance(parent_folder_id, list) else [parent_folder_id],
        }
        file_fields = ','.join(GoogleDriveStorage.file_fields)
        folder = self.service.files().create(body=file_metadata, fields=file_fields).execute()
        return folder

    def _get_file_by_name(self, file_name, parent_folder_id=None):
        files = self._list_all_items({
            'name': file_name,
            'parents': parent_folder_id,
            'not_mime_type': GoogleDriveStorage.MIMETypeEnum.FOLDER.value,
            'trashed': False,
        })
        if len(files) == 0:
            raise Exception(f'File `{file_name}` does not exist')
        elif len(files) == 1:
            return files[0]
        else:
            raise Exception(f'Detected duplicate file `{file_name}`')

    def _get_folder_by_name(self, folder_name, parent_folder_id=None, create_on_not_exist=False):
        folders = self._list_all_items({
            'name': folder_name,
            'parents': parent_folder_id,
            'mime_type': GoogleDriveStorage.MIMETypeEnum.FOLDER.value,
            'trashed': False,
        })
        if len(folders) == 0:
            if create_on_not_exist:
                folder = self._create_folder(folder_name, parent_folder_id)
                return folder
            else:
                raise Exception(f'Folder `{folder_name}` does not exist')
        elif len(folders) == 1:
            return folders[0]
        else:
            raise Exception(f'Detected duplicate folder `{folder_name}`')

    def _get_folder_by_names(self, folder_names, root_folder_id=None, create_on_not_exist=False):
        parent_folder_id = root_folder_id
        folder = None
        for folder_name in folder_names:
            folder = self._get_folder_by_name(
                folder_name,
                parent_folder_id=parent_folder_id,
                create_on_not_exist=create_on_not_exist)
            parent_folder_id = folder['id']
        return folder

    def _delete_item(self, item_id):
        self.service.files().delete(fileId=item_id).execute()

    def change_directory(self, serialized_name):
        self.current_working_folder_id = None
        if serialized_name:
            cwd = self.deserialize_name(serialized_name)
            current_working_folder = self._get_folder_by_names(cwd)
            if not current_working_folder:
                raise Exception('Base directory does not exist')
            self.current_working_folder_id = current_working_folder['id']

    def to_remote_mime_type(self, local_mime_type):
        return local_mime_type

    # https://developers.google.com/drive/api/v3/ref-export-formats
    def to_local_mime_type(self, remote_mime_type):
        if remote_mime_type == GoogleDriveStorage.MIMETypeEnum.DOCUMENT.value:
            return Storage.MIMETypeEnum.TEXT.value
        elif remote_mime_type == GoogleDriveStorage.MIMETypeEnum.SPREAD_SHEET.value:
            return Storage.MIMETypeEnum.CSV.value
        else:
            raise Exception(f'Incompatible mime type: {remote_mime_type}')

    def to_local_extension(self, remote_mime_type):
        if remote_mime_type == GoogleDriveStorage.MIMETypeEnum.DOCUMENT.value:
            return 'txt'
        elif remote_mime_type == GoogleDriveStorage.MIMETypeEnum.SPREAD_SHEET.value:
            return 'csv'
        else:
            return None

    def list_directory(self, name=None, **kwargs):
        if not name:
            parent_folder_id = self.current_working_folder_id
        else:
            folder = self._get_folder_by_names(
                name, root_folder_id=self.current_working_folder_id)
            parent_folder_id = folder['id']
        items = self._list_all_items({
            'parents': parent_folder_id,
            'trashed': False,
        })
        return items

    def create_directory(self, name, **kwargs):
        parent_folder_id = self.current_working_folder_id
        folder = None
        for folder_name in name:
            folder = self._get_folder_by_name(
                folder_name, parent_folder_id=parent_folder_id, create_on_not_exist=True)
            parent_folder_id = folder['id']
        return folder

    def delete_directory(self, name, **kwargs):
        folder = self._get_folder_by_names(
            name, root_folder_id=self.current_working_folder_id)
        self._delete_item(folder['id'])

    def _upload_file(self, local_names, remote_folder_id):
        local_mime_type = self.fs.get_mime_type(local_names)
        remote_mime_type = self.to_remote_mime_type(local_mime_type)
        file_metadata = {
            'name': local_names[-1],
            'mimeType': remote_mime_type,
            'parents': [remote_folder_id],
        }
        media = MediaFileUpload(self.fs.serialize_name(local_names), mimetype=local_mime_type)
        file_fields = ','.join(GoogleDriveStorage.file_fields)
        uploaded_file = self.service.files().create(
            body=file_metadata, media_body=media, fields=file_fields).execute()
        return uploaded_file

    def upload_file(self, local_names, remote_names, **kwargs):
        folder = self._get_folder_by_names(
            remote_names,
            root_folder_id=self.current_working_folder_id,
            create_on_not_exist=True)
        self._upload_file(local_names, folder['id'])

    def upload_directory(self, local_names, remote_names, **kwargs):
        folder = self._get_folder_by_names(
            remote_names,
            root_folder_id=self.current_working_folder_id,
            create_on_not_exist=True)
        items = self.fs.list_directory(local_names)
        for item in items:
            local_items = [*local_names, item]
            if self.fs.is_file(local_items):
                self._upload_file(local_items, folder['id'])
            elif self.fs.is_directory(local_items):
                self.upload_directory(local_items, [*remote_names, item])

    def _download_file(self, remote_file, local_names):
        remote_mime_type = remote_file['mimeType']
        file_id = remote_file['id']
        if GoogleDriveStorage.is_google_doc_format(remote_mime_type):
            # Google-native documents must be exported to a local format
            local_mime_type = self.to_local_mime_type(remote_mime_type)
            local_extension = self.to_local_extension(remote_mime_type)
            if local_extension:
                local_names[-1] = f'{local_names[-1]}.{local_extension}'
            request = self.service.files().export_media(
                fileId=file_id, mimeType=local_mime_type)
        else:
            request = self.service.files().get_media(fileId=file_id)
        fh = io.FileIO(self.fs.serialize_name(local_names), mode='wb')
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            # print("Download %d%%." % int(status.progress() * 100))

    def download_file(self, remote_names, local_names, **kwargs):
        self.fs.create_directory(local_names)
        *remote_folder_names, remote_file_name = remote_names
        folder = self._get_folder_by_names(
            remote_folder_names, root_folder_id=self.current_working_folder_id)
        remote_file = self._get_file_by_name(remote_file_name, parent_folder_id=folder['id'])
        self._download_file(remote_file, [*local_names, remote_file_name])

    def download_directory(self, remote_names, local_names, **kwargs):
        self.fs.create_directory(local_names)
        folder = self._get_folder_by_names(
            remote_names, root_folder_id=self.current_working_folder_id)
        items = self.list_directory(remote_names)
        for item in items:
            local_items = [*local_names, item['name']]
            if item['mimeType'] == GoogleDriveStorage.MIMETypeEnum.FOLDER.value:
                self.download_directory([*remote_names, item['name']], local_items)
            else:
                self._download_file(item, local_items)
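# Usage sketch for GoogleDriveStorage. The credential file and folder names
# are placeholder assumptions; name arguments are lists of path components,
# while change_directory() takes a serialized name in whatever format
# RemoteStorage.deserialize_name expects.
def _example_drive_usage():
    storage = GoogleDriveStorage(credential_path='service-account.json')
    storage.change_directory('backups/2020')               # remote working folder
    storage.upload_file(['dumps', 'db.csv'], ['db'])       # -> backups/2020/db/db.csv
    storage.download_directory(['db'], ['restore', 'db'])  # mirror remote db/ locally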