def __init__(self, credential_path=None, **kwargs):
    if not credential_path:
        raise Exception('Credential path is required')
    super().__init__(**kwargs)
    credentials = Credentials.from_service_account_file(
        credential_path, scopes=GoogleDriveStorage.scopes)
    self.service = build('drive',
                         'v3',
                         credentials=credentials,
                         cache_discovery=False)
    self.current_working_folder_id = None
    self.fs = FileSystem()
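
A minimal instantiation sketch for the constructor above, assuming the imports shown with the full class in Example #6; the key-file path is a hypothetical placeholder.

# hypothetical service-account key exported from the Google Cloud console
storage = GoogleDriveStorage(credential_path='service-account.json')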
Example #2
def create_logger():
    # configure the root logger; note that repeated calls will stack duplicate handlers
    lation_logger = logging.getLogger()
    lation_logger.setLevel(logging.INFO)
    fs = FileSystem()
    dir_path = fs.create_directory(['logs'])

    # file handler
    filename = '{:%Y-%m-%d %H-%M-%S.%f}.log'.format(datetime.now())
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler = logging.FileHandler(os.path.join(dir_path, filename), 'a',
                                       'utf-8')
    file_handler.setFormatter(formatter)
    lation_logger.addHandler(file_handler)

    # console handler
    console_handler = logging.StreamHandler()
    # NOTE: DEBUG here is ineffective; records below the logger's INFO level
    # are filtered out before they ever reach a handler
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(formatter)
    lation_logger.addHandler(console_handler)

    return lation_logger
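
A short usage sketch for the logger above; DEBUG records never reach either handler because the logger itself is set to INFO.

logger = create_logger()
logger.info('written to the console and to logs/<timestamp>.log')
logger.debug('dropped: filtered out by the INFO logger level')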
Example #3
    def __init__(self, url=None,
                 dialect=None, driver=None, username=None, password=None,
                 host=None, port=None, database=None,
                 model_agnostic=False):
        from lation.core.orm import Base
        from lation.modules.base.models.lation_data import LationData

        if not url:
            url = f'{dialect}+{driver}://{username}:{password}@{host}:{port}/{database}'
        echo = bool(DEBUG_SQL)
        self.engine = create_engine(url, pool_size=1, echo=echo, logging_name='lation.engine')
        SessionFactory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False)
        self.Session = scoped_session(SessionFactory)
        self.model_agnostic = model_agnostic
        existing_metadata = Database.get_metadata()
        existing_metadata.reflect(bind=self.engine, schema=APP)
        self.existing_metadata = existing_metadata
        self.metadata = Base.metadata
        self.fs = FileSystem()
        # create the logger unconditionally: `self.logger` is referenced
        # elsewhere in the class whether or not SQL echoing is enabled
        self.logger = create_logger()

        self.Base = Base
        self.LationData = LationData
        self.lation_id_map = {}
Example #4
class Vault():
    @staticmethod
    def to_encrypted_name(name):
        return f'{name}.encrypted'

    @staticmethod
    def to_decrypted_name(name):
        return '.'.join(name.split('.')[:-1])

    def __init__(self, password):
        self.ansible_vault = AnsibleVault(password)
        self.fs = FileSystem()

    def encrypt(self, src, dest=None):
        srcs = self.fs.deserialize_name(src)
        assert self.fs.is_file(srcs), 'src should be a file'
        if not dest:
            dests = srcs.copy()
            dests[-1] = Vault.to_encrypted_name(srcs[-1])
            dest = self.fs.serialize_name(dests)
        with open(src, 'r') as input_file:
            raw_data = input_file.read()
        with open(dest, 'wb') as output_file:
            self.ansible_vault.dump(raw_data, output_file)

    def decrypt(self, src, dest=None):
        srcs = self.fs.deserialize_name(src)
        assert self.fs.is_file(srcs), 'src should be a file'
        if not dest:
            dests = srcs.copy()
            dests[-1] = Vault.to_decrypted_name(srcs[-1])
            dest = self.fs.serialize_name(dests)
        with open(src, 'r') as input_file:
            encrypted_data = input_file.read()
            try:
                decrypted_data = self.ansible_vault.load(encrypted_data)
            except AnsibleVaultError as e:
                # chain the original error instead of silently discarding it
                raise Exception('Decrypt failed') from e
        with open(dest, 'w') as output_file:
            output_file.write(decrypted_data)
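
A round-trip sketch for the class above; the password and file name are placeholders, and `FileSystem.deserialize_name`/`serialize_name` are assumed to split and join path components as the methods above imply.

vault = Vault(password='hypothetical-password')
vault.encrypt('secrets.yml')             # writes secrets.yml.encrypted
vault.decrypt('secrets.yml.encrypted')   # writes secrets.yml back out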
Example #5
class Database():

    @staticmethod
    def get_metadata():
        return MetaData(schema=APP)

    def __init__(self, url=None,
                 dialect=None, driver=None, username=None, password=None,
                 host=None, port=None, database=None,
                 model_agnostic=False):
        from lation.core.orm import Base
        from lation.modules.base.models.lation_data import LationData

        if not url:
            url = f'{dialect}+{driver}://{username}:{password}@{host}:{port}/{database}'
        echo = bool(DEBUG_SQL)
        self.engine = create_engine(url, pool_size=1, echo=echo, logging_name='lation.engine')
        SessionFactory = sessionmaker(bind=self.engine, autoflush=False, autocommit=False)
        self.Session = scoped_session(SessionFactory)
        self.model_agnostic = model_agnostic
        existing_metadata = Database.get_metadata()
        existing_metadata.reflect(bind=self.engine, schema=APP)
        self.existing_metadata = existing_metadata
        self.metadata = Base.metadata
        self.fs = FileSystem()
        # create the logger unconditionally: `self.logger` is referenced
        # elsewhere in the class whether or not SQL echoing is enabled
        self.logger = create_logger()

        self.Base = Base
        self.LationData = LationData
        self.lation_id_map = {}

    def get_session(self):
        # callers are responsible for closing or removing the scoped session;
        # closing it here would hand back an already-closed session
        return self.Session()

    @functools.lru_cache()
    def find_tablename_by_file_path(self, file_path):
        filename = os.path.basename(file_path)
        table_name = os.path.splitext(filename)[0]
        return table_name

    # def find_table_by_file_path(self, file_path):
    #     table_name = self.find_tablename_by_file_path(file_path)
    #     return self.metadata.tables[f'{APP}.{table_name}']

    @functools.lru_cache()
    def find_model_class_by_tablename(self, tablename):
        # `_decl_class_registry` is the SQLAlchemy <1.4 declarative registry API
        for model_class in self.Base._decl_class_registry.values():
            if hasattr(model_class, '__tablename__') and model_class.__tablename__ == tablename:
                return model_class
        return None

    def is_json_column(self, column):
        return isinstance(column.type, JSON)

    def is_json_attribute(self, attribute):
        return isinstance(attribute.columns[0].type, JSON)

    def is_boolean_attribute(self, attribute):
        return isinstance(attribute.columns[0].type, Boolean)

    def export(self, dir_path):
        self.fs.create_directory(self.fs.deserialize_name(dir_path))
        for table in self.metadata.sorted_tables:
            self.logger.info(f'EXPORT TABLE `{table.name}`...')
            file_path = os.path.join(dir_path, f'{table.name}.csv')
            with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
                column_names = [column.name for column in table.columns]
                writer = csv.DictWriter(csv_file, fieldnames=column_names)
                writer.writeheader()
                rows = self.engine.execute(table.select())
                for row in rows:
                    row_dict = dict(row)
                    writer.writerow(row_dict)
            self.logger.info(f'EXPORT TABLE `{table.name}` DONE')

    def drop_schema(self, schema_name):
        self.engine.execute(DropSchema(schema_name))
        self.logger.info(f'DELETE SCHEMA `{schema_name}`')

    def create_schema(self, schema_name):
        self.engine.execute(CreateSchema(schema_name))
        self.logger.info(f'CREATE SCHEMA `{schema_name}`')

    def drop_tables(self):
        self.existing_metadata.drop_all(self.engine)
        self.logger.info('DELETE ALL TABLES')

    # delete all rows, table by table, in reverse dependency order
    def destruct_tables(self):
        session = self.get_session()
        for table in reversed(self.metadata.sorted_tables):
            session.execute(table.delete())

    def create_tables(self):
        self.metadata.create_all(self.engine)
        self.logger.info('ALL TABLES CREATED')

    def dispose(self):
        self.engine.dispose()

    def install_data(self, module_name):
        from lation.core.orm import SingleTableInheritanceMixin

        for parent_module in modules[module_name].parent_modules:
            self.install_data(parent_module.name)
        start_time = time.time()
        partial_csv_file_paths = modules[module_name].config.data
        if len(partial_csv_file_paths) == 0:
            self.logger.info(f'[{module_name}] NO DATA, SKIPPED')
            return
        self.logger.info(f'[{module_name}] INSTALL DATA...')
        session = self.get_session()

        for partial_csv_file_path in partial_csv_file_paths:
            current_table_lation_id_map = {}
            current_table_lation_id_unflushed_instance_map = {}

            csv_file_path = os.path.join('lation', 'modules', module_name, partial_csv_file_path)
            tablename = self.find_tablename_by_file_path(csv_file_path)
            model_class = self.find_model_class_by_tablename(tablename)
            if not model_class:
                raise Exception(f'Table `{tablename}` does not exist')
            is_single_table_inherited = SingleTableInheritanceMixin in model_class.mro()
            inspector = inspect(model_class)
            json_type_attribute_names = [attr.key for attr in inspector.mapper.column_attrs if self.is_json_attribute(attr)]
            boolean_type_attribute_names = [attr.key for attr in inspector.mapper.column_attrs if self.is_boolean_attribute(attr)]
            self.logger.info(f'[{module_name}] INTO TABLE `{tablename}` FROM PATH `{csv_file_path}`')

            with open(csv_file_path, newline='', encoding='utf-8') as csv_file:
                reader = csv.DictReader(csv_file)
                for csv_data in reader:
                    attribute_data = {}

                    # resolve lation_id, qualifying it with the module name when needed
                    lation_id = csv_data.get('lation_id')
                    if not lation_id:
                        raise Exception(f'Attribute `lation_id` is required for csv file `{csv_file_path}`')
                    lation_id_parts = lation_id.split('.')
                    if len(lation_id_parts) < 2 or lation_id_parts[0] != module_name:
                        lation_id = f'{module_name}.{lation_id}'
                    attribute_data['lation_id'] = lation_id
                    del csv_data['lation_id']

                    for attribute_name in csv_data.keys():
                        csv_value = csv_data[attribute_name]
                        attribute_name_parts = attribute_name.split('/')
                        if csv_value == '':
                            continue

                        # resolve primitive data type
                        if len(attribute_name_parts) == 1:
                            if attribute_name in json_type_attribute_names:
                                attribute_data[attribute_name] = ast.literal_eval(csv_value)
                            elif attribute_name in boolean_type_attribute_names:
                                attribute_data[attribute_name] = (csv_value.lower() == 'true')
                            else:
                                attribute_data[attribute_name] = csv_value

                        # resolve foreign key
                        elif len(attribute_name_parts) == 2 and attribute_name_parts[1] == 'fk':
                            foreign_lation_id_parts = csv_value.split('.')
                            if len(foreign_lation_id_parts) < 2:
                                foreign_lation_id = f'{module_name}.{csv_value}'
                            elif len(foreign_lation_id_parts) == 2:
                                foreign_lation_id = csv_value
                            else:
                                raise NotImplementedError
                            # foreign_instance = session.query(model_class).filter(model_class.lation_id == foreign_lation_id).one_or_none()
                            # use .get() so a missing mapping raises the intended error below
                            # instead of an opaque KeyError
                            foreign_instance_id = self.lation_id_map.get(foreign_lation_id)
                            if not foreign_instance_id:
                                raise Exception(f'Foreign lation_id `{foreign_lation_id}` not found')
                            attribute_data[attribute_name_parts[0]] = foreign_instance_id

                        else:
                            raise NotImplementedError

                    # correct the class used for this row under single table inheritance
                    row_model_class = model_class
                    if is_single_table_inherited:
                        polymorphic_identity = attribute_data.get(inspector.polymorphic_on.key)
                        # look up the subclass without clobbering `model_class`,
                        # which is shared by every row of this csv file
                        row_model_class = inspector.polymorphic_map[polymorphic_identity].class_manager.class_

                    # upsert instance
                    instance = session.query(row_model_class).filter(row_model_class.lation_id == lation_id).one_or_none()
                    if instance:
                        for attribute_name in attribute_data:
                            setattr(instance, attribute_name, attribute_data[attribute_name])
                        current_table_lation_id_map[lation_id] = instance.id
                        # self.lation_id_map[lation_id] = instance.id
                    else:
                        instance = row_model_class(**attribute_data)
                        session.add(instance)
                        current_table_lation_id_unflushed_instance_map[lation_id] = instance

            # flush to get instance ids
            session.flush()

            # refresh id
            for lation_id in current_table_lation_id_unflushed_instance_map:
                instance = current_table_lation_id_unflushed_instance_map[lation_id]
                # self.lation_id_map[lation_id] = instance.id
                current_table_lation_id_map[lation_id] = instance.id

            self.lation_id_map.update(current_table_lation_id_map)

            # rebalance lation data
            LationData = self.LationData
            lation_data = session.query(LationData).filter(LationData.model == tablename).all()
            deleted_ids = []
            for ld in lation_data:
                if not self.lation_id_map.get(ld.model_lation_id):
                    deleted_ids.append(ld.model_id)
            if len(deleted_ids) > 0:
                self.logger.info(f'[{module_name}] DELETE IDs {deleted_ids} FROM TABLE `{tablename}`...')
                instances = session.query(model_class).filter(model_class.id.in_(deleted_ids)).all()
                for instance in instances:
                    session.delete(instance)

            session.query(LationData).filter(LationData.model == tablename).delete()
            for current_table_lation_id in current_table_lation_id_map:
                lation_data = LationData(model=tablename,
                                         model_lation_id=current_table_lation_id,
                                         model_id=current_table_lation_id_map[current_table_lation_id])
                session.add(lation_data)

            # fix postgres sequence, see <https://stackoverflow.com/a/37972960/2443984>
            if session.bind.dialect.name == 'postgresql':
                for table in inspector.tables:
                    session.execute(f'SELECT setval(pg_get_serial_sequence(\'{table.fullname}\', \'id\'), coalesce(max(id)+1, 1), false) FROM {table.fullname};')

            session.flush()
            self.logger.info(f'[{module_name}] FLUSHED')

        self.logger.info(f'[{module_name}] COMMIT...')
        session.commit()
        self.logger.info(f'[{module_name}] COMMITTED')
        self.logger.info(f'[{module_name}] INSTALL DATA DONE IN {time.time() - start_time:.2f}s')

    def reset(self):
        if self.engine.dialect.has_schema(self.engine, schema=APP):
            self.drop_tables()
            self.drop_schema(APP)
        self.create_schema(APP)
        self.create_tables()
        self.install_data(APP)
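
A construction sketch for the class above; the connection parameters are placeholders, and `APP` and `DEBUG_SQL` are assumed to be module-level settings, as the code implies.

db = Database(dialect='postgresql', driver='psycopg2',
              username='lation', password='hypothetical',
              host='localhost', port=5432, database='lation')
db.reset()    # drop and recreate the schema, then reinstall module data
db.dispose()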
Example #6
class GoogleDriveStorage(RemoteStorage):
    # https://developers.google.com/drive/api/v3/mime-types
    class MIMETypeEnum(enum.Enum):
        DOCUMENT = 'application/vnd.google-apps.document'  # Google Docs
        DRAWING = 'application/vnd.google-apps.drawing'  # Google Drawing
        FILE = 'application/vnd.google-apps.file'  # Google Drive file
        FOLDER = 'application/vnd.google-apps.folder'  # Google Drive folder
        PRESENTATION = 'application/vnd.google-apps.presentation'  # Google Slides
        SPREAD_SHEET = 'application/vnd.google-apps.spreadsheet'  # Google Sheets
        UNKNOWN = 'application/vnd.google-apps.unknown'

    # https://developers.google.com/drive/api/v3/about-auth
    scopes = [
        'https://www.googleapis.com/auth/drive',
    ]

    file_fields = ['id', 'name', 'mimeType', 'parents']

    @staticmethod
    def is_google_doc_format(mime_type):
        return mime_type in [
            GoogleDriveStorage.MIMETypeEnum.DOCUMENT.value,
            GoogleDriveStorage.MIMETypeEnum.SPREAD_SHEET.value,
            GoogleDriveStorage.MIMETypeEnum.DRAWING.value,
            GoogleDriveStorage.MIMETypeEnum.PRESENTATION.value
        ]

    def __init__(self, credential_path=None, **kwargs):
        if not credential_path:
            raise Exception('Credential path is required')
        super().__init__(**kwargs)
        credentials = Credentials.from_service_account_file(
            credential_path, scopes=GoogleDriveStorage.scopes)
        self.service = build('drive',
                             'v3',
                             credentials=credentials,
                             cache_discovery=False)
        self.current_working_folder_id = None
        self.fs = FileSystem()

    def _list_all_items(self, query=None):
        page_token = None
        accumulated_files = []
        if query is None:
            query = {}
        query_str = GoogleDriveUtility.get_query_str(**query)
        file_fields = ','.join(GoogleDriveStorage.file_fields)
        while True:
            response = self.service.files().list(
                q=query_str,
                fields=f'nextPageToken, files({file_fields})',
                pageToken=page_token).execute()
            files = response.get('files', [])
            page_token = response.get('nextPageToken', None)
            accumulated_files.extend(files)
            if page_token is None:
                return accumulated_files

    def _create_folder(self, folder_name, parent_folder_id=None):
        if not isinstance(parent_folder_id, list):
            parent_folder_id = [parent_folder_id]
        file_metadata = {
            'name': folder_name,
            'mimeType': GoogleDriveStorage.MIMETypeEnum.FOLDER.value,
            'parents': parent_folder_id,
        }
        file_fields = ','.join(GoogleDriveStorage.file_fields)
        folder = self.service.files().create(body=file_metadata,
                                             fields=file_fields).execute()
        return folder

    def _get_file_by_name(self, file_name, parent_folder_id=None):
        files = self._list_all_items({
            'name': file_name,
            'parents': parent_folder_id,
            'not_mime_type': GoogleDriveStorage.MIMETypeEnum.FOLDER.value,
            'trashed': False,
        })
        if len(files) == 0:
            raise Exception(f'File `{file_name}` does not exist')
        elif len(files) == 1:
            return files[0]
        else:
            raise Exception(f'Detected duplicate files named `{file_name}`')

    def _get_folder_by_name(self,
                            folder_name,
                            parent_folder_id=None,
                            create_on_not_exist=False):
        folders = self._list_all_items({
            'name': folder_name,
            'parents': parent_folder_id,
            'mime_type': GoogleDriveStorage.MIMETypeEnum.FOLDER.value,
            'trashed': False,
        })
        if len(folders) == 0:
            if create_on_not_exist:
                folder = self._create_folder(folder_name, parent_folder_id)
                return folder
            else:
                raise Exception(f'Folder `{folder_name}` does not exist')
        elif len(folders) == 1:
            return folders[0]
        else:
            raise Exception(f'Detected duplicate folders named `{folder_name}`')

    def _get_folder_by_names(self,
                             folder_names,
                             root_folder_id=None,
                             create_on_not_exist=False):
        parent_folder_id = root_folder_id
        folder = None  # stays None when `folder_names` is empty
        for folder_name in folder_names:
            folder = self._get_folder_by_name(
                folder_name,
                parent_folder_id=parent_folder_id,
                create_on_not_exist=create_on_not_exist)
            parent_folder_id = folder['id']
        return folder

    def _delete_item(self, item_id):
        self.service.files().delete(fileId=item_id).execute()

    def change_directory(self, serialized_name):
        self.current_working_folder_id = None
        if serialized_name:
            cwd = self.deserialize_name(serialized_name)
            current_working_folder = self._get_folder_by_names(cwd)
            if not current_working_folder:
                raise Exception('Base directory does not exist')
            self.current_working_folder_id = current_working_folder['id']

    def to_remote_mime_type(self, local_mime_type):
        return local_mime_type

    # https://developers.google.com/drive/api/v3/ref-export-formats
    def to_local_mime_type(self, remote_mime_type):
        if remote_mime_type == GoogleDriveStorage.MIMETypeEnum.DOCUMENT.value:
            return Storage.MIMETypeEnum.TEXT.value
        elif remote_mime_type == GoogleDriveStorage.MIMETypeEnum.SPREAD_SHEET.value:
            return Storage.MIMETypeEnum.CSV.value
        else:
            raise Exception(f'Incompatible mime type: {remote_mime_type}')

    def to_local_extension(self, remote_mime_type):
        if remote_mime_type == GoogleDriveStorage.MIMETypeEnum.DOCUMENT.value:
            return 'txt'
        elif remote_mime_type == GoogleDriveStorage.MIMETypeEnum.SPREAD_SHEET.value:
            return 'csv'
        else:
            return None

    def list_directory(self, name=None, **kwargs):
        if not name:
            parent_folder_id = self.current_working_folder_id
        else:
            folder = self._get_folder_by_names(
                name, root_folder_id=self.current_working_folder_id)
            parent_folder_id = folder['id']
        items = self._list_all_items({
            'parents': parent_folder_id,
            'trashed': False,
        })
        return items

    def create_directory(self, name, **kwargs):
        parent_folder_id = self.current_working_folder_id
        for folder_name in name:
            folder = self._get_folder_by_name(
                folder_name,
                parent_folder_id=parent_folder_id,
                create_on_not_exist=True)
            parent_folder_id = folder['id']
        return folder

    def delete_directory(self, name, **kwargs):
        folder = self._get_folder_by_names(
            name, root_folder_id=self.current_working_folder_id)
        self._delete_item(folder['id'])

    def _upload_file(self, local_names, remote_folder_id):
        local_mime_type = self.fs.get_mime_type(local_names)
        remote_mime_type = self.to_remote_mime_type(local_mime_type)
        file_metadata = {
            'name': local_names[-1],
            'mimeType': remote_mime_type,
            'parents': [remote_folder_id],
        }
        media = MediaFileUpload(self.fs.serialize_name(local_names),
                                mimetype=local_mime_type)
        file_fields = ','.join(GoogleDriveStorage.file_fields)
        uploaded_file = self.service.files().create(
            body=file_metadata, media_body=media,
            fields=file_fields).execute()
        return uploaded_file

    def upload_file(self, local_names, remote_names, **kwargs):
        folder = self._get_folder_by_names(
            remote_names,
            root_folder_id=self.current_working_folder_id,
            create_on_not_exist=True)
        self._upload_file(local_names, folder['id'])

    def upload_directory(self, local_names, remote_names, **kwargs):
        folder = self._get_folder_by_names(
            remote_names,
            root_folder_id=self.current_working_folder_id,
            create_on_not_exist=True)
        local_dir = self.fs.serialize_name(local_names)
        items = self.fs.list_directory(local_names)
        for item in items:
            local_items = [*local_names, item]
            if self.fs.is_file(local_items):
                self._upload_file(local_items, folder['id'])
            elif self.fs.is_directory(local_items):
                self.upload_directory(local_items, [*remote_names, item])

    def _download_file(self, remote_file, local_names):
        remote_mime_type = remote_file['mimeType']
        file_id = remote_file['id']
        if GoogleDriveStorage.is_google_doc_format(remote_mime_type):
            local_mime_type = self.to_local_mime_type(remote_mime_type)
            local_extension = self.to_local_extension(remote_mime_type)
            if local_extension:
                local_names[-1] = f'{local_names[-1]}.{local_extension}'
            request = self.service.files().export_media(
                fileId=file_id, mimeType=local_mime_type)
        else:
            request = self.service.files().get_media(fileId=file_id)

        fh = io.FileIO(self.fs.serialize_name(local_names), mode='wb')
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
            # print("Download %d%%." % int(status.progress() * 100))

    def download_file(self, remote_names, local_names, **kwargs):
        self.fs.create_directory(local_names)
        *remote_folder_names, remote_file_name = remote_names
        folder = self._get_folder_by_names(
            remote_folder_names, root_folder_id=self.current_working_folder_id)
        remote_file = self._get_file_by_name(remote_file_name,
                                             parent_folder_id=folder['id'])
        self._download_file(remote_file, [*local_names, remote_file_name])

    def download_directory(self, remote_names, local_names, **kwargs):
        self.fs.create_directory(local_names)
        folder = self._get_folder_by_names(
            remote_names, root_folder_id=self.current_working_folder_id)
        items = self.list_directory(remote_names)
        for item in items:
            local_items = [*local_names, item['name']]
            if item['mimeType'] == GoogleDriveStorage.MIMETypeEnum.FOLDER.value:
                self.download_directory([*remote_names, item['name']],
                                        local_items)
            else:
                self._download_file(item, local_items)
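
An end-to-end sketch for the storage class above, assuming a service-account key that can access the target Drive; every name below is a placeholder, and paths are passed as lists of components, as the methods above expect.

storage = GoogleDriveStorage(credential_path='service-account.json')
storage.change_directory('backups')                      # remote base folder
storage.upload_file(['data', 'report.csv'], ['2024'])    # local components, remote folder
storage.download_directory(['2024'], ['restored'])       # pull the folder back down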