def get_writer(
    engine: ClickHouseEngine = None, choice: bool = True
) -> Union[ClickHouse, List[ClickHouse]]:
    """
    Get ClickHouse writer(s) for the given table engine, building one writer
    per configured host on first use. With choice=True, return one writer at
    random; with choice=False, return the whole cached list for the engine.
    """
    writers = _writers.get(engine)
    if not choice:
        return writers
    if not writers:
        settings = Settings.get("clickhouse")
        hosts = settings.get("hosts")
        if Settings.is_cluster() and len(hosts) <= 1:
            raise ConfigurationError("more than one host is required when cluster is enabled")
        for host in hosts:
            args = [
                host,
                settings.get("user"),
                settings.get("password"),
                Settings.cluster_name(),
            ]
            # dispatch on the engine enum member (not its .value, for
            # consistency across branches); default to ReplacingMergeTree
            if engine == ClickHouseEngine.merge_tree:
                w = ClickHouseMergeTree(*args)
            elif engine == ClickHouseEngine.collapsing_merge_tree:
                w = ClickHouseCollapsingMergeTree(*args)
            elif engine == ClickHouseEngine.versioned_collapsing_merge_tree:
                w = ClickHouseVersionedCollapsingMergeTree(*args)
            elif engine == ClickHouseEngine.replacing_merge_tree or engine is None:
                w = ClickHouseReplacingMergeTree(*args)
            else:
                w = ClickHouse(*args)
            _writers.setdefault(engine, []).append(w)
    return random.choice(_writers.get(engine))  # nosec:B311
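# Usage sketch (illustrative; not part of the original module). Assumes the
# surrounding package has already loaded Settings from its config file, and
# that a "clickhouse" section with hosts/user/password is present:
#
#   writer = get_writer(ClickHouseEngine.replacing_merge_tree)
#   writer.execute("select 1")
#
#   # all writers for an engine; note this may be None until a choice=True
#   # call has populated the per-engine cache
#   writers = get_writer(ClickHouseEngine.replacing_merge_tree, choice=False)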
def etl_full(
    alias: str,
    schema: str,
    tables_pk: Dict,
    renew: bool = False,
):
    """
    Run a full ETL of one source schema into ClickHouse.
    """
    reader = get_reader(alias)
    source_db_database = Settings.get_source_db_database(alias, schema)
    schema = source_db_database.get("database")
    writer = get_writer()
    if not writer.check_database_exists(schema):
        if source_db_database.get("auto_create") is not False:
            writer.create_database(schema, Settings.cluster_name())
        else:
            logger.warning(
                f"Can't etl since no database {schema} found in ClickHouse and auto_create=false"
            )
            exit(-1)
    for table in source_db_database.get("tables"):
        if table.get("auto_full_etl") is False:
            continue
        table_name = table.get("table")
        pk = tables_pk.get(table_name)
        writer = get_writer(table.get("clickhouse_engine"))
        if not pk and not renew:
            logger.warning(f"No pk found in {schema}.{table_name}, skip")
            continue
        elif isinstance(pk, tuple):
            # a composite primary key becomes "(col1,col2)" in the create sql
            pk = f"({','.join(pk)})"
        if renew:
            drop_sql = f"drop table if exists {schema}.{table_name}"
            writer.execute(drop_sql)
            logger.info(f"drop table success: {schema}.{table_name}")
        if not writer.check_table_exists(schema, table_name):
            sign_column = table.get("sign_column")
            version_column = table.get("version_column")
            writer.execute(
                writer.get_table_create_sql(
                    reader,
                    schema,
                    table_name,
                    pk,
                    table.get("partition_by"),
                    table.get("engine_settings"),
                    sign_column=sign_column,
                    version_column=version_column,
                )
            )
            if Settings.is_cluster():
                # create the distributed table on every configured node
                for w in get_writer(choice=False):
                    w.execute(
                        w.get_distributed_table_create_sql(
                            schema, table_name, Settings.get("clickhouse.distributed_suffix")
                        )
                    )
            if reader.fix_column_type and not table.get("skip_decimal"):
                writer.fix_table_column_type(reader, schema, table_name)
            full_insert_sql = writer.get_full_insert_sql(reader, schema, table_name, sign_column)
            writer.execute(full_insert_sql)
            logger.info(f"full data etl for {schema}.{table_name} success")
        else:
            logger.debug(
                f"{schema}.{table_name} exists, skip, or use --renew to force etl by dropping the old table"
            )
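# Usage sketch (illustrative; the alias, schema, and pk values below are
# hypothetical). tables_pk maps each source table to its primary key, either
# a single column name or a tuple for a composite key:
#
#   tables_pk = {
#       "orders": "id",
#       "events": ("id", "created_at"),
#   }
#   etl_full("mysql_db", "test", tables_pk, renew=False)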