def init_logging():
    """
    Configure the "synch" base logger.

    Always attaches a stdout StreamHandler; additionally attaches a
    rate-limited SMTPHandler for ERROR records when a "mail" section is
    present in the settings. Log level is taken from Settings.debug().

    :return: None
    """
    base_logger = logging.getLogger("synch")
    if Settings.debug():
        base_logger.setLevel(logging.DEBUG)
    else:
        base_logger.setLevel(logging.INFO)
    fmt = logging.Formatter(
        fmt="%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    sh = logging.StreamHandler(sys.stdout)
    # handler stays at DEBUG; the effective level is controlled on the logger
    sh.setLevel(logging.DEBUG)
    sh.setFormatter(fmt)
    base_logger.addHandler(sh)
    mail = Settings.get("mail")
    if mail:
        # throttle error mails so a failure storm can't flood the recipients
        rate_limit = RateLimitingFilter(per=60)
        mail_handler = logging.handlers.SMTPHandler(
            mailhost=mail.get("mailhost"),
            fromaddr=mail.get("fromaddr"),
            toaddrs=mail.get("toaddrs"),
            subject=mail.get("subject"),
            credentials=(mail.get("user"), mail.get("password")),
        )
        mail_handler.setLevel(logging.ERROR)
        mail_handler.setFormatter(fmt)
        mail_handler.addFilter(rate_limit)
        base_logger.addHandler(mail_handler)
def get_writer(engine: ClickHouseEngine = None, choice=True) -> Union[ClickHouse, List[ClickHouse]]:
    """
    Get (and lazily create) the ClickHouse writer(s) for an engine type.

    Writers are created once per engine and cached in the module-level
    ``_writers`` dict, one writer per configured host.

    :param engine: target ClickHouse engine; ``None`` defaults to ReplacingMergeTree
    :param choice: when True return one randomly chosen writer, otherwise the
        whole cached list (may be ``None`` if none were created yet)
    :raises ConfigurationError: in cluster mode with fewer than two hosts
    """
    writers = _writers.get(engine)
    if not choice:
        return writers
    if not writers:
        settings = Settings.get("clickhouse")
        hosts = settings.get("hosts")
        if Settings.is_cluster() and len(hosts) <= 1:
            raise ConfigurationError("hosts must more than one when cluster")
        for host in hosts:
            args = [host, settings.get("user"), settings.get("password"), Settings.cluster_name()]
            # compare against enum members consistently in every branch
            # (the merge_tree branch previously compared against ``.value``,
            # unlike its siblings; ClickHouseEngine is a str-Enum, so member
            # comparison matches both enum and raw-string inputs)
            if engine == ClickHouseEngine.merge_tree:
                w = ClickHouseMergeTree(*args)
            elif engine == ClickHouseEngine.collapsing_merge_tree:
                w = ClickHouseCollapsingMergeTree(*args)
            elif engine == ClickHouseEngine.versioned_collapsing_merge_tree:
                w = ClickHouseVersionedCollapsingMergeTree(*args)
            elif engine == ClickHouseEngine.replacing_merge_tree or engine is None:
                w = ClickHouseReplacingMergeTree(*args)
            else:
                w = ClickHouse(*args)
            _writers.setdefault(engine, []).append(w)
    return random.choice(_writers.get(engine))  # nosec:B311
def __init__(self):
    """
    Read the "redis" settings section and build master/slave connections,
    either through Sentinel or against a single Redis node.
    """
    conf = Settings.get("redis")
    self.prefix = conf.get("prefix")
    self.queue_max_len = conf.get("queue_max_len")
    self.sentinel = conf.get("sentinel")
    if not self.sentinel:
        # single-node mode: master and slave are the same connection
        pool = redis.ConnectionPool(
            host=conf.get("host"),
            port=conf.get("port"),
            db=conf.get("db"),
            password=conf.get("password"),
            decode_responses=True,
        )
        self.master = self.slave = redis.StrictRedis(connection_pool=pool)
    else:
        # sentinel_hosts entries are "host:port" strings
        nodes = [item.split(":") for item in conf.get("sentinel_hosts")]
        sentinel = Sentinel(sentinels=nodes)
        common_kwargs = dict(
            service_name=conf.get("sentinel_master"),
            password=conf.get("password"),
            decode_responses=True,
        )
        self.master = sentinel.master_for(**common_kwargs)
        self.slave = sentinel.slave_for(**common_kwargs)
def init(config_file):
    """
    Bootstrap the application from a config file: load settings, set up
    logging, optionally enable Sentry reporting and the monitoring DB.
    """
    Settings.init(config_file)
    init_logging()
    sentry_dsn = Settings.get("sentry", "dsn")
    if sentry_dsn:
        # import lazily so sentry-sdk is only required when a DSN is configured
        import sentry_sdk
        from sentry_sdk.integrations.redis import RedisIntegration

        sentry_sdk.init(
            sentry_dsn,
            environment=Settings.get("sentry", "environment"),
            integrations=[RedisIntegration()],
        )
    if Settings.monitoring():
        init_monitor_db()
def __init__(self, alias):
    """
    Build the Kafka producer for one source-database *alias*.

    The topic name is "<topic_prefix>.<alias>"; values are JSON-encoded via
    JsonEncoder and keys are plain UTF-8 strings.
    """
    super().__init__(alias)
    self.servers = Settings.get("kafka").get("servers")
    self.topic = f'{Settings.get("kafka").get("topic_prefix")}.{alias}'
    self.databases = Settings.get_source_db(alias).get("databases")
    self.producer = KafkaProducer(
        bootstrap_servers=self.servers,
        value_serializer=lambda value: json.dumps(value, cls=JsonEncoder).encode(),
        key_serializer=lambda key: key.encode(),
    )
    self._init_topic()
def get_writer(engine: ClickHouseEngine = None) -> ClickHouse:
    """
    Get (and lazily create) the single cached ClickHouse writer for an engine.

    :param engine: target ClickHouse engine; ``None`` defaults to ReplacingMergeTree
    :return: the cached writer instance; note an unrecognized engine value is
        cached and returned as ``None`` (unchanged behavior — callers rely on
        the known engine set)
    """
    w = _writers.get(engine)
    if not w:
        settings = Settings.get("clickhouse")
        # compare against enum members consistently in every branch
        # (the merge_tree branch previously compared against ``.value``,
        # unlike its siblings; ClickHouseEngine is a str-Enum, so member
        # comparison matches both enum and raw-string inputs)
        if engine == ClickHouseEngine.merge_tree:
            w = ClickHouseMergeTree(settings)
        elif engine == ClickHouseEngine.collapsing_merge_tree:
            w = ClickHouseCollapsingMergeTree(settings)
        elif engine == ClickHouseEngine.versioned_collapsing_merge_tree:
            w = ClickHouseVersionedCollapsingMergeTree(settings)
        elif engine == ClickHouseEngine.replacing_merge_tree or engine is None:
            w = ClickHouseReplacingMergeTree(settings)
        _writers[engine] = w
    return w
def etl_full(
    alias: str,
    schema: str,
    tables_pk: Dict,
    renew=False,
):
    """
    Full (initial) ETL: copy every configured table of one source database
    into ClickHouse.

    :param alias: source database alias from settings
    :param schema: source database name (re-resolved from settings below)
    :param tables_pk: mapping of table name -> primary key (str or tuple of columns)
    :param renew: when True, drop and recreate target tables before loading
    """
    reader = get_reader(alias)
    source_db_database = Settings.get_source_db_database(alias, schema)
    # the database name from settings takes precedence over the argument
    schema = source_db_database.get("database")
    writer = get_writer()
    if not writer.check_database_exists(schema):
        if source_db_database.get("auto_create") is not False:
            writer.create_database(schema, Settings.cluster_name())
        else:
            logger.warning(
                f"Can't etl since no database {schema} found in ClickHouse and auto_create=false"
            )
            exit(-1)
    for table in source_db_database.get("tables"):
        if table.get("auto_full_etl") is False:
            continue
        table_name = table.get("table")
        pk = tables_pk.get(table_name)
        # each table may use a different ClickHouse engine, so re-pick the writer
        writer = get_writer(table.get("clickhouse_engine"))
        if not pk and not renew:
            logger.warning(f"No pk found in {schema}.{table_name}, skip")
            continue
        elif isinstance(pk, tuple):
            # composite primary key -> "(col1,col2)" form for the DDL
            pk = f"({','.join(pk)})"
        if renew:
            drop_sql = f"drop table if exists {schema}.{table_name}"
            writer.execute(drop_sql)
            logger.info(f"drop table success:{schema}.{table_name}")
        if not writer.check_table_exists(schema, table_name):
            sign_column = table.get("sign_column")
            version_column = table.get("version_column")
            writer.execute(
                writer.get_table_create_sql(
                    reader,
                    schema,
                    table_name,
                    pk,
                    table.get("partition_by"),
                    table.get("engine_settings"),
                    sign_column=sign_column,
                    version_column=version_column,
                ))
            if Settings.is_cluster():
                # create the distributed proxy table on every node of the cluster
                for w in get_writer(choice=False):
                    w.execute(
                        w.get_distributed_table_create_sql(
                            schema, table_name,
                            Settings.get("clickhouse.distributed_suffix")))
            # presumably normalizes decimal column types from the source reader
            # unless skip_decimal is configured — verify against reader impl
            if reader.fix_column_type and not table.get("skip_decimal"):
                writer.fix_table_column_type(reader, schema, table_name)
            full_insert_sql = writer.get_full_insert_sql(
                reader, schema, table_name, sign_column)
            writer.execute(full_insert_sql)
            logger.info(f"full data etl for {schema}.{table_name} success")
        else:
            logger.debug(
                f"{schema}.{table_name} exists, skip, or use --renew force etl with drop old tables"
            )