Example #1
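A test helper that flushes Redis and then forces a merge: it obtains an OPTIMIZE-scoped connection from the storage's cluster and calls run_optimize with ignore_cutoff=True, bypassing the usual cutoff check.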
    def _clear_redis_and_force_merge(self) -> None:
        redis_client.flushdb()
        cluster = self.storage.get_cluster()
        clickhouse = cluster.get_query_connection(
            ClickhouseClientSettings.OPTIMIZE)
        run_optimize(clickhouse,
                     self.storage,
                     cluster.get_database(),
                     ignore_cutoff=True)
Example #2
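A CLI command body that resolves a dataset to its local table and optimizes it on a single ClickHouse host. Helpers such as setup_logging, get_dataset, and enforce_table_writer are presumably imported at module level in the original source. Here run_optimize still takes the database and table names directly, with before=today limiting work to partitions older than the start of the current UTC day.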
def optimize(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    database: str,
    dataset_name: str,
    timeout: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime
    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0,
                                      minute=0,
                                      second=0,
                                      microsecond=0)
    clickhouse = ClickhousePool(clickhouse_host,
                                clickhouse_port,
                                send_receive_timeout=timeout)
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s" %
                (num_dropped, clickhouse_host))
Example #3
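A later, storage-based version of the same command: the storage is looked up by StorageKey, credentials come from its cluster, and a host/port pair must be supplied explicitly for multi-node clusters. The newer parallel and clickhouse_host arguments are passed through to run_optimize.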
def optimize(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    storage_name: str,
    parallel: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)
    setup_sentry()

    storage: ReadableTableStorage

    storage_key = StorageKey(storage_name)
    storage = get_storage(storage_key)

    (clickhouse_user, clickhouse_password) = storage.get_cluster().get_credentials()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    database = storage.get_cluster().get_database()

    # TODO: In distributed mode, optimize currently must be run once for each node
    # with the host and port of that node provided via the CLI. In the future,
    # passing this information won't be necessary, and running this command once
    # will ensure that optimize is performed on all of the individual nodes for
    # that cluster.
    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
            send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout,
        )
    elif not storage.get_cluster().is_single_node():
        raise click.ClickException("Provide Clickhouse host and port for optimize")
    else:
        connection = storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.OPTIMIZE
        )

    num_dropped = run_optimize(
        connection,
        storage,
        database,
        before=today,
        parallel=parallel,
        clickhouse_host=clickhouse_host,
    )
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
Example #4
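A replacer's flush_batch method: it executes count and insert queries for each replacement and, if any processor's pre_replacement call requested it, finishes by running run_optimize with a start-of-day cutoff.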
    def flush_batch(self, batch: Sequence[Replacement]) -> None:
        need_optimize = False
        for replacement in batch:
            query_args = {
                **replacement.query_args,
                "table_name": self.__replacer_processor.get_schema().get_table_name(),
            }

            if replacement.count_query_template is not None:
                count = self.clickhouse.execute_robust(
                    replacement.count_query_template % query_args)[0][0]
                if count == 0:
                    continue
            else:
                count = 0

            need_optimize = (self.__replacer_processor.pre_replacement(
                replacement, count) or need_optimize)

            if replacement.insert_query_template is not None:
                t = time.time()
                query = replacement.insert_query_template % query_args
                logger.debug("Executing replace query: %s" % query)
                self.clickhouse.execute_robust(query)
                duration = int((time.time() - t) * 1000)

                logger.info("Replacing %s rows took %sms" % (count, duration))
                self.metrics.timing("replacements.count", count)
                self.metrics.timing("replacements.duration", duration)
            else:
                count = duration = 0

            self.__replacer_processor.post_replacement(replacement, duration,
                                                       count)

        if need_optimize:
            from snuba.optimize import run_optimize

            today = datetime.utcnow().replace(hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0)
            num_dropped = run_optimize(
                self.clickhouse,
                self.__storage,
                self.__database_name,
                before=today,
            )
            logger.info("Optimized %s partitions on %s" %
                        (num_dropped, self.clickhouse.host))
Example #5
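A later revision of flush_batch: it takes a dedicated REPLACE query connection, rate-limits insert execution through __get_insert_executor, and reuses that same read connection for the final run_optimize pass.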
    def flush_batch(self, batch: Sequence[Replacement]) -> None:
        need_optimize = False
        clickhouse_read = self.__storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.REPLACE)

        for replacement in batch:

            start_time = time.time()

            table_name = self.__replacer_processor.get_schema().get_table_name()
            count_query = replacement.get_count_query(table_name)

            if count_query is not None:
                count = clickhouse_read.execute_robust(
                    count_query).results[0][0]
                if count == 0:
                    continue
            else:
                count = 0

            need_optimize = (self.__replacer_processor.pre_replacement(
                replacement, count) or need_optimize)

            query_executor = self.__get_insert_executor(replacement)
            with self.__rate_limiter as state:
                self.metrics.increment("insert_state",
                                       tags={"state": state[0].value})
                count = query_executor.execute(replacement, count)

            self.__replacer_processor.post_replacement(replacement, count)

            self._check_timing_and_write_to_redis(replacement, start_time)

        if need_optimize:
            from snuba.optimize import run_optimize

            today = datetime.utcnow().replace(hour=0,
                                              minute=0,
                                              second=0,
                                              microsecond=0)
            num_dropped = run_optimize(clickhouse_read,
                                       self.__storage,
                                       self.__database_name,
                                       before=today)
            logger.info("Optimized %s partitions on %s" %
                        (num_dropped, clickhouse_read.host))
Example #6
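The earliest CLI variant shown here: it accepts one or more host:port strings, builds a ClickhousePool for each, and runs run_optimize against every server in turn.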
def optimize(clickhouse_server, database, table, timeout, log_level):
    from datetime import datetime
    from snuba.clickhouse import ClickhousePool
    from snuba.optimize import run_optimize, logger

    logging.basicConfig(level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s')

    if not clickhouse_server:
        logger.error("Must provide at least one Clickhouse server.")
        sys.exit(1)

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    for server in clickhouse_server:
        clickhouse = ClickhousePool(
            server.split(':')[0], port=int(server.split(':')[1]), send_receive_timeout=timeout
        )
        num_dropped = run_optimize(clickhouse, database, table, before=today)
        logger.info("Optimized %s partitions on %s" % (num_dropped, server))
Example #7
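A dataset-based variant that sits between Examples #6 and #2: like #2 it resolves the table through enforce_table_writer, but it still uses plain untyped arguments and configures logging inline.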
def optimize(clickhouse_host, clickhouse_port, database, dataset, timeout,
             log_level):
    from datetime import datetime
    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    logging.basicConfig(level=getattr(logging, log_level.upper()),
                        format='%(asctime)s %(message)s')

    dataset = get_dataset(dataset)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0,
                                      minute=0,
                                      second=0,
                                      microsecond=0)
    clickhouse = ClickhousePool(clickhouse_host,
                                clickhouse_port,
                                send_receive_timeout=timeout)
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s" %
                (num_dropped, clickhouse_host))
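Read in rough chronological order (#6, #7, #2, then #3, with #1, #4 and #5 as in-process callers), these examples trace run_optimize's signature from run_optimize(clickhouse, database, table, before=...) to the storage-based run_optimize(connection, storage, database, ...). The sketch below shows a minimal direct call against a single-node cluster under the newer signature; the import paths and the "errors" storage key are assumptions that vary across Snuba versions, so treat it as illustrative rather than canonical.

# A minimal sketch, assuming the storage-based signature from Examples #3-#5.
# Module paths below are assumptions; they have moved between Snuba versions.
from datetime import datetime

from snuba.clusters.cluster import ClickhouseClientSettings
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.factory import get_storage
from snuba.optimize import run_optimize

# Hypothetical storage key; substitute whichever storage you need to optimize.
storage = get_storage(StorageKey("errors"))
cluster = storage.get_cluster()

# Single-node case: let the cluster hand us an OPTIMIZE-scoped connection,
# as in Example #1 and the else-branch of Example #3.
connection = cluster.get_query_connection(ClickhouseClientSettings.OPTIMIZE)

# Only optimize partitions from before the start of the current UTC day,
# mirroring the cutoff used by every CLI example above.
today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

num_dropped = run_optimize(connection, storage, cluster.get_database(), before=today)
print("Optimized %s partitions" % num_dropped)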