Esempio n. 1
0
def test_auto_sample():
    """Check the batch size picked by ``auto_sampler`` for the criteo fixture.

    The encoding is detected first with ``investigate_encoding_and_dialect``
    and then handed to ``auto_sampler``; the expected result is 14980.
    """
    fixture = 'tests/fixtures/criteo_top30_1m.csv.gz'
    with UI(None, logging.DEBUG, stdout=False) as ui:
        detected_encoding = investigate_encoding_and_dialect(fixture, None, ui)
        batch_rows = auto_sampler(fixture, detected_encoding, ui)
        assert batch_rows == 14980
        ui.close()
Esempio n. 2
0
def test_auto_small_dataset():
    """Check the batch size picked by ``auto_sampler`` for a small fixture.

    For the ``regression_jp`` fixture the sampler is expected to return 500.
    """
    fixture = 'tests/fixtures/regression_jp.csv.gz'
    with UI(None, logging.DEBUG, stdout=False) as ui:
        detected_encoding = investigate_encoding_and_dialect(fixture, None, ui)
        batch_rows = auto_sampler(fixture, detected_encoding, ui)
        assert batch_rows == 500
Esempio n. 3
0
def run_batch_predictions(base_url, base_headers, user, pwd,
                          api_token, create_api_token,
                          pid, lid, n_retry, concurrent,
                          resume, n_samples,
                          out_file, keep_cols, delimiter,
                          dataset, pred_name,
                          timeout, ui, fast_mode, auto_sample,
                          dry_run, encoding, skip_dialect,
                          skip_row_id=False,
                          output_delimiter=None,
                          max_batch_size=None):
    """Run a batch-scoring job for ``dataset`` and report the outcome via ``ui``.

    The function performs these steps, in order:

    1. Adds a ``User-Agent`` and a ``content-type`` entry to ``base_headers``
       (the dict passed in is mutated).
    2. If ``api_token`` is falsy, obtains one with ``acquire_api_token``,
       prompting through ``ui.getpass()`` when ``pwd`` is falsy as well.
    3. Detects the dataset encoding with ``investigate_encoding_and_dialect``
       and, when ``auto_sample`` is truthy, replaces ``n_samples`` with the
       value returned by ``auto_sampler``.
    4. Peeks the first row and, unless ``dry_run`` is truthy, calls
       ``authorize`` with it so that authentication problems surface early.
    5. Feeds batches through ``Shovel`` / ``WorkUnitGenerator`` and either
       just drains the generator (``dry_run``) or sends the work units with
       ``Network.perform_requests``.
    6. After a real run, compares the number of newly checkpointed batches
       with the number of consumed batches, then logs the collected warnings,
       errors and done/lost totals.

    Parameters that are only forwarded to helpers (``n_retry``, ``resume``,
    ``out_file``, ``keep_cols``, ``pred_name``, ``timeout``, ``skip_row_id``,
    ``output_delimiter``, ...) are passed through unchanged; their meaning is
    defined by those helpers.

    :param base_url: prefix to which ``"<pid>/<lid>/predict"`` is appended.
    :param concurrent: passed to ``Network``; the queues are sized at twice
        this value.
    :param max_batch_size: falls back to ``MAX_BATCH_SIZE`` when ``None``.
    :returns: ``None``; results are reported through ``ui``.
    """
    if max_batch_size is None:
        max_batch_size = MAX_BATCH_SIZE

    multiprocessing.freeze_support()
    t1 = time()
    # The queues hold up to twice as many items as the ``concurrent`` value.
    queue_size = concurrent * 2
    #  provide version info and system info in user-agent
    base_headers['User-Agent'] = 'datarobot_batch_scoring/{}|' \
                                 'Python/{}|{}|system/{}|concurrency/{}' \
                                 ''.format(__version__,
                                           sys.version.split(' ')[0],
                                           requests.utils.default_user_agent(),
                                           platform.system(),
                                           concurrent)

    with ExitStack() as stack:
        # Compare by value: ``is`` against a string literal only works by
        # accident of interning and raises SyntaxWarning on Python 3.8+.
        if os.name == 'nt':
            #  Windows requires an additional manager process. The locks
            #  and queues it creates are proxies for objects that exist within
            #  the manager itself. It does not perform as well so we only
            #  use it when necessary.
            conc_manager = stack.enter_context(multiprocessing.Manager())
        else:
            #  You're on a nix of some sort and don't need a manager process.
            conc_manager = multiprocessing
        queue = conc_manager.Queue(queue_size)
        deque = conc_manager.Queue(queue_size)
        lock = conc_manager.Lock()
        rlock = conc_manager.RLock()
        if not api_token:
            if not pwd:
                pwd = ui.getpass()
            try:
                api_token = acquire_api_token(base_url, base_headers, user,
                                              pwd, create_api_token, ui)
            except Exception as e:
                # Top-level boundary: report any failure through the UI.
                ui.fatal(str(e))

        base_headers['content-type'] = 'text/csv; charset=utf8'
        endpoint = base_url + '/'.join((pid, lid, 'predict'))
        encoding = investigate_encoding_and_dialect(
            dataset=dataset,
            sep=delimiter, ui=ui,
            fast=fast_mode,
            encoding=encoding,
            skip_dialect=skip_dialect,
            output_delimiter=output_delimiter)
        if auto_sample:
            #  override n_sample
            n_samples = auto_sampler(dataset, encoding, ui)
            ui.info('auto_sample: will use batches of {} rows'
                    ''.format(n_samples))
        # Make a sync request to check authentication and fail early
        first_row = peek_row(dataset, delimiter, ui, fast_mode, encoding)
        ui.debug('First row for auth request: {}'.format(first_row))
        if fast_mode:
            chunk_formatter = fast_to_csv_chunk
        else:
            chunk_formatter = slow_to_csv_chunk
        first_row_data = chunk_formatter(first_row.data, first_row.fieldnames)
        first_row = first_row._replace(data=first_row_data)
        if not dry_run:
            authorize(user, api_token, n_retry, endpoint, base_headers,
                      first_row, ui)

        ctx = stack.enter_context(
            RunContext.create(resume, n_samples, out_file, pid,
                              lid, keep_cols, n_retry, delimiter,
                              dataset, pred_name, ui, fast_mode,
                              encoding, skip_row_id, output_delimiter, lock))
        network = stack.enter_context(Network(concurrent, timeout, ui))
        # Remember how many checkpoints already exist so that only the
        # batches checkpointed during this run are counted later on.
        n_batches_checkpointed_init = len(ctx.db['checkpoints'])
        ui.debug('number of batches checkpointed initially: {}'
                 .format(n_batches_checkpointed_init))

        MGBQ = MultiprocessingGeneratorBackedQueue(ui, queue, deque, rlock)
        batch_generator_args = ctx.batch_generator_args()
        shovel = Shovel(queue, batch_generator_args, ui)
        ui.info('Shovel go...')
        t2 = time()
        shovel.go()
        ui.info('shoveling complete | total time elapsed {}s'
                .format(time() - t2))

        work_unit_gen = WorkUnitGenerator(MGBQ,
                                          endpoint,
                                          headers=base_headers,
                                          user=user,
                                          api_token=api_token,
                                          ctx=ctx,
                                          pred_name=pred_name,
                                          fast_mode=fast_mode,
                                          ui=ui,
                                          max_batch_size=max_batch_size)
        t0 = time()

        if dry_run:
            # Drain the generator without sending anything over the network.
            for _ in work_unit_gen:
                pass
            ui.info('dry-run complete | time elapsed {}s'.format(time() - t0))
            ui.info('dry-run complete | total time elapsed {}s'.format(
                time() - t1))
        else:
            responses = network.perform_requests(work_unit_gen)
            for i, _ in enumerate(responses, 1):
                ui.info('{} responses sent | time elapsed {}s'
                        .format(i, time() - t0))

            ui.debug('list of checkpointed batches: {}'
                     .format(sorted(ctx.db['checkpoints'])))
            n_batches_checkpointed = (len(ctx.db['checkpoints']) -
                                      n_batches_checkpointed_init)
            ui.debug('number of batches checkpointed: {}'
                     .format(n_batches_checkpointed))
            # Any consumed batch without a new checkpoint counts as dropped.
            n_batches_not_checkpointed = (work_unit_gen.queue.n_consumed -
                                          n_batches_checkpointed)
            batches_missing = n_batches_not_checkpointed > 0
            if batches_missing:
                ui.fatal(('scoring incomplete, {} batches were dropped | '
                          'time elapsed {}s')
                         .format(n_batches_not_checkpointed, time() - t0))
            else:
                ui.info('scoring complete | time elapsed {}s'
                        .format(time() - t0))
                ui.info('scoring complete | total time elapsed {}s'
                        .format(time() - t1))

            # Each checkpoint is unpacked as (<ignored>, batch_len).
            total_done = sum(batch_len
                             for _, batch_len in ctx.db["checkpoints"])

            total_lost = 0
            for bucket in ("warnings", "errors"):
                ui.info('==== Scoring {} ===='.format(bucket))
                msg_data = ctx.db[bucket]
                if not msg_data:
                    continue
                for batch_id in sorted(msg_data.keys()):
                    # The first message carries the batch id; the rest are
                    # indented underneath it.
                    for idx, msg in enumerate(msg_data[batch_id]):
                        if idx == 0:
                            ui.info("{}: {}".format(batch_id, msg))
                        else:
                            ui.info("        {}".format(msg))

                    if bucket == "errors":
                        # batch_id[1] is added to the lost total; presumably
                        # the batch length -- confirm against RunContext.
                        total_lost += batch_id[1]

            ui.info('==== Total stats ====')
            ui.info("done: {} lost: {}".format(total_done, total_lost))