Example no. 1
0
def test_parse_host_no_protocol_fatal():
    """parse_host must report via UI.fatal when the host has no protocol."""
    host = '57a2a9eac808914f2fb8f717.com/api'
    expected_msg = ('Cannot parse "--host" argument. Host address must start '
                    'with a protocol such as "http://" or "https://".'
                    ' Value given: {}').format(host)
    with UI(None, logging.INFO, stdout=False) as ui, \
            mock.patch('datarobot_batch_scoring.utils.UI.fatal') as ui_fatal:
        parse_host(host, ui)
        ui_fatal.assert_called_with(expected_msg)
Example no. 2
0
def test_parse_host_no_protocol_fatal():
    """A protocol-less host address must trigger UI.fatal with a clear message."""
    host = '57a2a9eac808914f2fb8f717.com/api'
    expected_msg = ('Cannot parse "--host" argument. Host address must start '
                    'with a protocol such as "http://" or "https://".'
                    ' Value given: {}').format(host)
    with UI(None, logging.INFO, stdout=False) as ui, \
            mock.patch('datarobot_batch_scoring.utils.UI.fatal') as ui_fatal:
        parse_host(host, ui)
        ui_fatal.assert_called_with(expected_msg)
Example no. 3
0
def test_parse_host(input, expected):
    """The parsed base URL for *input* must equal *expected*."""
    with UI(None, logging.DEBUG, stdout=False) as ui:
        parsed = parse_host(input, ui)
        assert parsed == expected
Example no. 4
0
def test_parse_host(input, expected):
    """parse_host(input) should yield the expected normalized host URL."""
    with UI(None, logging.DEBUG, stdout=False) as ui:
        result = parse_host(input, ui)
        assert result == expected
Example no. 5
0
def main(argv=sys.argv[1:]):
    """Command-line entry point for the batch scoring script.

    Builds the argument parser, merges config-file defaults, validates the
    parsed options and hands them to ``run_batch_predictions``.  Errors are
    reported through the global ``ui`` object, which is always closed on exit.
    """
    global ui  # global variable hack, will get rid of a bit later
    warnings.simplefilter("ignore")
    parser = argparse.ArgumentParser(
        description=DESCRIPTION, epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Provides status updates while " "the script is running."
    )
    parser.add_argument("--version", action="version", version=VERSION_TEMPLATE, help="Show version")
    dataset_gr = parser.add_argument_group("Dataset and server")
    dataset_gr.add_argument(
        "--host",
        type=str,
        help="Specifies the protocol (http or https) and "
        "hostname of the prediction API endpoint. "
        'E.g. "https://example.orm.datarobot.com"',
    )
    dataset_gr.add_argument("project_id", type=str, help="Specifies the project " "identification string.")
    dataset_gr.add_argument("model_id", type=str, help="Specifies the model identification string.")
    dataset_gr.add_argument("dataset", type=str, help="Specifies the .csv input file that " "the script scores.")
    dataset_gr.add_argument(
        "--out",
        type=str,
        nargs="?",
        default="out.csv",
        help="Specifies the file name, "
        "and optionally path, "
        "to which the results are written. "
        "If not specified, "
        "the default file name is out.csv, "
        "written to the directory containing the script. "
        "(default: %(default)r)",
    )
    auth_gr = parser.add_argument_group("Authentication parameters")
    auth_gr.add_argument(
        "--user",
        type=str,
        help="Specifies the username used to acquire " "the api-token. " "Use quotes if the name contains spaces.",
    )
    auth_gr.add_argument(
        "--password",
        type=str,
        nargs="?",
        help="Specifies the password used to acquire " "the api-token. " "Use quotes if the name contains spaces.",
    )
    auth_gr.add_argument(
        "--api_token",
        type=str,
        nargs="?",
        help="Specifies the api token for the requests; "
        "if you do not have a token, "
        "you must specify the password argument.",
    )
    auth_gr.add_argument(
        "--create_api_token",
        action="store_true",
        default=False,
        help="Requests a new API token. To use this option, "
        "you must specify the "
        "password argument for this request "
        "(not the api_token argument). "
        "(default: %(default)r)",
    )
    auth_gr.add_argument(
        "--datarobot_key",
        type=str,
        nargs="?",
        help="An additional datarobot_key " "for dedicated prediction instances.",
    )
    conn_gr = parser.add_argument_group("Connection control")
    conn_gr.add_argument(
        "--timeout", type=int, default=30, help="The timeout for each post request. " "(default: %(default)r)"
    )
    conn_gr.add_argument(
        "--n_samples",
        type=int,
        nargs="?",
        default=False,
        help="Specifies the number of samples (rows) to use "
        'per batch. If not defined the "auto_sample" option '
        "will be used.",
    )
    conn_gr.add_argument(
        "--n_concurrent",
        type=int,
        nargs="?",
        default=4,
        help="Specifies the number of concurrent requests " "to submit. (default: %(default)r)",
    )
    # NOTE: help text fixed — the default is 3 (see ``default=3`` and the
    # ``defaults`` dict below), not -1 as the old text claimed.
    conn_gr.add_argument(
        "--n_retry",
        type=int,
        default=3,
        help="Specifies the number of times DataRobot "
        "will retry if a request fails. "
        "A value of -1 specifies "
        "an infinite number of retries. "
        "(default: %(default)r)",
    )
    conn_gr.add_argument(
        "--resume",
        action="store_true",
        default=False,
        help="Starts the prediction from the point at which "
        "it was halted. "
        "If the prediction stopped, for example due "
        "to error or network connection issue, you can run "
        "the same command with all the same "
        "arguments plus this resume argument.",
    )
    csv_gr = parser.add_argument_group("CSV parameters")
    csv_gr.add_argument(
        "--keep_cols",
        type=str,
        nargs="?",
        help="Specifies the column names to append " "to the predictions. " "Enter as a comma-separated list.",
    )
    csv_gr.add_argument(
        "--delimiter",
        type=str,
        nargs="?",
        default=None,
        help="Specifies the delimiter to recognize in "
        'the input .csv file. E.g. "--delimiter=,". '
        "If not specified, the script tries to automatically "
        'determine the delimiter. The special keyword "tab" '
        "can be used to indicate a tab delimited csv.",
    )
    csv_gr.add_argument(
        "--pred_name",
        type=str,
        nargs="?",
        default=None,
        help="Specifies column name for prediction results, "
        "empty name is used if not specified. For binary "
        "predictions assumes last class in lexical order "
        "as positive",
    )
    csv_gr.add_argument(
        "--fast",
        action="store_true",
        default=False,
        help="Experimental: faster CSV processor. " "Note: does not support multiline csv. ",
    )
    csv_gr.add_argument(
        "--auto_sample",
        action="store_true",
        default=False,
        help='Override "n_samples" and instead '
        "use chunks of about 1.5 MB. This is recommended and "
        'enabled by default if "n_samples" is not defined.',
    )
    csv_gr.add_argument(
        "--encoding",
        type=str,
        default="",
        help="Declare the dataset encoding. "
        "If an encoding is not provided the batch_scoring "
        'script attempts to detect it. E.g "utf-8", "latin-1" '
        'or "iso2022_jp". See the Python docs for a list of '
        "valid encodings "
        "https://docs.python.org/3/library/codecs.html"
        "#standard-encodings",
    )
    csv_gr.add_argument(
        "--skip_dialect",
        action="store_true",
        default=False,
        help="Tell the batch_scoring script " "to skip csv dialect detection.",
    )
    csv_gr.add_argument("--skip_row_id", action="store_true", default=False, help="Skip the row_id column in output.")
    csv_gr.add_argument("--output_delimiter", type=str, default=None, help="Set the delimiter for output file.")
    misc_gr = parser.add_argument_group("Miscellaneous")
    misc_gr.add_argument("-y", "--yes", dest="prompt", action="store_true", help="Always answer 'yes' for user prompts")
    misc_gr.add_argument("-n", "--no", dest="prompt", action="store_false", help="Always answer 'no' for user prompts")
    misc_gr.add_argument(
        "--dry_run", dest="dry_run", action="store_true", help="Only read/chunk input data but dont send " "requests."
    )
    misc_gr.add_argument(
        "--stdout", action="store_true", dest="stdout", default=False, help="Send all log messages to stdout."
    )

    # Baseline defaults; values from the config file (if any) override them.
    defaults = {
        "prompt": None,
        "out": "out.csv",
        "create_api_token": False,
        "timeout": 30,
        "n_samples": False,
        "n_concurrent": 4,
        "n_retry": 3,
        "resume": False,
        "fast": False,
        "stdout": False,
        "auto_sample": False,
    }

    conf_file = get_config_file()
    if conf_file:
        file_args = parse_config_file(conf_file)
        defaults.update(file_args)
    parser.set_defaults(**defaults)
    # A required positional satisfied by the config file must not be demanded
    # on the command line again; relax it unless explicitly passed in argv.
    for action in parser._actions:
        if action.dest in defaults and action.required:
            action.required = False
            if "--" + action.dest not in argv:
                action.nargs = "?"
    # Drop None values so config-file defaults are not clobbered by absent flags.
    parsed_args = {k: v for k, v in vars(parser.parse_args(argv)).items() if v is not None}
    loglevel = logging.DEBUG if parsed_args["verbose"] else logging.INFO
    stdout = parsed_args["stdout"]
    ui = UI(parsed_args.get("prompt"), loglevel, stdout)
    # Never log the password.
    printed_args = copy.copy(parsed_args)
    printed_args.pop("password", None)
    ui.debug(printed_args)
    ui.info("platform: {} {}".format(sys.platform, sys.version))

    # parse args
    host = parsed_args["host"]
    pid = parsed_args["project_id"]
    lid = parsed_args["model_id"]
    n_retry = int(parsed_args["n_retry"])
    if parsed_args.get("keep_cols"):
        keep_cols = [s.strip() for s in parsed_args["keep_cols"].split(",")]
    else:
        keep_cols = None
    concurrent = int(parsed_args["n_concurrent"])
    dataset = parsed_args["dataset"]
    n_samples = int(parsed_args["n_samples"])
    delimiter = parsed_args.get("delimiter")
    resume = parsed_args["resume"]
    out_file = parsed_args["out"]
    datarobot_key = parsed_args.get("datarobot_key")
    timeout = int(parsed_args["timeout"])
    fast_mode = parsed_args["fast"]
    auto_sample = parsed_args["auto_sample"]
    # Without an explicit sample size, fall back to automatic chunking.
    if not n_samples:
        auto_sample = True
    encoding = parsed_args["encoding"]
    skip_dialect = parsed_args["skip_dialect"]
    skip_row_id = parsed_args["skip_row_id"]
    output_delimiter = parsed_args.get("output_delimiter")

    if "user" not in parsed_args:
        user = ui.prompt_user()
    else:
        user = parsed_args["user"].strip()

    if not os.path.exists(parsed_args["dataset"]):
        ui.fatal("file {} does not exist.".format(parsed_args["dataset"]))

    try:
        verify_objectid(pid)
        verify_objectid(lid)
    except ValueError as e:
        ui.fatal(str(e))

    if delimiter == "\\t" or delimiter == "tab":
        # NOTE: on bash you have to use Ctrl-V + TAB
        delimiter = "\t"

    if delimiter and delimiter not in VALID_DELIMITERS:
        ui.fatal('Delimiter "{}" is not a valid delimiter.'.format(delimiter))

    if output_delimiter == "\\t" or output_delimiter == "tab":
        # NOTE: on bash you have to use Ctrl-V + TAB
        output_delimiter = "\t"

    if output_delimiter and output_delimiter not in VALID_DELIMITERS:
        ui.fatal('Output delimiter "{}" is not a valid delimiter.'.format(output_delimiter))

    api_token = parsed_args.get("api_token")
    create_api_token = parsed_args.get("create_api_token")
    pwd = parsed_args.get("password")
    pred_name = parsed_args.get("pred_name")
    dry_run = parsed_args.get("dry_run", False)

    base_url = parse_host(host, ui)

    base_headers = {}
    if datarobot_key:
        base_headers["datarobot-key"] = datarobot_key

    ui.debug("batch_scoring v{}".format(__version__))
    ui.info("connecting to {}".format(base_url))

    try:
        run_batch_predictions(
            base_url=base_url,
            base_headers=base_headers,
            user=user,
            pwd=pwd,
            api_token=api_token,
            create_api_token=create_api_token,
            pid=pid,
            lid=lid,
            n_retry=n_retry,
            concurrent=concurrent,
            resume=resume,
            n_samples=n_samples,
            out_file=out_file,
            keep_cols=keep_cols,
            delimiter=delimiter,
            dataset=dataset,
            pred_name=pred_name,
            timeout=timeout,
            ui=ui,
            fast_mode=fast_mode,
            auto_sample=auto_sample,
            dry_run=dry_run,
            encoding=encoding,
            skip_dialect=skip_dialect,
            skip_row_id=skip_row_id,
            output_delimiter=output_delimiter,
        )
    except SystemError:
        pass
    except ShelveError as e:
        ui.error(str(e))
    except KeyboardInterrupt:
        ui.info("Keyboard interrupt")
    except Exception as e:
        ui.fatal(str(e))
    finally:
        ui.close()
Example no. 6
0
def parse_generic_options(parsed_args):
    """Extract, normalize and validate the options common to all commands.

    Creates the global ``ui`` object, normalizes delimiter aliases,
    validates the dataset path and (unless dry-running) the host, and
    returns a dict of options for the batch-prediction runner.
    """
    global ui
    log_level = logging.DEBUG if parsed_args['verbose'] else logging.INFO
    ui = UI(parsed_args.get('prompt'), log_level, parsed_args['stdout'])

    # Never echo the password back into the debug log.
    safe_args = copy.copy(parsed_args)
    safe_args.pop('password', None)
    ui.debug(safe_args)
    ui.info('platform: {} {}'.format(sys.platform, sys.version))

    raw_keep = parsed_args.get('keep_cols')
    keep_cols = [col.strip() for col in raw_keep.split(',')] if raw_keep else None

    n_samples = int(parsed_args['n_samples'])
    # Without an explicit sample size, fall back to automatic chunking.
    auto_sample = parsed_args['auto_sample'] or not n_samples

    def _normalize(value):
        # 'tab'/'\\t' and 'pipe' are aliases: a literal TAB is awkward to
        # type (bash needs Ctrl-V + TAB) and '|' has issues on Windows.
        if value in ('\\t', 'tab'):
            return '\t'
        if value == 'pipe':
            return '|'
        return value

    delimiter = _normalize(parsed_args.get('delimiter'))
    if delimiter and delimiter not in VALID_DELIMITERS:
        ui.fatal('Delimiter "{}" is not a valid delimiter.'.format(delimiter))

    output_delimiter = _normalize(parsed_args.get('output_delimiter'))
    if output_delimiter and output_delimiter not in VALID_DELIMITERS:
        ui.fatal('Output delimiter "{}" is not a valid delimiter.'.format(
            output_delimiter))

    dataset = parsed_args['dataset']
    if not os.path.exists(dataset):
        ui.fatal('file {} does not exist.'.format(dataset))

    dry_run = parsed_args.get('dry_run', False)
    base_url = "" if dry_run else parse_host(parsed_args.get('host'), ui)

    ui.debug('batch_scoring v{}'.format(__version__))
    ui.info('connecting to {}'.format(base_url))

    return {
        'auto_sample': auto_sample,
        'base_url': base_url,
        'compression': parsed_args['compress'],
        'concurrent': int(parsed_args['n_concurrent']),
        'dataset': dataset,
        'delimiter': delimiter,
        'dry_run': dry_run,
        'encoding': parsed_args['encoding'],
        'fast_mode': parsed_args['fast'],
        'keep_cols': keep_cols,
        'n_retry': int(parsed_args['n_retry']),
        'n_samples': n_samples,
        'out_file': parsed_args['out'],
        'output_delimiter': output_delimiter,
        'pred_name': parsed_args.get('pred_name'),
        'resume': parsed_args['resume'],
        'skip_dialect': parsed_args['skip_dialect'],
        'skip_row_id': parsed_args['skip_row_id'],
        'timeout': int(parsed_args['timeout']),
    }