예제 #1
0
def local_anomaly_score(anomalies, test_reader, output, args, exclude=None):
    """Get local anomaly detector and issue anomaly score prediction

    """
    # Only one anomaly detector at present
    local_anomaly = Anomaly(anomalies[0], api=args.retrieve_api_)
    test_set_header = test_reader.has_headers()
    for input_data in test_reader:
        input_data_dict = test_reader.dict(input_data, filtering=False)
        try:
            anomaly_score_info = {"score": local_anomaly.anomaly_score(input_data_dict, by_name=test_set_header)}
        except Exception:
            anomaly_score_info = {"score": NO_ANOMALY_SCORE}
        write_anomaly_score(anomaly_score_info["score"], output, args.prediction_info, input_data, exclude)
예제 #2
0
def local_anomaly_score(anomalies, test_reader, output, args,
                        exclude=None):
    """Get local anomaly detector and issue anomaly score prediction

    """
    # Only one anomaly detector at present
    local_anomaly = Anomaly(anomalies[0], api=args.retrieve_api_)
    for input_data in test_reader:
        input_data_dict = test_reader.dict(input_data, filtering=False)
        try:
            anomaly_score_info = {'score': local_anomaly.anomaly_score(
                input_data_dict)}
        except Exception:
            anomaly_score_info = {'score': NO_ANOMALY_SCORE}
        write_anomaly_score(anomaly_score_info['score'], output,
                            args.prediction_info, input_data, exclude)
예제 #3
0
def i_create_local_anomaly_from_file(step, export_file):
    world.local_anomaly = Anomaly(res_filename(export_file))
예제 #4
0
def create_dataset_with_anomalies(step):
    local_anomalies = Anomaly(world.anomaly['resource'])
    world.dataset = world.api.create_dataset(
        world.dataset['resource'],
        {"lisp_filter": local_anomalies.anomalies_filter()})
    world.datasets.append(world.dataset['resource'])
def i_create_a_local_anomaly(step):
    world.local_anomaly = Anomaly(world.anomaly['resource'])
예제 #6
0
def create_dataset_with_anomalies(step):
    local_anomalies = Anomaly(world.anomaly['resource'])
    world.dataset = world.api.create_dataset(
        world.dataset['resource'],
        {"lisp_filter": local_anomalies.anomalies_filter()})
    world.datasets.append(world.dataset['resource'])
예제 #7
0
def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    anomaly = None
    anomalies = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and (args.public_anomaly or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])
    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset, resume,
     csv_properties, fields) = dataset_properties
    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file,
            csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and r.shared_changed(args.shared, anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and (args.test_set or args.export_fields):
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies "
                     " dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        args.anomaly_filter_ = local_anomaly.anomalies_filter(include=include)
        _, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
    # If predicting
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']
    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)
        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path, log=log)

        else:
            anomaly_score(anomalies, fields, args, session_file=session_file)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
예제 #8
0
def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    anomaly = None
    anomalies = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and (args.public_anomaly or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])
    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (dataset, datasets, test_dataset, resume,
     csv_properties, fields) = dataset_properties
    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file,
            csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and r.shared_changed(args.shared, anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and args.test_set:
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies "
                     " dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        args._anomaly_filter = local_anomaly.anomalies_filter(include=include)
        new_dataset, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
    # If predicting
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']
    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)
        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path, log=log)

        else:
            anomaly_score(anomalies, fields, args, session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
예제 #9
0
api = BigML(dev_mode=True)
model = api.get_model("model/563a1c7a3cd25747430023ce")
prediction = api.create_prediction(model, {"petal length": 4.07, "sepal width": 3.15, "petal width": 1.51})

local_model = Model("model/56430eb8636e1c79b0001f90", api=api)
prediction = local_model.predict(
    {"petal length": 0.96, "sepal width": 4.1, "petal width": 2.52}, 2, add_confidence=True, multiple=3
)

local_model = Ensemble("ensemble/564a02d5636e1c79b5006e13", api=api)
local_model = Ensemble("ensemble/564a081bc6c19b6cf3011c60", api=api)
prediction = local_model.predict(
    {"petal length": 0.95, "sepal width": 3.9, "petal width": 1.51, "sepal length": 7.0}, method=2, add_confidence=True
)

local_ensemble = Ensemble("ensemble/564623d4636e1c79b00051f7", api=api)
prediction = local_ensemble.predict({"Price": 5.8, "Grape": "Pinot Grigio", "Country": "Italy", "Rating": 92}, True)

local_anomaly = Anomaly("anomaly/564c5a76636e1c3d52000007", api=api)
prediction = local_anomaly.anomaly_score(
    {"petal length": 4.07, "sepal width": 3.15, "petal width": 1.51, "sepal length": 6.02, "species": "Iris-setosa"},
    True,
)
prediction = local_anomaly.anomaly_score(
    {"petal length": 0.96, "sepal width": 4.1, "petal width": 2.51, "sepal length": 6.02, "species": "Iris-setosa"},
    True,
)
prediction = local_anomaly.anomaly_score({"petal length": 0.96, "sepal width": 4.1, "petal width": 2.51}, True)

api.pprint(prediction)