def get_input_fields(resource, referrer=None):
    """Return the new list of input fields for ``resource``.

    Compares the resource's ``input_fields`` (by field name) against the
    fields of the ``referrer`` resource it was built from.  If the input
    fields are just the referrer's full field list (or, for models, the
    full list minus the objective field), an empty list is returned,
    meaning "nothing special to declare".  Otherwise the referrer's field
    ids are returned.

    :param resource: resource JSON structure (dict) whose input fields
                     are being inspected
    :param referrer: JSON structure (dict) of the resource this one was
                     created from; defaults to an empty dict
    """
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        # Seeded with an empty list so that a resource with no input
        # fields also matches and returns [] below.
        referrer_input_fields = [[]]
        # compare fields by name (ids differ between resources)
        resource_fields = Fields({
            'resource': resource['resource'], 'object': resource})
        referrer_fields = Fields({
            'resource': referrer['resource'], 'object': referrer})
        input_fields = [resource_fields.field_name(field_id)
                        for field_id in input_fields_ids]
        input_fields = sorted(input_fields)
        referrer_type = get_resource_type(referrer)
        if referrer_type == 'dataset':
            # datasets contribute only their preferred fields
            referrer_fields = Fields(referrer_fields.preferred_fields())
            referrer_fields_names = sorted(
                [field['name'] for _, field in
                 referrer_fields.fields.items()])
        else:
            referrer_fields_names = sorted(
                referrer_fields.fields_by_name.keys())
        # check referrer input fields to see if they are equal
        referrer_input_fields.append(referrer_fields_names)
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            # objective_field may be a bare id or a {'id': ...} dict
            try:
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            referrer_objective = resource_fields.field_name(objective_id)
            # the referrer field list minus the objective is also a
            # "default" input-field set for models
            referrer_input_fields.append([
                name for name in referrer_fields_names
                if name != referrer_objective])
        if input_fields in referrer_input_fields:
            return []
    # NOTE(review): if ``referrer`` is empty/None this line raises
    # NameError because ``referrer_fields`` was never bound — confirm
    # whether callers always pass a non-empty referrer.
    return referrer_fields.fields.keys()
def get_input_fields(resource, referrer=None):
    """Return the new list of input fields for ``resource``.

    The resource's ``input_fields`` are mapped to field names and
    compared with the field names of ``referrer`` (the resource it was
    derived from).  When they match one of the referrer's "default"
    field sets — all fields, or (for models) all fields except the
    objective — an empty list is returned; otherwise the referrer's
    field ids are returned.

    :param resource: resource JSON structure (dict)
    :param referrer: originating resource JSON structure (dict);
                     defaults to an empty dict
    """
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        # start with the empty set so a resource without input fields
        # also counts as "default" and yields [] below
        referrer_input_fields = [[]]
        # compare fields by name
        resource_fields = Fields(
            {'resource': resource['resource'], 'object': resource})
        referrer_fields = Fields(
            {'resource': referrer['resource'], 'object': referrer})
        input_fields = [resource_fields.field_name(field_id)
                        for field_id in input_fields_ids]
        input_fields = sorted(input_fields)
        referrer_type = get_resource_type(referrer)
        if referrer_type == 'dataset':
            # for datasets only preferred fields are considered
            referrer_fields = Fields(referrer_fields.preferred_fields())
            referrer_fields_names = sorted(
                [field['name'] for _, field in
                 referrer_fields.fields.items()])
        else:
            referrer_fields_names = sorted(
                referrer_fields.fields_by_name.keys())
        # check referrer input fields to see if they are equal
        referrer_input_fields.append(referrer_fields_names)
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            # the objective may come as a plain id or as {'id': ...}
            try:
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            referrer_objective = resource_fields.field_name(
                objective_id)
            referrer_input_fields.append([name for name in
                                          referrer_fields_names
                                          if name != referrer_objective])
        if input_fields in referrer_input_fields:
            return []
    # NOTE(review): when ``referrer`` is falsy ``referrer_fields`` is
    # undefined here and this raises NameError — verify callers never
    # invoke this without a referrer.
    return referrer_fields.fields.keys()
def reify_dataset(self, resource_id):
    """Extracts the REST API arguments from the dataset JSON structure.

    Inspects the stored dataset resource, derives the create/update/get
    arguments that would reproduce it through the API, and registers the
    resulting calls via ``self.add``.

    :param resource_id: id of the dataset resource to reify
    """
    child = self.get_resource(resource_id)
    origin, parent_id = u.get_origin_info(child)
    parent = self.get_resource(parent_id)
    opts = {"create": {}, "update": {}, "get": {}}
    # as two-steps result from a cluster or batch prediction, centroid
    # or anomaly score
    grandparent = parent
    if origin in ['origin_batch_resource', 'cluster']:
        if origin == "cluster":
            opts['create'].update({"centroid": child['centroid']})
        grandparents = u.get_origin_info(parent)
        # batch resources have two parents, choose the dataset
        if origin == "origin_batch_resource" and \
                isinstance(grandparents, list):
            for gp_origin, grandparent in grandparents:
                if gp_origin == "dataset":
                    break
        else:
            _, grandparent = grandparents
        grandparent = self.get_resource(grandparent)
    # options common to all model types; batch-generated datasets can
    # only be configured via update, not at creation time
    call = "update" if origin == "origin_batch_resource" else "create"
    u.common_dataset_opts(child, grandparent, opts, call=call)
    # update options: keep only settings that differ from the defaults
    dataset_defaults = DEFAULTS["dataset"].get("update", {})
    for attribute, default_value in dataset_defaults.items():
        opts["update"].update(
            u.default_setting(child, attribute, *default_value))
    # name, exclude automatic naming alternatives
    autonames = [u'']
    u.non_automatic_name(child, opts, autonames=autonames)
    # objective field: only emitted when it is not the API's default
    # choice (the last non-text column)
    resource_fields = Fields(
        {'resource': child['resource'], 'object': child})
    objective_id = child['objective_field']['id']
    preferred_fields = resource_fields.preferred_fields()
    # if there's no preferred fields, use the fields structure
    if len(preferred_fields.keys()) == 0:
        preferred_fields = resource_fields.fields
    max_column = sorted([field['column_number']
                         for _, field in preferred_fields.items()
                         if field['optype'] != "text"],
                        reverse=True)[0]
    objective_column = resource_fields.fields[objective_id][
        'column_number']
    if objective_column != max_column:
        opts['create'].update({"objective_field": {"id": objective_id}})
    if origin != "origin_batch_resource":
        # resize
        if (child['size'] != grandparent['size'] and
                get_resource_type(parent) == 'source'):
            opts['create'].update({"size": child['size']})
    # generated fields: the API expects the generator under "field"
    if child.get('new_fields', None):
        new_fields = child['new_fields']
        for new_field in new_fields:
            new_field['field'] = new_field['generator']
            del new_field['generator']
        opts['create'].update({"new_fields": new_fields})
    u.range_opts(child, grandparent, opts)
    # for batch_predictions, batch_clusters, batch_anomalies generated
    # datasets, attributes cannot be set at creation time, so we
    # must update the resource instead
    suffix = None
    if origin == "origin_batch_resource":
        opts["update"].update(opts["create"])
        opts["create"] = {}
        suffix = "['object']['output_dataset_resource']"
    calls = u.build_calls(resource_id, [parent_id], opts, suffix=suffix)
    self.add(resource_id, calls)
dataset = json.loads(dataset_handler.read()) except IOError: dataset = api.check_resource(dataset_id, query_string=ALL_FIELDS_QS) # initial feature set fields = Fields(dataset) excluded_features = ([] if args.exclude_features is None else args.exclude_features.split( args.args_separator)) try: excluded_ids = [fields.field_id(feature) for feature in excluded_features] objective_id = fields.field_id(objective_name) except ValueError, exc: sys.exit(exc) field_ids = [field_id for field_id in fields.preferred_fields() if field_id != objective_id and not field_id in excluded_ids] # headers are extended with a column per field fields_names = [fields.field_name(field_id) for field_id in field_ids] features_header.extend(fields_names) features_writer.writerow(features_header) initial_state = [False for field_id in field_ids] open_list = [(initial_state, - float('inf'), -float('inf'), 0)] closed_list = [] best_state, best_score, best_metric_value, best_counter = open_list[0] best_unchanged_count = 0 metric = args.optimize while best_unchanged_count < staleness and open_list: loop_counter += 1 features_set = find_max_state(open_list)
dataset = json.loads(dataset_handler.read()) except IOError: dataset = api.check_resource(dataset_id, query_string=ALL_FIELDS_QS) # initial feature set fields = Fields(dataset) excluded_features = ([] if args.exclude_features is None else args.exclude_features.split(args.args_separator)) try: excluded_ids = [ fields.field_id(feature) for feature in excluded_features ] objective_id = fields.field_id(objective_name) except ValueError, exc: sys.exit(exc) field_ids = [ field_id for field_id in fields.preferred_fields() if field_id != objective_id and not field_id in excluded_ids ] field_ids.sort() # headers are extended with a column per field fields_names = [fields.field_name(field_id) for field_id in field_ids] features_header.extend(fields_names) features_writer.writerow(features_header) initial_state = [False for field_id in field_ids] open_list = [(initial_state, -float('inf'), -float('inf'), 0)] closed_list = [] best_state, best_score, best_metric_value, best_counter = open_list[0] best_unchanged_count = 0 metric = args.optimize while best_unchanged_count < staleness and open_list: loop_counter += 1
def reify_dataset(self, resource_id):
    """Extracts the REST API arguments from the dataset JSON structure.

    Rebuilds the create/update/get argument dicts that would recreate
    this dataset through the API and records the calls with
    ``self.add``.

    :param resource_id: id of the dataset resource to reify
    """
    child = self.get_resource(resource_id)
    origin, parent_id = u.get_origin_info(child)
    parent = self.get_resource(parent_id)
    opts = {"create": {}, "update": {}, "get": {}}
    # as two-steps result from a cluster or batch prediction, centroid
    # or anomaly score
    grandparent = parent
    if origin in ['origin_batch_resource', 'cluster']:
        if origin == "cluster":
            opts['create'].update({"centroid": child['centroid']})
        grandparents = u.get_origin_info(parent)
        # batch resources have two parents, choose the dataset
        if origin == "origin_batch_resource" and \
                isinstance(grandparents, list):
            for gp_origin, grandparent in grandparents:
                if gp_origin == "dataset":
                    break
        else:
            _, grandparent = grandparents
        grandparent = self.get_resource(grandparent)
    # options common to all model types; batch-generated datasets take
    # their attributes through update rather than create
    call = "update" if origin == "origin_batch_resource" else "create"
    u.common_dataset_opts(child, grandparent, opts, call=call)
    # update options that differ from the documented defaults
    dataset_defaults = DEFAULTS["dataset"].get("update", {})
    for attribute, default_value in dataset_defaults.items():
        opts["update"].update(
            u.default_setting(child, attribute, *default_value))
    # name, exclude automatic naming alternatives
    autonames = [u'']
    u.non_automatic_name(child, opts, autonames=autonames)
    # objective field: emitted only when it differs from the default
    # (last non-text column)
    resource_fields = Fields(
        {'resource': child['resource'], 'object': child})
    objective_id = child['objective_field']['id']
    preferred_fields = resource_fields.preferred_fields()
    # if there's no preferred fields, use the fields structure
    if len(preferred_fields.keys()) == 0:
        preferred_fields = resource_fields.fields
    max_column = sorted([field['column_number']
                         for _, field in preferred_fields.items()
                         if field['optype'] != "text"],
                        reverse=True)[0]
    objective_column = resource_fields.fields[objective_id][
        'column_number']
    if objective_column != max_column:
        opts['create'].update({"objective_field": {"id": objective_id}})
    if origin != "origin_batch_resource":
        # resize
        if (child['size'] != grandparent['size'] and
                get_resource_type(parent) == 'source'):
            opts['create'].update({"size": child['size']})
    # generated fields: the API takes the generator under the "field" key
    if child.get('new_fields', None):
        new_fields = child['new_fields']
        for new_field in new_fields:
            new_field['field'] = new_field['generator']
            del new_field['generator']
        opts['create'].update({"new_fields": new_fields})
    u.range_opts(child, grandparent, opts)
    # for batch_predictions, batch_clusters, batch_anomalies generated
    # datasets, attributes cannot be set at creation time, so we
    # must update the resource instead
    suffix = None
    if origin == "origin_batch_resource":
        opts["update"].update(opts["create"])
        opts["create"] = {}
        suffix = "['object']['output_dataset_resource']"
    calls = u.build_calls(resource_id, [parent_id], opts, suffix=suffix)
    self.add(resource_id, calls)
def reify_dataset(self, resource_id):
    """Extracts the REST API arguments from the dataset JSON structure.

    Derives the create/update arguments needed to rebuild this dataset
    through the API and registers the calls via ``self.add``.

    :param resource_id: id of the dataset resource to reify
    """
    child = self.get_resource(resource_id)
    origin, parent_id = u.get_origin_info(child)
    parent = self.get_resource(parent_id)
    opts = {"create": {}, "update": {}}
    # as two-steps result from a cluster or batch prediction, centroid
    # or anomaly score
    if origin in ["origin_batch_resource", "cluster"]:
        if origin == "cluster":
            opts["create"].update({"centroid": child["centroid"]})
        _, grandparent = u.get_origin_info(parent)
        grandparent = self.get_resource(grandparent)
    else:
        grandparent = parent
    # options common to all model types
    u.common_dataset_opts(child, grandparent, opts)
    # update options: record only values that differ from the defaults
    dataset_defaults = DEFAULTS["dataset"].get("update", {})
    dataset_defaults.update(COMMON_DEFAULTS.get("update", {}))
    for attribute, default_value in dataset_defaults.items():
        opts["update"].update(u.default_setting(child, attribute,
                                                *default_value))
    # name, exclude automatic naming alternatives: every name the API
    # could have auto-generated for this dataset is listed so a matching
    # name is not replayed explicitly
    autonames = [u""]
    suffixes = [
        u"filtered", u"sampled", u"dataset", u"extended",
        u"- batchprediction", u"- batchanomalyscore", u"- batchcentroid",
        u"- merged",
    ]
    autonames.extend([u"%s %s" % (grandparent.get("name", ""), suffix)
                      for suffix in suffixes])
    autonames.append(u"%s's dataset" %
                     ".".join(parent["name"].split(".")[0:-1]))
    autonames.append(u"%s' dataset" %
                     ".".join(parent["name"].split(".")[0:-1]))
    # centroid ids are hexadecimal strings, hence base=16
    autonames.append(u"Cluster %s - %s" %
                     (int(child.get("centroid", "0"), base=16),
                      parent["name"]))
    autonames.append(u"Dataset from %s model - segment" % parent["name"])
    u.non_automatic_name(child, opts, autonames=autonames)
    # objective field: only emitted when it is not the default choice
    # (the highest-numbered preferred column)
    resource_fields = Fields({"resource": child["resource"],
                              "object": child})
    objective_id = child["objective_field"]["id"]
    preferred_fields = resource_fields.preferred_fields()
    max_column = sorted([field["column_number"]
                         for _, field in preferred_fields.items()],
                        reverse=True)[0]
    objective_column = resource_fields.fields[objective_id]["column_number"]
    if objective_column != max_column:
        opts["create"].update({"objective_field": {"id": objective_id}})
    # resize
    if child["size"] != grandparent["size"] and \
            get_resource_type(parent) == "source":
        opts["create"].update({"size": child["size"]})
    # generated fields: the API expects the generator under "field"
    if child.get("new_fields", None):
        new_fields = child["new_fields"]
        for new_field in new_fields:
            new_field["field"] = new_field["generator"]
            del new_field["generator"]
        opts["create"].update({"new_fields": new_fields})
    u.range_opts(child, grandparent, opts)
    calls = u.build_calls(resource_id, [parent_id], opts)
    self.add(resource_id, calls)
def reify_dataset(self, resource_id):
    """Extracts the REST API arguments from the dataset JSON structure.

    Works out the create/update arguments that would reproduce this
    dataset via the API and records them through ``self.add``.

    :param resource_id: id of the dataset resource to reify
    """
    child = self.get_resource(resource_id)
    origin, parent_id = u.get_origin_info(child)
    parent = self.get_resource(parent_id)
    opts = {"create": {}, "update": {}}
    # as two-steps result from a cluster or batch prediction, centroid
    # or anomaly score
    if origin in ['origin_batch_resource', 'cluster']:
        if origin == "cluster":
            opts['create'].update({"centroid": child['centroid']})
        _, grandparent = u.get_origin_info(parent)
        grandparent = self.get_resource(grandparent)
    else:
        grandparent = parent
    # options common to all model types
    u.common_dataset_opts(child, grandparent, opts)
    # update options: only non-default values are replayed
    dataset_defaults = DEFAULTS["dataset"].get("update", {})
    dataset_defaults.update(COMMON_DEFAULTS.get("update", {}))
    for attribute, default_value in dataset_defaults.items():
        opts["update"].update(
            u.default_setting(child, attribute, *default_value))
    # name, exclude automatic naming alternatives: list every name the
    # API could have auto-assigned so it is not re-sent explicitly
    autonames = [u'']
    suffixes = [
        u"filtered", u"sampled", u"dataset", u"extended",
        u"- batchprediction", u"- batchanomalyscore", u"- batchcentroid",
        u"- merged"
    ]
    autonames.extend([
        u'%s %s' % (grandparent.get('name', ''), suffix)
        for suffix in suffixes])
    autonames.append(u"%s's dataset" %
                     '.'.join(parent['name'].split('.')[0:-1]))
    autonames.append(u"%s' dataset" %
                     '.'.join(parent['name'].split('.')[0:-1]))
    # centroid ids are hexadecimal, hence base=16
    autonames.append(
        u"Cluster %s - %s" % (int(child.get('centroid', "0"), base=16),
                              parent['name']))
    autonames.append(u"Dataset from %s model - segment" % parent['name'])
    u.non_automatic_name(child, opts, autonames=autonames)
    # objective field: emitted only when it is not the default (the
    # highest-numbered preferred column)
    resource_fields = Fields({
        'resource': child['resource'], 'object': child})
    objective_id = child['objective_field']['id']
    preferred_fields = resource_fields.preferred_fields()
    max_column = sorted(
        [field['column_number'] for _, field in preferred_fields.items()],
        reverse=True)[0]
    objective_column = resource_fields.fields[objective_id][
        'column_number']
    if objective_column != max_column:
        opts['create'].update({"objective_field": {"id": objective_id}})
    # resize
    if (child['size'] != grandparent['size'] and
            get_resource_type(parent) == 'source'):
        opts['create'].update({"size": child['size']})
    # generated fields: the API takes the generator under the "field" key
    if child.get('new_fields', None):
        new_fields = child['new_fields']
        for new_field in new_fields:
            new_field['field'] = new_field['generator']
            del new_field['generator']
        opts['create'].update({"new_fields": new_fields})
    u.range_opts(child, grandparent, opts)
    calls = u.build_calls(resource_id, [parent_id], opts)
    self.add(resource_id, calls)
def best_first_search(datasets_file, api, args, common_options,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction.

    Greedy best-first search over feature subsets: starting from the
    empty subset, repeatedly expands the best-scoring state in the open
    list, evaluating each new subset with k-fold cross-validation, and
    stops after ``staleness`` iterations without improvement (or when
    the open list is exhausted).  Progress is appended to a CSV log in
    ``args.output_dir`` and summarized through ``u.log_message``.

    :param datasets_file: path to the file listing generated dataset ids
    :param api: BigML API connection object
    :param args: parsed command-line arguments namespace
    :param common_options: options shared by the evaluation commands
    :param staleness: iterations without improvement before stopping
    :param penalty: per-feature score penalty passed to the evaluator
    :param objective_name: name of the objective field to exclude from
                           the candidate features
    :param resume: whether to resume previously run evaluations

    NOTE(review): reads the module-level globals ``session_file``,
    ``FEATURES_LOG``, ``DEFAULT_STALENESS``, ``DEFAULT_PENALTY``,
    ``EPSILON`` and ``PERCENT_EVAL_METRICS`` — confirm they are defined
    in the enclosing module.
    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(os.path.join(args.output_dir,
                                                  FEATURES_LOG))
    # buffering=0 keeps the features log flushed line by line (Python 2)
    with open(features_file, 'w', 0) as features_handler:
        features_writer = csv.writer(features_handler, lineterminator="\n")
        features_writer.writerow([
            "step", "state", "score", "metric_value", "best_score"])
        features_handler.flush()
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_PENALTY
        # retrieving the first dataset in the file
        try:
            with open(datasets_file) as datasets_handler:
                dataset_id = datasets_handler.readline().strip()
        except IOError, exc:
            sys.exit("Could not read the generated datasets file: %s" %
                     str(exc))
        dataset = api.check_resource(dataset_id, api.get_dataset)
        # initial feature set: all preferred fields minus the objective
        # and any user-excluded features
        fields = Fields(dataset)
        excluded_features = ([] if args.exclude_features is None else
                             args.exclude_features.split(
                                 args.args_separator))
        excluded_ids = [fields.field_id(feature) for
                        feature in excluded_features]
        objective_id = fields.field_id(objective_name)
        field_ids = [field_id for field_id in fields.preferred_fields()
                     if field_id != objective_id and
                     not field_id in excluded_ids]
        # a state is a boolean inclusion mask over field_ids
        initial_state = [False for field_id in field_ids]
        open_list = [(initial_state, -float('inf'), -float('inf'))]
        closed_list = []
        best_state, best_score, best_metric_value = open_list[0]
        best_unchanged_count = 0
        metric = args.maximize
        while best_unchanged_count < staleness and open_list:
            loop_counter += 1
            features_set = find_max_state(open_list)
            state, score, metric_value = features_set
            features_writer.writerow([
                loop_counter, [int(in_set) for in_set in state],
                score, metric_value, best_score])
            features_handler.flush()
            state_fields = [fields.field_name(field_ids[index])
                            for (index, in_set) in enumerate(state)
                            if in_set]
            closed_list.append(features_set)
            open_list.remove(features_set)
            # EPSILON guards against accepting float-noise improvements
            if (score - EPSILON) > best_score:
                best_state, best_score, best_metric_value = features_set
                best_unchanged_count = 0
                if state_fields:
                    message = 'New best state: %s\n' % (state_fields)
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)
                    if metric in PERCENT_EVAL_METRICS:
                        message = '%s = %0.2f%% (score = %s)\n' % (
                            metric.capitalize(), metric_value * 100, score)
                    else:
                        message = '%s = %f (score = %s)\n' % (
                            metric.capitalize(), metric_value, score)
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)
            else:
                best_unchanged_count += 1
            # expand the current state and evaluate unseen children
            children = expand_state(state)
            for child in children:
                if (child not in [state for state, _, _ in open_list] and
                        child not in [state for state, _, _
                                      in closed_list]):
                    input_fields = [fields.field_name(field_id)
                                    for (i, field_id)
                                    in enumerate(field_ids) if child[i]]
                    # create models and evaluation with input_fields
                    args.model_fields = args.args_separator.join(
                        input_fields)
                    counter += 1
                    (score, metric_value,
                     metric, resume) = kfold_evaluate(datasets_file,
                                                      args, counter,
                                                      common_options,
                                                      penalty=penalty,
                                                      resume=resume,
                                                      metric=metric)
                    open_list.append((child, score, metric_value))
        # report the winning subset and search statistics
        best_features = [fields.field_name(field_ids[i]) for
                         (i, score) in enumerate(best_state) if score]
        message = (u'The best feature subset is: %s \n'
                   % u", ".join(best_features))
        u.log_message(message, log_file=session_file, console=1)
        if metric in PERCENT_EVAL_METRICS:
            message = (u'%s = %0.2f%%\n' % (metric.capitalize(),
                                            (best_metric_value * 100)))
        else:
            message = (u'%s = %f\n' % (metric.capitalize(),
                                       best_metric_value))
        u.log_message(message, log_file=session_file, console=1)
        message = (u'Evaluated %d/%d feature subsets\n' %
                   ((len(open_list) + len(closed_list)),
                    2 ** len(field_ids)))
        u.log_message(message, log_file=session_file, console=1)