def get_fields_structure(resource, errors=False):
    """Returns the field structure for a resource, its locale and
    missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    field_errors = None
    if resource_type in RESOURCES_WITH_FIELDS:
        resource = resource.get('object', resource)
        # locale and missing tokens
        if resource_type == SOURCE_PATH:
            resource_locale = resource['source_parser']['locale']
            missing_tokens = resource[
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource.get('locale', DEFAULT_LOCALE)
            missing_tokens = resource.get('missing_tokens',
                                          DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in [MODEL_PATH, ANOMALY_PATH]:
            fields = resource['model']['fields']
        elif resource_type == CLUSTER_PATH:
            fields = resource['clusters']['fields']
        elif resource_type == CORRELATION_PATH:
            fields = resource['correlations']['fields']
        elif resource_type == STATISTICAL_TEST_PATH:
            fields = resource['statistical_tests']['fields']
        elif resource_type == LOGISTIC_REGRESSION_PATH:
            fields = resource['logistic_regression']['fields']
        elif resource_type == ASSOCIATION_PATH:
            fields = resource['associations']['fields']
        elif resource_type == TOPIC_MODEL_PATH:
            fields = resource['topic_model']['fields']
        elif resource_type == SAMPLE_PATH:
            fields = dict([(field['id'], field) for field in
                           resource['sample']['fields']])
        else:
            fields = resource['fields']
        # Check whether there's an objective id
        objective_column = None
        if resource_type == DATASET_PATH:
            objective_column = resource.get(
                'objective_field', {}).get('id')
            if errors:
                field_errors = resource.get("status",
                                            {}).get("field_errors")
        elif resource_type in [MODEL_PATH, LOGISTIC_REGRESSION_PATH]:
            objective_id = resource.get(
                'objective_fields', [None])[0]
            objective_column = fields.get(
                objective_id, {}).get('column_number')
        result = fields, resource_locale, missing_tokens, objective_column
        if errors:
            result = result + (field_errors,)
        return result
    else:
        return ((None, None, None, None, None) if errors
                else (None, None, None, None))

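# A minimal, hypothetical usage sketch for the variant above: the dataset id
# is a placeholder and exported BIGML_USERNAME/BIGML_API_KEY credentials are
# assumed. With errors=True the returned tuple grows from four elements to
# five, appending the per-field error summary.
def _example_get_fields_structure():
    from bigml.api import BigML

    api = BigML()  # assumes credentials in the environment
    dataset = api.get_dataset("dataset/5143a51a37203f2cf7000972")
    (fields, locale, missing_tokens,
     objective_column, field_errors) = get_fields_structure(dataset,
                                                            errors=True)
    # fields maps field ids to their descriptors; objective_column holds
    # the column number of the objective field, when one is set
    return fields, objective_column, field_errors
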
def get_fields_structure(resource):
    """Returns the field structure for a resource, its locale and
    missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    if resource_type in RESOURCES_WITH_FIELDS:
        # locale and missing tokens
        if resource_type == SOURCE_TYPE:
            resource_locale = resource['object']['source_parser']['locale']
            missing_tokens = resource['object'][
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource['object'].get('locale',
                                                     DEFAULT_LOCALE)
            missing_tokens = resource['object'].get('missing_tokens',
                                                    DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in [MODEL_TYPE, ANOMALY_TYPE]:
            fields = resource['object']['model']['fields']
        elif resource_type == CLUSTER_TYPE:
            fields = resource['object']['clusters']['fields']
        else:
            fields = resource['object']['fields']
        return fields, resource_locale, missing_tokens
    else:
        return None, None, None

def get_fields_structure(resource):
    """Returns the field structure for a resource, its locale and
    missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    if resource_type in RESOURCES_WITH_FIELDS:
        if resource_type == SOURCE_TYPE:
            resource_locale = resource['object']['source_parser']['locale']
            missing_tokens = resource['object'][
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource['object']['locale']
            missing_tokens = resource['object']['missing_tokens']
        if resource_type == MODEL_TYPE:
            fields = resource['object']['model']['fields']
        else:
            fields = resource['object']['fields']
        return fields, resource_locale, missing_tokens
    else:
        return None, None, None

def retrieve_resource(api, resource_id, query_string=ONLY_MODEL,
                      no_check_fields=False):
    """ Retrieves resource info either from a local repo or
    from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep,
                                          resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            # we check that the stored resource has enough fields information
            # for local predictions to work. Otherwise we should retrieve it.
            if no_check_fields or check_model_fields(resource):
                return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON" %
                             stored_resource)
        except IOError:
            pass
    if api.auth == '?username=;api_key=;':
        raise ValueError("The credentials information is missing. This"
                         " information is needed to download resource %s"
                         " for the first time and store it locally for"
                         " further use. Please export BIGML_USERNAME"
                         " and BIGML_API_KEY." % resource_id)
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource

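# Hypothetical sketch of the caching behavior above: with a storage directory
# set, a retrieved resource is mirrored to disk by the api getters, so the
# second call reads the local JSON instead of hitting the server. The model
# id is a placeholder.
def _example_retrieve_resource(api=None):
    from bigml.api import BigML

    api = api or BigML(storage="./storage")
    model_id = "model/5143a51a37203f2cf7000973"
    model = retrieve_resource(api, model_id)
    # served from ./storage/model_5143a51a37203f2cf7000973 if the stored
    # copy passes the check_model_fields test
    return retrieve_resource(api, model_id)
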
def retrieve_resource(api, resource_id, query_string='limit=-1;',
                      no_check_fields=False):
    """ Retrieves resource info either from a local repo or
    from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep,
                                          resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            # we check that the stored resource has enough fields information
            # for local predictions to work. Otherwise we should retrieve it.
            if no_check_fields or check_model_fields(resource):
                return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON" %
                             stored_resource)
        except IOError:
            pass
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource

def get_fields_structure(resource):
    """Returns the field structure for a resource, its locale and
    missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    if resource_type in RESOURCES_WITH_FIELDS:
        # locale and missing tokens
        if resource_type == SOURCE_TYPE:
            resource_locale = resource['object']['source_parser']['locale']
            missing_tokens = resource['object'][
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource['object'].get('locale',
                                                     DEFAULT_LOCALE)
            missing_tokens = resource['object'].get('missing_tokens',
                                                    DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in [MODEL_TYPE, ANOMALY_TYPE]:
            fields = resource['object']['model']['fields']
        elif resource_type == CLUSTER_TYPE:
            fields = resource['object']['clusters']['fields']
        elif resource_type == SAMPLE_TYPE:
            fields = dict([(field['id'], field) for field in
                           resource['object']['sample']['fields']])
        else:
            fields = resource['object']['fields']
        return fields, resource_locale, missing_tokens
    else:
        return None, None, None

def __init__(self, model, api=None):
    resource_id, model = extract_id(model)
    resource_type = get_resource_type(resource_id)
    local_model = COMPONENT_CLASSES[resource_type](model, api=api)
    self.__class__.__bases__ = local_model.__class__.__bases__
    for attr, value in local_model.__dict__.items():
        setattr(self, attr, value)
    self.local_model = local_model

def __init__(self, resource_or_fields, missing_tokens=None,
             data_locale=None, verbose=False,
             objective_field=None, objective_field_present=False,
             include=None, errors=None):
    # The constructor can be instantiated with resources or a fields
    # structure. The structure is checked and fields structure is returned
    # if a resource type is matched.
    try:
        self.resource_type = get_resource_type(resource_or_fields)
        resource_info = get_fields_structure(resource_or_fields, True)
        (self.fields,
         resource_locale,
         resource_missing_tokens,
         objective_column,
         resource_errors) = resource_info
        if data_locale is None:
            data_locale = resource_locale
        if missing_tokens is None:
            if resource_missing_tokens:
                missing_tokens = resource_missing_tokens
        if errors is None:
            errors = resource_errors
    except ValueError:
        # If the resource structure is not in the expected set, fields
        # structure is assumed
        self.fields = resource_or_fields
        if data_locale is None:
            data_locale = DEFAULT_LOCALE
        if missing_tokens is None:
            missing_tokens = DEFAULT_MISSING_TOKENS
        objective_column = None
    if self.fields is None:
        raise ValueError("No fields structure was found.")
    self.fields_by_name = invert_dictionary(self.fields, 'name')
    self.fields_by_column_number = invert_dictionary(self.fields,
                                                     'column_number')
    find_locale(data_locale, verbose)
    self.missing_tokens = missing_tokens
    self.fields_columns = sorted(self.fields_by_column_number.keys())
    # Ids of the fields to be included
    self.filtered_fields = (list(self.fields.keys()) if include is None
                            else include)
    # To be updated in update_objective_field
    self.row_ids = None
    self.headers = None
    self.objective_field = None
    self.objective_field_present = None
    self.filtered_indexes = None
    self.field_errors = errors
    # if the objective field is not set by the user
    # use the one extracted from the resource info
    if objective_field is None and objective_column is not None:
        objective_field = objective_column
        objective_field_present = True
    self.update_objective_field(objective_field, objective_field_present)

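# A hypothetical sketch of this constructor (assumed to be bigml's Fields
# class): it accepts either a full resource or a bare fields dict; the
# source id is a placeholder.
def _example_fields_constructor():
    from bigml.api import BigML
    from bigml.fields import Fields

    api = BigML()
    source = api.get_source("source/5143a51a37203f2cf7000974")
    fields_from_resource = Fields(source)
    # a raw fields structure also works; locale and missing tokens then
    # fall back to DEFAULT_LOCALE and DEFAULT_MISSING_TOKENS
    fields_from_dict = Fields(fields_from_resource.fields)
    return fields_from_dict
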
def share_resource(api, resource):
    """Creates a secret link to share the resource.

    """
    resource_type = get_resource_type(resource)
    resource = get_updater(api, resource_type)(resource, {"shared": True})
    if api.ok(resource) and is_shared(resource):
        return ("https://bigml.com/shared/%s/%s" %
                (resource_type, resource['object']['shared_hash']))
    else:
        sys.exit("Failed to share the resource: %s" % resource['resource'])

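# Hypothetical usage of share_resource above; the dataset id is a
# placeholder, and passing an id (rather than the resource dict) is assumed
# to be accepted by the updater. On success the URL embeds the resource type
# and its shared_hash.
def _example_share_resource():
    from bigml.api import BigML

    api = BigML()
    return share_resource(api, "dataset/5143a51a37203f2cf7000972")
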
def __init__(self, model, api=None):
    self.api = get_api_connection(api)
    resource_id, model = extract_id(model, api)
    resource_type = get_resource_type(resource_id)
    kwargs = {"api": self.api}
    local_model = COMPONENT_CLASSES[resource_type](model, **kwargs)
    self.__class__.__bases__ = local_model.__class__.__bases__
    for attr, value in local_model.__dict__.items():
        setattr(self, attr, value)
    self.local_model = local_model

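# A hypothetical sketch for the wrapper constructor above (assumed to belong
# to a SupervisedModel-like class): the resource id, resolved through
# COMPONENT_CLASSES, picks the concrete local class, and the wrapper then
# behaves like it. Both the id and the input field name are placeholders.
def _example_wrapper():
    model = SupervisedModel("logisticregression/5143a51a37203f2cf7000975")
    # the instance now exposes the wrapped class methods, e.g. predict
    return model.predict({"petal length": 4.5})
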
def __init__(self, model, api=None):
    if api is None:
        api = BigML(storage=STORAGE)
    resource_id, model = extract_id(model, api)
    resource_type = get_resource_type(resource_id)
    kwargs = {"api": api}
    local_model = COMPONENT_CLASSES[resource_type](model, **kwargs)
    self.__class__.__bases__ = local_model.__class__.__bases__
    for attr, value in local_model.__dict__.items():
        setattr(self, attr, value)
    self.local_model = local_model

def get_fields_structure(resource, errors=False):
    """Returns the field structure for a resource, its locale and
    missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    field_errors = None
    if resource_type in RESOURCES_WITH_FIELDS:
        resource = resource.get('object', resource)
        # locale and missing tokens
        if resource_type == SOURCE_PATH:
            resource_locale = resource['source_parser']['locale']
            missing_tokens = resource[
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource.get('locale', DEFAULT_LOCALE)
            missing_tokens = resource.get('missing_tokens',
                                          DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in FIELDS_PARENT.keys():
            fields = resource[FIELDS_PARENT[resource_type]].get('fields', {})
        else:
            fields = resource.get('fields', {})
        if resource_type == SAMPLE_PATH:
            fields = dict([(field['id'], field) for field in fields])
        # Check whether there's an objective id
        objective_column = None
        if resource_type == DATASET_PATH:
            objective_column = resource.get(
                'objective_field', {}).get('id')
            if errors:
                field_errors = resource.get("status",
                                            {}).get("field_errors")
        elif resource_type in [MODEL_PATH, LOGISTIC_REGRESSION_PATH]:
            objective_id = resource.get(
                'objective_fields', [None])[0]
            objective_column = fields.get(
                objective_id, {}).get('column_number')
        result = fields, resource_locale, missing_tokens, objective_column
        if errors:
            result = result + (field_errors,)
        return result
    else:
        return ((None, None, None, None, None) if errors
                else (None, None, None, None))

def get_fields_structure(resource):
    """Returns the field structure for a resource, its locale and
    missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    if resource_type in RESOURCES_WITH_FIELDS:
        # locale and missing tokens
        if resource_type == SOURCE_TYPE:
            resource_locale = resource['object']['source_parser']['locale']
            missing_tokens = resource['object'][
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource['object'].get('locale',
                                                     DEFAULT_LOCALE)
            missing_tokens = resource['object'].get('missing_tokens',
                                                    DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in [MODEL_TYPE, ANOMALY_TYPE]:
            fields = resource['object']['model']['fields']
        elif resource_type == CLUSTER_TYPE:
            fields = resource['object']['clusters']['fields']
        elif resource_type == CORRELATION_TYPE:
            fields = resource['object']['correlations']['fields']
        elif resource_type == STATISTICAL_TEST_TYPE:
            fields = resource['object']['statistical_tests']['fields']
        elif resource_type == SAMPLE_TYPE:
            fields = dict([(field['id'], field) for field in
                           resource['object']['sample']['fields']])
        else:
            fields = resource['object']['fields']
        # Check whether there's an objective id
        objective_column = None
        if resource_type == DATASET_TYPE:
            objective_column = resource['object'].get(
                'objective_field', {}).get('id')
        elif resource_type == MODEL_TYPE:
            objective_id = resource['object'].get(
                'objective_fields', [None])[0]
            objective_column = fields.get(
                objective_id, {}).get('column_number')
        return fields, resource_locale, missing_tokens, objective_column
    else:
        return None, None, None, None

def retrieve_resource(api, resource_id, query_string=""): """ Retrieves resource info either from a local repo or from the remote server """ if api.storage is not None: try: stored_resource = "%s%s%s" % (api.storage, os.sep, resource_id.replace("/", "_")) with open(stored_resource) as resource_file: resource = json.loads(resource_file.read()) return resource except ValueError: raise ValueError("The file %s contains no JSON") except IOError: pass api_getter = api.getters[get_resource_type(resource_id)] resource = check_resource(resource_id, api_getter, query_string) return resource
def retrieve_resource(api, resource_id, query_string=''):
    """ Retrieves resource info either from a local repo or
    from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep,
                                          resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON" %
                             stored_resource)
        except IOError:
            pass
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource

def delete(object_list):
    """Deletes the objects in object_list using the api delete method

    """
    for obj_id in object_list:
        counter = 0
        delete_method = world.api.deleters[get_resource_type(obj_id)]
        result = delete_method(obj_id)
        while result['code'] != HTTP_NO_CONTENT and counter < MAX_RETRIES:
            print("Failed to delete %s with code %s. Retrying." %
                  (obj_id, result['code']))
            time.sleep(3)
            counter += 1
            result = delete_method(obj_id)
        if counter == MAX_RETRIES:
            print("Retries to delete the created resources are exhausted."
                  " Failed to delete.")
    object_list = []

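# Hypothetical teardown call for the helper above; world.api is assumed to
# be configured by the test context, and the resource ids are placeholders.
def _example_teardown():
    delete(["source/5143a51a37203f2cf7000976",
            "dataset/5143a51a37203f2cf7000972"])
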
def get_fields_structure(resource, errors=False):
    """Returns the field structure for a resource, its locale and
    missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    field_errors = None
    resource = resource.get('object', resource)
    # locale and missing tokens
    if resource_type == SOURCE_PATH:
        resource_locale = resource['source_parser']['locale']
        missing_tokens = resource[
            'source_parser']['missing_tokens']
    else:
        resource_locale = resource.get('locale', DEFAULT_LOCALE)
        missing_tokens = resource.get('missing_tokens',
                                      DEFAULT_MISSING_TOKENS)
    fields = get_fields(resource)
    if resource_type in RESOURCES_WITH_FIELDS:
        # Check whether there's an objective id
        objective_column = None
        if resource_type == DATASET_PATH:
            objective_column = resource.get(
                'objective_field', {}).get('id')
            if errors:
                field_errors = resource.get("status",
                                            {}).get("field_errors")
        elif resource_type in SUPERVISED_PATHS and \
                resource_type != FUSION_PATH:
            objective_id = resource.get(
                'objective_fields', [None])[0]
            objective_column = fields.get(
                objective_id, {}).get('column_number')
        result = fields, resource_locale, missing_tokens, objective_column
        if errors:
            result = result + (field_errors,)
        return result
    return ((None, None, None, None, None) if errors
            else (None, None, None, None))

def retrieve_resource(api, resource_id, query_string=ONLY_MODEL,
                      no_check_fields=False):
    """ Retrieves resource info either from a local repo or
    from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep,
                                          resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            # we check that the stored resource has enough fields information
            # for local predictions to work. Otherwise we should retrieve it.
            if no_check_fields or check_model_fields(resource):
                return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON" %
                             stored_resource)
        except IOError:
            pass
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource

def predict_probability(self, input_data,
                        missing_strategy=LAST_PREDICTION,
                        compact=False):
    """For classification models, predicts a probability for
    each possible output class, based on input values.  The input
    fields must be a dictionary keyed by field name or field ID.

    For regressions, the output is a single element list
    containing the prediction.

    :param input_data: Input data to be predicted
    :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                             for missing fields
    :param compact: If False, prediction is returned as a list of maps, one
                    per class, with the keys "prediction" and "probability"
                    mapped to the name of the class and its probability,
                    respectively.  If True, returns a list of probabilities
                    ordered by the sorted order of the class names.
    """
    votes = MultiVoteList([])
    if not self.missing_numerics:
        check_no_missing_numerics(input_data, self.fields)

    for models_split in self.models_splits:
        models = []
        for model in models_split:
            if get_resource_type(model) == "fusion":
                models.append(Fusion(model, api=self.api))
            else:
                models.append(SupervisedModel(model, api=self.api))
        votes_split = []
        for model in models:
            try:
                prediction = model.predict_probability(
                    input_data,
                    missing_strategy=missing_strategy,
                    compact=True)
            except ValueError:
                # logistic regressions can raise this error if they
                # have missing_numerics=False and some numeric missings
                # are found
                continue
            if self.regression:
                prediction = prediction[0]
                if self.weights is not None:
                    prediction = self.weigh(prediction, model.resource_id)
            else:
                if self.weights is not None:
                    prediction = self.weigh(prediction, model.resource_id)
                # we need to check that all classes in the fusion
                # are also in the composing model
                if not self.regression and \
                        self.class_names != model.class_names:
                    try:
                        prediction = rearrange_prediction(
                            model.class_names,
                            self.class_names,
                            prediction)
                    except AttributeError:
                        # class_names should be defined, but just in case
                        pass
            votes_split.append(prediction)
        votes.extend(votes_split)
    if self.regression:
        total_weight = len(votes.predictions) if self.weights is None \
            else sum(self.weights)
        prediction = sum([prediction for prediction in
                          votes.predictions]) / float(total_weight)
        if compact:
            output = [prediction]
        else:
            output = {"prediction": prediction}
    else:
        output = votes.combine_to_distribution(normalize=True)
        if not compact:
            output = [{'category': class_name,
                       'probability': probability}
                      for class_name, probability in
                      zip(self.class_names, output)]
    return output

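# A hypothetical call to the method above on a local Fusion; the fusion id
# and the input field name are placeholders. With compact=False the result
# is one map per class, with compact=True a probability list aligned with
# the sorted class names.
def _example_predict_probability():
    fusion = Fusion("fusion/5143a51a37203f2cf7000977")
    per_class = fusion.predict_probability({"petal length": 4.5})
    flat = fusion.predict_probability({"petal length": 4.5}, compact=True)
    return per_class, flat
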
def __init__(self, fusion, api=None, max_models=None):
    if api is None:
        self.api = BigML(storage=STORAGE)
    else:
        self.api = api
    self.resource_id = None
    self.model_ids = None
    self.objective_id = None
    self.distribution = None
    self.models_splits = []
    self.cache_get = None
    self.regression = False
    self.fields = None
    self.class_names = None
    self.importance = {}

    self.resource_id, fusion = get_resource_dict(
        fusion, "fusion", api=self.api)

    if 'object' in fusion:
        fusion = fusion.get('object', {})
    self.model_ids, self.weights = get_models_weight(
        fusion['models'])
    model_types = [get_resource_type(model) for model in self.model_ids]

    for model_type in model_types:
        if model_type not in LOCAL_SUPERVISED:
            raise ValueError("The resource %s does not have an allowed"
                             " supervised model type." % model_type)

    self.importance = fusion.get('importance', [])
    self.missing_numerics = fusion.get('missing_numerics', True)
    if fusion.get('fusion'):
        self.fields = fusion.get('fusion', {}).get("fields")
        self.objective_id = fusion.get("objective_field")

    number_of_models = len(self.model_ids)

    # Downloading the model information to cache it
    if self.api.storage is not None:
        for model_id in self.model_ids:
            if get_resource_type(model_id) == "fusion":
                Fusion(model_id, api=self.api)
            else:
                SupervisedModel(model_id, api=self.api)

    if max_models is None:
        self.models_splits = [self.model_ids]
    else:
        self.models_splits = [self.model_ids[index:(index + max_models)]
                              for index in range(0, number_of_models,
                                                 max_models)]

    if self.fields:
        summary = self.fields[self.objective_id]['summary']
        if 'bins' in summary:
            distribution = summary['bins']
        elif 'counts' in summary:
            distribution = summary['counts']
        elif 'categories' in summary:
            distribution = summary['categories']
        else:
            distribution = []
        self.distribution = distribution

    self.regression = \
        self.fields[self.objective_id].get('optype') == 'numeric'

    if not self.regression:
        objective_field = self.fields[self.objective_id]
        categories = objective_field['summary']['categories']
        classes = [category[0] for category in categories]
        self.class_names = sorted(classes)
        self.objective_categories = [
            category for category, _ in
            self.fields[self.objective_id]["summary"]["categories"]]

    ModelFields.__init__(
        self, self.fields,
        objective_id=self.objective_id)

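# Hypothetical sketch of the constructor above: max_models only controls how
# the component models are batched into models_splits; every model is still
# used. The fusion id is a placeholder.
def _example_fusion_batching():
    fusion = Fusion("fusion/5143a51a37203f2cf7000977", max_models=2)
    # with five component models this yields splits of sizes 2, 2 and 1
    return fusion.models_splits
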
def retrain_model(args, api, command, session_file=None):
    """Retrieve or create the retrain script for a model and
    execute it with the new provided data

    """
    retrain_file = os.path.join(BIGMLER_SCRIPTS_DIRECTORY,
                                "retrain",
                                "scripts")
    try:
        os.remove(UPGRADE_FILE)
        reify_script = None
        try:
            shutil.rmtree(BIGMLER_SCRIPTS_DIRECTORY)
        except OSError:
            pass
    except OSError:
        # look for the script that creates the rebuild script.
        reify_script = get_script_id(retrain_file)

    if reify_script is None:
        # new bigmler command: creating the scriptify scripts
        whizzml_command = ['whizzml',
                           '--package-dir', INCREMENTAL_PACKAGE_PATH,
                           '--output-dir', BIGMLER_SCRIPTS_DIRECTORY]
        add_api_context(whizzml_command, args)
        whizzml_dispatcher(args=whizzml_command)
        reify_script = get_script_id(retrain_file)

    # retrieve the modeling resource to be retrained by tag or id
    if args.resource_id:
        resource_id = args.resource_id
        reference_tag = "retrain:%s" % resource_id
    else:
        for model_type in MODEL_TYPES:
            if hasattr(args, "%s_tag" % model_type) and \
                    getattr(args, "%s_tag" % model_type) is not None:
                tag = getattr(args, "%s_tag" % model_type)
                query_string = "tags=%s" % tag
                resource_id = get_first_resource(
                    model_type.replace("_", ""),
                    api=api,
                    query_string=query_string)
                if resource_id is None:
                    sys.exit("Failed to find the %s with tag %s. "
                             "Please, check the tag and"
                             " the connection info (domain and credentials)."
                             % (model_type.replace("_", " "), tag))
                reference_tag = tag
                break
    # updating the dataset that generated the model with the reference tag
    model = api.getters[get_resource_type(resource_id)](resource_id)
    dataset_id = model["object"]["dataset"]
    dataset = api.get_dataset(dataset_id)
    tags = dataset["object"]["tags"]
    if reference_tag not in tags:
        tags.append(reference_tag)
        api.update_dataset(dataset_id, {"tags": tags})

    # if --upgrade, we force rebuilding the scriptified script
    if args.upgrade:
        script_id = None
    else:
        # check for the last script used to retrain the model
        query_string = "tags=%s" % reference_tag
        script_id = get_last_resource(
            "script",
            api=api,
            query_string=query_string)

    if script_id is None:
        # if the script to retrain does not exist:
        # check whether the model exists
        try:
            bigml.api.check_resource(resource_id,
                                     raise_on_error=True, api=api)
        except Exception as exc:
            sys.exit("Failed to find the model %s. Please, check its ID and"
                     " the connection info (domain and credentials)."
                     % resource_id)
        # new bigmler command: creating the retrain script
        execute_command = ['execute',
                           '--script', reify_script,
                           '--tag', reference_tag,
                           '--output-dir', args.output_dir]
        command.propagate(execute_command)
        command_args, _, _, exe_session_file, _ = get_context(
            execute_command, EXE_SETTINGS)
        command_args.arguments_ = [["model-resource", resource_id]]
        command_args.inputs = json.dumps(command_args.arguments_)
        # process the command
        execute_whizzml(command_args, api, session_file)
        script_id = extract_retrain_id(command_args, api, session_file)

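# A hypothetical, minimal driver for retrain_model above, faking the parsed
# bigmler arguments with a plain namespace. The attribute names mirror the
# ones the function reads (resource_id, upgrade, output_dir); everything
# else, including passing command=None, is an assumption that only holds
# when a retrain script tagged for this resource already exists, since
# command.propagate is only called while building a new script.
def _example_retrain():
    from argparse import Namespace
    from bigml.api import BigML

    api = BigML()
    args = Namespace(resource_id="model/5143a51a37203f2cf7000973",
                     upgrade=False, output_dir="./retrain_out")
    retrain_model(args, api, command=None, session_file=None)
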