def get_input_fields(resource, referrer=None):
    """New list of input fields

    Compares the resource's input fields (by name) against its referrer's
    fields. Returns [] when the input fields match one of the "default"
    field lists (all referrer fields, or all except the objective for
    models), meaning no explicit input_fields option needs reproducing;
    otherwise returns the referrer's field ids.
    """
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        # candidate "default" input-field name lists; starts with the
        # empty list (no input fields at all)
        referrer_input_fields = [[]]
        # compare fields by name
        resource_fields = Fields({
            'resource': resource['resource'],
            'object': resource
        })
        referrer_fields = Fields({
            'resource': referrer['resource'],
            'object': referrer
        })
        input_fields = [
            resource_fields.field_name(field_id)
            for field_id in input_fields_ids
        ]
        input_fields = sorted(input_fields)
        referrer_type = get_resource_type(referrer)
        if referrer_type == 'dataset':
            # datasets only contribute their preferred fields
            referrer_fields = Fields(referrer_fields.preferred_fields())
            referrer_fields_names = sorted(
                [field['name'] for _, field in referrer_fields.fields.items()])
        else:
            referrer_fields_names = sorted(
                referrer_fields.fields_by_name.keys())
        # check referrer input fields to see if they are equal
        referrer_input_fields.append(referrer_fields_names)
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            try:
                # objective_field may come wrapped as {'id': ...}
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            referrer_objective = resource_fields.field_name(objective_id)
            # all referrer fields minus the objective is also a default
            referrer_input_fields.append([
                name for name in referrer_fields_names
                if name != referrer_objective
            ])
        if input_fields in referrer_input_fields:
            return []
        # NOTE(review): placed inside the `if referrer:` block because
        # `referrer_fields` is only bound here — the flattened source is
        # ambiguous about this dedent; confirm against upstream history.
        return referrer_fields.fields.keys()
def create_execution(self, origin_resource, args=None,
                     wait_time=3, retries=10):
    """Creates an execution from a `script` or a list of `scripts`. """
    create_args = dict(args) if args is not None else {}
    # a single script (id string or resource dict) is wrapped in a list
    if isinstance(origin_resource, (basestring, dict)):
        scripts = [origin_resource]
    else:
        scripts = origin_resource
    try:
        script_ids = [get_script_id(script) for script in scripts]
    except TypeError:
        raise Exception("A script id or a list of them is needed to create"
                        " a script execution. %s found." %
                        get_resource_type(origin_resource))
    if not all(get_resource_type(script_id) == SCRIPT_PATH
               for script_id in script_ids):
        raise Exception("A script id or a list of them is needed to create"
                        " a script execution. %s found." %
                        get_resource_type(origin_resource))
    # wait until every script is finished before creating the execution
    for script in scripts:
        check_resource(script, query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    if len(scripts) > 1:
        create_args["scripts"] = script_ids
    else:
        create_args["script"] = script_ids[0]
    return self._create(self.execution_url, json.dumps(create_args))
def create_projection(self, pca, input_data=None, args=None,
                      wait_time=3, retries=10):
    """Creates a new projection.

    The `pca` parameter can be a PCA resource or its id.
    """
    resource_type = get_resource_type(pca)
    if resource_type != PCA_PATH:
        raise Exception("A PCA resource id is needed"
                        " to create a projection. %s found." %
                        resource_type)
    pca_id = get_resource_id(pca)
    if pca_id is not None:
        # make sure the PCA has finished before projecting on it
        check_resource(pca_id, query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args["input_data"] = {} if input_data is None else input_data
    if pca_id is not None:
        create_args["pca"] = pca_id
    return self._create(self.projection_url, json.dumps(create_args),
                        verify=self.verify)
def get_input_fields(resource, referrer=None):
    """New list of input fields

    Returns [] when the resource's input fields (plus the objective
    field for models) already match the referrer's fields, so no
    explicit ``input_fields`` option is needed; otherwise returns the
    resource's input field ids.
    """
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        referrer_fields = Fields(
            {'resource': referrer['resource'], 'object': referrer})
        referrer_fields_ids = list(referrer_fields.fields.keys())
        # case where objective field is not in input fields
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            try:
                # objective_field may come wrapped as {'id': ...}
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            if objective_id not in input_fields_ids:
                input_fields_ids.append(objective_id)
        # bug fix: the original compared `a.sort() == b.sort()`, but
        # list.sort() returns None, so the test was always True and the
        # function returned [] for any referrer. Compare sorted copies.
        if sorted(input_fields_ids) == sorted(referrer_fields_ids):
            return []
    return input_fields_ids
def check_model_fields(model):
    """Checks the model structure to see whether it contains the required
    fields information

    Returns True when the stored model dict carries enough fields info
    to be used locally, False otherwise.
    """
    inner_key = FIELDS_PARENT.get(get_resource_type(model), 'model')
    if check_model_structure(model, inner_key):
        model = model.get('object', model)
        # fields may live at the top level or nested under the inner key
        fields = model.get("fields", model.get(inner_key, {}).get('fields'))
        input_fields = model.get("input_fields")
        # models only need model_fields to work. The rest of resources will
        # need all fields to work
        model_fields = model.get(inner_key, {}).get(
            'model_fields', {}).keys()
        # fusions don't have input fields
        if input_fields is None and inner_key != "fusion":
            return False
        if not model_fields:
            # no model_fields: fall back to the fields_meta counters to
            # check that every field was downloaded
            fields_meta = model.get('fields_meta',
                                    model.get(inner_key, {}).get(
                                        'fields_meta', {}))
            try:
                return fields_meta['count'] == fields_meta['total']
            except KeyError:
                # stored old models will not have the fields_meta info, so
                # we return True to avoid failing in this case
                return True
        else:
            if fields is None:
                return False
            # every field used by the model must be present
            return all([field_id in fields.keys()
                        for field_id in model_fields])
    return False
def check_model_fields(model):
    """Checks the model structure to see whether it contains the required
    fields information

    Returns True when the stored model dict carries enough fields info
    to be used locally, False otherwise.
    """
    inner_key = FIELDS_PARENT.get(get_resource_type(model), 'model')
    if check_model_structure(model, inner_key):
        model = model.get('object', model)
        # fields may live at the top level or nested under the inner key
        fields = model.get("fields", model.get(inner_key, {}).get('fields'))
        # models only need model_fields to work. The rest of resources will
        # need all fields to work
        model_fields = model.get(inner_key, {}).get(
            'model_fields', {}).keys()
        if not model_fields:
            # no model_fields: fall back to the fields_meta counters to
            # check that every field was downloaded
            fields_meta = model.get('fields_meta',
                                    model.get(inner_key, {}).get(
                                        'fields_meta', {}))
            try:
                return fields_meta['count'] == fields_meta['total']
            except KeyError:
                # stored old models will not have the fields_meta info, so
                # we return True to avoid failing in this case
                return True
        else:
            if fields is None:
                return False
            # every field used by the model must be present
            return all([field_id in fields.keys()
                        for field_id in model_fields])
    return False
def reify_python(self, alias=None):
    """REST call command line in python. See ``reify`` method.

    Builds and returns the Python source line(s) that would recreate
    this resource through the api object, e.g.
    ``dataset1 = api.create_dataset(source1)\napi.ok(dataset1)``.
    """
    resource_type = get_resource_type(self.resource_id)
    resource_name = resource_alias(self.resource_id, alias)
    # some resource types are renamed in the api method names
    resource_method_suffix = RENAMED_RESOURCES.get(
        resource_type, resource_type)
    origin_names = [resource_alias(resource_id, alias)
                    for resource_id in self.origins]
    arguments = ", ".join(origin_names)
    if self.suffix:
        arguments = "%s%s" % (arguments, self.suffix)
    if self.input_data:
        # append the input data dict, pretty-printed and re-indented,
        # after a literal line-continuation backslash
        arguments = "%s, \\\n%s%s" % (
            arguments,
            INDENT,
            pprint.pformat(self.input_data).replace("\n", "\n%s" % INDENT))
    if self.args:
        # normalize list ordering for reproducible output
        sort_lists(self.args)
        arguments = "%s, \\\n%s%s" % (arguments,
                                      INDENT,
                                      pprint.pformat(self.args).replace(
                                          "\n", "\n%s" % INDENT))
    out = "%s = api.%s_%s(%s)\napi.ok(%s)\n\n" % (
        resource_name,
        self.action,
        resource_method_suffix,
        arguments,
        resource_name)
    return out
def create_centroid(self, cluster, input_data=None, args=None,
                    wait_time=3, retries=10):
    """Creates a new centroid.

    """
    resource_type = get_resource_type(cluster)
    if resource_type != CLUSTER_PATH:
        raise Exception("A cluster id is needed to create a"
                        " centroid. %s found." % resource_type)
    cluster_id = get_cluster_id(cluster)
    # wait until the cluster is finished
    check_resource(cluster_id, query_string=TINY_RESOURCE,
                   wait_time=wait_time, retries=retries,
                   raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args["input_data"] = {} if input_data is None else input_data
    create_args["cluster"] = cluster_id
    return self._create(self.centroid_url, json.dumps(create_args),
                        verify=self.verify)
def create_forecast(self, time_series, input_data=None, args=None,
                    wait_time=3, retries=10):
    """Creates a new forecast.

    """
    time_series_id = get_time_series_id(time_series)
    resource_type = get_resource_type(time_series_id)
    # guard clause: reject anything that is not a finished time series id
    if resource_type != TIME_SERIES_PATH or time_series_id is None:
        raise Exception("A time series model id is needed to create a"
                        " forecast. %s found." % resource_type)
    check_resource(time_series_id, query_string=TINY_RESOURCE,
                   wait_time=wait_time, retries=retries,
                   raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args["input_data"] = {} if input_data is None else input_data
    create_args["timeseries"] = time_series_id
    return self._create(self.forecast_url, json.dumps(create_args),
                        verify=self.verify_prediction)
def create_association_set(self, association, input_data=None, args=None,
                           wait_time=3, retries=10):
    """Creates a new association set.

    The `association` parameter must be an association resource or id;
    `input_data` is the instance to score and `args` extra creation
    arguments.
    """
    association_id = None
    resource_type = get_resource_type(association)
    if resource_type == ASSOCIATION_PATH:
        association_id = get_association_id(association)
        # wait until the association is finished
        check_resource(association_id, query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    else:
        # fix: error message read "A association"
        raise Exception("An association id is needed to create an"
                        " association set. %s found." % resource_type)
    if input_data is None:
        input_data = {}
    create_args = {}
    if args is not None:
        create_args.update(args)
    create_args.update({"input_data": input_data})
    create_args.update({"association": association_id})
    body = json.dumps(create_args)
    return self._create(self.association_set_url, body, verify=self.verify)
def reify_python(self, alias=None):
    """REST call command line in python. See ``reify`` method.

    Builds and returns the Python source line(s) that would recreate
    this resource through the api object, e.g.
    ``dataset1 = api.create_dataset(source1)\napi.ok(dataset1)``.
    """
    resource_type = get_resource_type(self.resource_id)
    resource_name = resource_alias(self.resource_id, alias)
    # some resource types are renamed in the api method names
    resource_method_suffix = RENAMED_RESOURCES.get(resource_type,
                                                   resource_type)
    origin_names = [
        resource_alias(resource_id, alias)
        for resource_id in self.origins
    ]
    arguments = ", ".join(origin_names)
    if self.suffix:
        arguments = "%s%s" % (arguments, self.suffix)
    if self.input_data:
        # append the input data dict, pretty-printed and re-indented,
        # after a literal line-continuation backslash
        arguments = "%s, \\\n%s%s" % (
            arguments,
            INDENT,
            pprint.pformat(self.input_data).replace("\n", "\n%s" % INDENT))
    if self.args:
        # normalize list ordering for reproducible output
        sort_lists(self.args)
        arguments = "%s, \\\n%s%s" % (arguments,
                                      INDENT,
                                      pprint.pformat(self.args).replace(
                                          "\n", "\n%s" % INDENT))
    out = "%s = api.%s_%s(%s)\napi.ok(%s)\n\n" % (
        resource_name,
        self.action,
        resource_method_suffix,
        arguments,
        resource_name)
    return out
def create_anomaly_score(self, anomaly, input_data=None, args=None,
                         wait_time=3, retries=10):
    """Creates a new anomaly score.

    """
    resource_type = get_resource_type(anomaly)
    if resource_type != ANOMALY_PATH:
        raise Exception("An anomaly detector id is needed to create an"
                        " anomaly score. %s found." % resource_type)
    anomaly_id = get_anomaly_id(anomaly)
    # wait until the anomaly detector is finished
    check_resource(anomaly_id, query_string=TINY_RESOURCE,
                   wait_time=wait_time, retries=retries,
                   raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args["input_data"] = {} if input_data is None else input_data
    create_args["anomaly"] = anomaly_id
    return self._create(self.anomaly_score_url, json.dumps(create_args),
                        verify=self.verify)
def create_correlation(self, dataset, args=None, wait_time=3, retries=10):
    """Creates a correlation from a `dataset`.

    """
    resource_type = get_resource_type(dataset)
    if resource_type != DATASET_PATH:
        raise Exception("A dataset id is needed to create a"
                        " correlation. %s found." % resource_type)
    dataset_id = get_dataset_id(dataset)
    # wait until the dataset is finished
    check_resource(dataset_id, query_string=TINY_RESOURCE,
                   wait_time=wait_time, retries=retries,
                   raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args["dataset"] = dataset_id
    return self._create(self.correlation_url, json.dumps(create_args))
def create_anomaly_score(self, anomaly, input_data=None, args=None,
                         wait_time=3, retries=10):
    """Creates a new anomaly score.

    """
    resource_type = get_resource_type(anomaly)
    if resource_type != ANOMALY_PATH:
        raise Exception("An anomaly detector id is needed to create an"
                        " anomaly score. %s found." % resource_type)
    anomaly_id = get_anomaly_id(anomaly)
    # ensure the anomaly detector is finished before scoring
    check_resource(anomaly_id, query_string=TINY_RESOURCE,
                   wait_time=wait_time, retries=retries,
                   raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args.update({"input_data": {} if input_data is None
                        else input_data,
                        "anomaly": anomaly_id})
    return self._create(self.anomaly_score_url, json.dumps(create_args),
                        verify=self.verify)
def create_topic_distribution(self, topic_model, input_data=None,
                              args=None, wait_time=3, retries=10):
    """Creates a new topic distribution.

    """
    topic_model_id = get_topic_model_id(topic_model)
    if topic_model_id is None:
        resource_type = get_resource_type(topic_model)
        raise Exception("A topic model id is needed to create a"
                        " topic distribution. %s found." % resource_type)
    # wait until the topic model is finished
    check_resource(topic_model_id, query_string=TINY_RESOURCE,
                   wait_time=wait_time, retries=retries,
                   raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args.update({
        "input_data": {} if input_data is None else input_data,
        "topicmodel": topic_model_id})
    return self._create(self.topic_distribution_url,
                        json.dumps(create_args),
                        verify=self.verify_prediction)
def retrieve_resource(self, resource_id, query_string=None,
                      check_local_fn=None, retries=None):
    """ Retrieves resource info either from the local repo or
        from the remote server

    `check_local_fn`, when given, validates the locally stored copy; an
    invalid copy triggers a remote download.
    """
    if query_string is None:
        query_string = ''
    if self.storage is not None:
        try:
            stored_resource = os.path.join(self.storage,
                                           resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            # we check that the stored resource has the information
            # needed (for instance, input_fields for predicting)
            if check_local_fn is None or check_local_fn(resource):
                return resource
        except ValueError:
            # bug fix: the original raised the raw format string with
            # no interpolation ("The file %s contains no JSON");
            # substitute the offending file path.
            raise ValueError("The file %s contains no JSON" %
                             stored_resource)
        except IOError:
            # no local copy: fall through to the remote download
            pass
    if self.auth == '?username=;api_key=;':
        raise ValueError("The credentials information is missing. This"
                         " information is needed to download resource %s"
                         " for the first time and store it locally for"
                         " further use. Please export BIGML_USERNAME"
                         " and BIGML_API_KEY." % resource_id)
    api_getter = self.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string,
                              retries=retries)
    return resource
def get_input_fields(resource, referrer=None):
    """New list of input fields

    Returns [] when the resource's input fields (plus the objective
    field for models) already match the referrer's fields, so no
    explicit ``input_fields`` option is needed; otherwise returns the
    resource's input field ids.
    """
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        referrer_fields = Fields({
            'resource': referrer['resource'],
            'object': referrer
        })
        referrer_fields_ids = list(referrer_fields.fields.keys())
        # case where objective field is not in input fields
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            try:
                # objective_field may come wrapped as {'id': ...}
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            if objective_id not in input_fields_ids:
                input_fields_ids.append(objective_id)
        # bug fix: the original compared `a.sort() == b.sort()`, but
        # list.sort() returns None, so the test was always True and the
        # function returned [] for any referrer. Compare sorted copies.
        if sorted(input_fields_ids) == sorted(referrer_fields_ids):
            return []
    return input_fields_ids
def get_fields(self, resource):
    """Retrieve fields used by a resource.

    Returns a dictionary with the fields that uses
    the resource keyed by Id.
    """
    if isinstance(resource, dict) and 'resource' in resource:
        resource_id = resource['resource']
    elif (isinstance(resource, basestring) and
          get_resource_type(resource) in RESOURCES_WITH_FIELDS):
        resource_id = resource
        resource = self.retrieve_resource(resource,
                                          query_string=ALL_FIELDS)
    else:
        LOGGER.error("Wrong resource id")
        return
    # Tries to extract fields information from resource dict. If it fails,
    # a get remote call is used to retrieve the resource by id.
    try:
        return get_fields(resource)
    except KeyError:
        resource = self._get("%s%s" % (self.url, resource_id))
        return get_fields(resource)
def get_fields_changes(resource, referrer=None,
                       updatable_attrs=DEFAULT_UPDATABLE):
    """Changed field attributes

    Compares the fields in ``resource`` against its ``referrer`` (the
    origin resource) and returns a dict mapping field id to the
    attribute values that differ.
    """
    if referrer is None:
        referrer = {}
    fields_attributes = {}
    resource_fields = Fields({
        'resource': resource['resource'],
        'object': resource
    }).fields
    resource_type = get_resource_type(resource)
    # for sources, extract all the updatable attributes
    if get_resource_type(resource) == 'source':
        updatable_attrs = SOURCE_UPDATABLE
        for field_id in resource_fields.keys():
            field_opts = {}
            field = resource_fields[field_id]
            for attribute in updatable_attrs:
                # any truthy attribute value counts as explicitly set
                if field.get(attribute):
                    field_opts.update({attribute: field[attribute]})
            if field_opts != {}:
                fields_attributes.update({field_id: field_opts})
        return fields_attributes
    # for the rest of resources, check which attributes changed
    if referrer:
        referrer_fields = Fields({
            'resource': referrer['resource'],
            'object': referrer
        }).fields
        for field_id in resource_fields.keys():
            field_opts = {}
            # only fields present in the referrer can have changed
            if not field_id in referrer_fields.keys():
                continue
            field = resource_fields[field_id]
            for attribute in updatable_attrs:
                # unchanged values and empty strings are not reported
                ref_values = ["", referrer_fields[field_id].get(attribute,
                                                                "")]
                if not field.get(attribute, "") in ref_values:
                    field_opts.update({attribute: field[attribute]})
            if field_opts != {}:
                fields_attributes.update({field_id: field_opts})
    return fields_attributes
def non_default_opts(resource, opts, call="create"):
    """Stores the options that are not constant defaults

    """
    call_defaults = DEFAULTS[get_resource_type(resource)].get(call, {})
    for attr_name, default_value in call_defaults.items():
        setting = default_setting(resource, attr_name, *default_value)
        opts[call].update(setting)
def create_prediction(self, model, input_data=None,
                      args=None, wait_time=3, retries=10, by_name=True):
    """Creates a new prediction.
       The model parameter can be:
        - a simple tree model
        - a simple logistic regression model
        - an ensemble
       The by_name argument is now deprecated. It will be removed.

    """
    # only one of these three will end up set, selecting the payload key
    logistic_regression_id = None
    ensemble_id = None
    model_id = None
    resource_type = get_resource_type(model)
    if resource_type == ENSEMBLE_PATH:
        ensemble_id = get_ensemble_id(model)
        if ensemble_id is not None:
            check_resource(ensemble_id, query_string=TINY_RESOURCE,
                           wait_time=wait_time, retries=retries,
                           raise_on_error=True, api=self)
    elif resource_type == MODEL_PATH:
        model_id = get_model_id(model)
        check_resource(model_id, query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    elif resource_type == LOGISTIC_REGRESSION_PATH:
        logistic_regression_id = get_logistic_regression_id(model)
        check_resource(logistic_regression_id,
                       query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    else:
        raise Exception("A model or ensemble id is needed to create a"
                        " prediction. %s found." % resource_type)
    if input_data is None:
        input_data = {}
    create_args = {}
    if args is not None:
        create_args.update(args)
    create_args.update({"input_data": input_data})
    # attach the origin resource under its type-specific key
    if model_id is not None:
        create_args.update({"model": model_id})
    elif ensemble_id is not None:
        create_args.update({"ensemble": ensemble_id})
    elif logistic_regression_id is not None:
        create_args.update({"logisticregression": logistic_regression_id})
    body = json.dumps(create_args)
    return self._create(self.prediction_url, body,
                        verify=self.verify_prediction)
def get_input_fields(resource, referrer=None):
    """New list of input fields

    Compares the resource's input fields (by name) against its referrer's
    fields. Returns [] when the input fields match one of the "default"
    field lists (all referrer fields, or all except the objective for
    models), meaning no explicit input_fields option needs reproducing;
    otherwise returns the referrer's field ids.
    """
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        # candidate "default" input-field name lists; starts with the
        # empty list (no input fields at all)
        referrer_input_fields = [[]]
        # compare fields by name
        resource_fields = Fields(
            {'resource': resource['resource'], 'object': resource})
        referrer_fields = Fields(
            {'resource': referrer['resource'], 'object': referrer})
        input_fields = [resource_fields.field_name(field_id)
                        for field_id in input_fields_ids]
        input_fields = sorted(input_fields)
        referrer_type = get_resource_type(referrer)
        if referrer_type == 'dataset':
            # datasets only contribute their preferred fields
            referrer_fields = Fields(referrer_fields.preferred_fields())
            referrer_fields_names = sorted(
                [field['name'] for _, field in referrer_fields.fields.items()])
        else:
            referrer_fields_names = sorted(
                referrer_fields.fields_by_name.keys())
        # check referrer input fields to see if they are equal
        referrer_input_fields.append(referrer_fields_names)
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            try:
                # objective_field may come wrapped as {'id': ...}
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            referrer_objective = resource_fields.field_name(
                objective_id)
            # all referrer fields minus the objective is also a default
            referrer_input_fields.append([name for name
                                          in referrer_fields_names
                                          if name != referrer_objective])
        if input_fields in referrer_input_fields:
            return []
        # NOTE(review): placed inside the `if referrer:` block because
        # `referrer_fields` is only bound here — the flattened source is
        # ambiguous about this dedent; confirm against upstream history.
        return referrer_fields.fields.keys()
def create_execution(self, origin_resource, args=None,
                     wait_time=3, retries=10):
    """Creates an execution from a `script` or a list of `scripts`. """
    create_args = dict(args) if args is not None else {}
    # a single script (id string or resource dict) is wrapped in a list
    if isinstance(origin_resource, (basestring, dict)):
        scripts = [origin_resource]
    else:
        scripts = origin_resource
    try:
        script_ids = [get_script_id(script) for script in scripts]
    except TypeError:
        raise Exception("A script id or a list of them is needed to create"
                        " a script execution. %s found." %
                        get_resource_type(origin_resource))
    if not all(get_resource_type(script_id) == SCRIPT_PATH
               for script_id in script_ids):
        raise Exception("A script id or a list of them is needed to create"
                        " a script execution. %s found." %
                        get_resource_type(origin_resource))
    # wait until every script is finished before creating the execution
    for script in scripts:
        check_resource(script, query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    if len(scripts) > 1:
        create_args["scripts"] = script_ids
    else:
        create_args["script"] = script_ids[0]
    return self._create(self.execution_url, json.dumps(create_args))
def create_model(self, origin_resource, args=None, wait_time=3, retries=10):
    """Creates a model from an origin_resource.

    Uses a remote resource to create a new model using the
    arguments in `args`.
    The allowed remote resources can be:
        - dataset
        - list of datasets
        - cluster
    In the case of using cluster id as origin_resource, a centroid
    must also be provided in the args argument. The first centroid
    is used otherwise.
    """
    create_args = {}
    if args is not None:
        create_args.update(args)
    if isinstance(origin_resource, list):
        # mutidatasets
        create_args = self._set_create_from_datasets_args(
            origin_resource, args=create_args, wait_time=wait_time,
            retries=retries)
    else:
        resource_type = get_resource_type(origin_resource)
        # model from cluster and centroid
        if resource_type == CLUSTER_PATH:
            cluster_id = get_cluster_id(origin_resource)
            cluster = check_resource(cluster_id,
                                     query_string=TINY_RESOURCE,
                                     wait_time=wait_time,
                                     retries=retries,
                                     raise_on_error=True, api=self)
            if 'centroid' not in create_args:
                try:
                    # default to the first centroid with a model
                    centroid = cluster['object'][
                        'cluster_models'].keys()[0]
                    create_args.update({'centroid': centroid})
                except KeyError:
                    raise KeyError("Failed to generate the model. A "
                                   "centroid id is needed in the args "
                                   "argument to generate a model from "
                                   "a cluster.")
            create_args.update({'cluster': cluster_id})
        elif resource_type == DATASET_PATH:
            create_args = self._set_create_from_datasets_args(
                origin_resource, args=create_args, wait_time=wait_time,
                retries=retries)
        else:
            raise Exception("A dataset, list of dataset ids"
                            " or cluster id plus centroid id are needed"
                            " to create a"
                            " dataset. %s found." % resource_type)
    body = json.dumps(create_args)
    return self._create(self.model_url, body)
def create_model(self, origin_resource, args=None, wait_time=3, retries=10):
    """Creates a model from an origin_resource.

    Uses a remote resource to create a new model using the
    arguments in `args`.
    The allowed remote resources can be:
        - dataset
        - list of datasets
        - cluster
    In the case of using cluster id as origin_resource, a centroid
    must also be provided in the args argument. The first centroid
    is used otherwise.
    """
    create_args = {}
    if args is not None:
        create_args.update(args)
    if isinstance(origin_resource, list):
        # mutidatasets
        create_args = self._set_create_from_datasets_args(
            origin_resource, args=create_args, wait_time=wait_time,
            retries=retries)
    else:
        resource_type = get_resource_type(origin_resource)
        # model from cluster and centroid
        if resource_type == CLUSTER_PATH:
            cluster_id = get_cluster_id(origin_resource)
            cluster = check_resource(cluster_id,
                                     query_string=TINY_RESOURCE,
                                     wait_time=wait_time,
                                     retries=retries,
                                     raise_on_error=True, api=self)
            # idiom fix: `not 'centroid' in` -> `'centroid' not in`
            # (consistent with the sibling version of this method)
            if 'centroid' not in create_args:
                try:
                    # default to the first centroid with a model
                    centroid = cluster['object'][
                        'cluster_models'].keys()[0]
                    create_args.update({'centroid': centroid})
                except KeyError:
                    raise KeyError("Failed to generate the model. A "
                                   "centroid id is needed in the args "
                                   "argument to generate a model from "
                                   "a cluster.")
            create_args.update({'cluster': cluster_id})
        elif resource_type == DATASET_PATH:
            create_args = self._set_create_from_datasets_args(
                origin_resource, args=create_args, wait_time=wait_time,
                retries=retries)
        else:
            # NOTE(review): message says "dataset" though a model is
            # being created — looks like a copy/paste slip; confirm
            # before changing the user-facing text.
            raise Exception("A dataset, list of dataset ids"
                            " or cluster id plus centroid id are needed"
                            " to create a"
                            " dataset. %s found." % resource_type)
    body = json.dumps(create_args)
    return self._create(self.model_url, body)
def get_fields_changes(resource, referrer=None,
                       updatable_attrs=DEFAULT_UPDATABLE):
    """Changed field attributes

    Compares the fields in ``resource`` against its ``referrer`` (the
    origin resource) and returns a dict mapping field id to the
    attribute values that differ.
    """
    if referrer is None:
        referrer = {}
    fields_attributes = {}
    resource_fields = Fields(
        {'resource': resource['resource'], 'object': resource}).fields
    resource_type = get_resource_type(resource)
    # for sources, extract all the updatable attributes
    if get_resource_type(resource) == 'source':
        updatable_attrs = SOURCE_UPDATABLE
        for field_id in resource_fields.keys():
            field_opts = {}
            field = resource_fields[field_id]
            for attribute in updatable_attrs:
                # any truthy attribute value counts as explicitly set
                if field.get(attribute):
                    field_opts.update({attribute: field[attribute]})
            if field_opts != {}:
                fields_attributes.update({field_id: field_opts})
        return fields_attributes
    # for the rest of resources, check which attributes changed
    if referrer:
        referrer_fields = Fields(
            {'resource': referrer['resource'], 'object': referrer}).fields
        for field_id in resource_fields.keys():
            field_opts = {}
            # only fields present in the referrer can have changed
            if not field_id in referrer_fields.keys():
                continue
            field = resource_fields[field_id]
            for attribute in updatable_attrs:
                # unchanged values and empty strings are not reported
                ref_values = ["", referrer_fields[field_id].get(attribute,
                                                                "")]
                if not field.get(attribute, "") in ref_values:
                    field_opts.update({attribute: field[attribute]})
            if field_opts != {}:
                fields_attributes.update({field_id: field_opts})
    return fields_attributes
def fields_map_options(resource, referrer1, referrer2, opts, call="create"):
    """Stores the fields_map option if needed

    """
    # model to dataset mapping
    if get_resource_type(referrer1['resource']) == 'model':
        fields = referrer1['model']['model_fields']
    else:
        fields = referrer2['fields'].keys()
    # identity mapping is the default
    identity_map = {field: field for field in fields}
    opts[call].update(default_setting(resource, 'fields_map', identity_map))
def create_script(self, source_code=None, args=None,
                  wait_time=3, retries=10):
    """Creates a whizzml script from its source code. The `source_code`
       parameter can be a:
        {script ID}: the ID for an existing whizzml script
        {path}: the path to a file containing the source code
        {string} : the string containing the source code for the script

    """
    create_args = {}
    if args is not None:
        create_args.update(args)
    if source_code is None:
        raise Exception('A valid code string'
                        ' or a script id must be provided.')
    resource_type = get_resource_type(source_code)
    if resource_type == SCRIPT_PATH:
        # clone an existing script
        script_id = get_script_id(source_code)
        if script_id:
            check_resource(script_id, query_string=TINY_RESOURCE,
                           wait_time=wait_time, retries=retries,
                           raise_on_error=True, api=self)
            create_args.update({"origin": script_id})
    elif isinstance(source_code, basestring):
        if is_url(source_code):
            # remote script: download both the code and its creation args
            script_args = retrieve_script_args(source_code)
            source_code = script_args.get("source_code")
            create_args.update(json.loads(script_args.get("json")))
        else:
            try:
                if os.path.exists(source_code):
                    # local file path: read the code from the file
                    with open(source_code) as code_file:
                        source_code = code_file.read()
            except IOError:
                raise IOError("Could not open the source code file %s." %
                              source_code)
        create_args.update({"source_code": source_code})
    else:
        raise Exception("A script id or a valid source code"
                        " is needed to create a"
                        " script. %s found." % resource_type)
    body = json.dumps(create_args)
    return self._create(self.script_url, body)
def fields_map_options(resource, referrer1, referrer2, opts, call="create"):
    """Stores the fields_map option if needed

    """
    # model to dataset mapping
    if get_resource_type(referrer1['resource']) == 'model':
        candidate_fields = referrer1['model']['model_fields']
    else:
        candidate_fields = referrer2['fields'].keys()
    # identity mapping is the default
    identity_map = {field: field for field in candidate_fields}
    opts[call].update(
        default_setting(resource, 'fields_map', identity_map))
def get_resource_alias(resource_id, counts, alias):
    """Creates a human-friendly alias for the resource

    """
    existing = alias.get(resource_id)
    if existing:
        return existing
    resource_type = get_resource_type(resource_id)
    # per-type counter gives e.g. "source1", "source2", ...
    counts[resource_type] = counts.get(resource_type, 0) + 1
    new_alias = "%s%s" % (resource_type, counts[resource_type])
    alias[resource_id] = new_alias
    return new_alias
def reify_resource(self, resource_id):
    """Redirects to the reify method according to the resource type

    """
    # first check if this is a valid id; invalid ids are silently skipped
    resource_id = get_resource_id(resource_id)
    if resource_id is None:
        return
    resource_type = get_resource_type(resource_id)
    reify_handler = getattr(self, 'reify_%s' % resource_type)
    self.logger("Analyzing %s.\n" % resource_id)
    reify_handler(resource_id)
    if self.delete:
        self.delete_stored_resource(resource_id)
def create_prediction(self, model, input_data=None,
                      args=None, wait_time=3, retries=10):
    """Creates a new prediction.
       The model parameter can be:
        - a simple tree model
        - a simple logistic regression model
        - an ensemble
        - a deepnet
        . a linear regression
        - a fusion
       Note that the old `by_name` argument has been deprecated.

    """
    resource_type = get_resource_type(model)
    if resource_type not in SUPERVISED_PATHS:
        raise Exception("A supervised model resource id is needed"
                        " to create a prediction. %s found." %
                        resource_type)
    model_id = get_resource_id(model)
    if model_id is not None:
        # wait until the model is finished
        check_resource(model_id, query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args["input_data"] = {} if input_data is None else input_data
    if model_id is not None:
        create_args["model"] = model_id
    return self._create(self.prediction_url, json.dumps(create_args),
                        verify=self.verify_prediction)
def create_library(self, source_code=None, args=None,
                   wait_time=3, retries=10):
    """Creates a whizzml library from its source code. The `source_code`
       parameter can be a:
        {library ID}: the ID for an existing whizzml library
        {path}: the path to a file containing the source code
        {string} : the string containing the source code for the library

    """
    if source_code is None:
        raise Exception('A valid code string'
                        ' or a library id must be provided.')
    create_args = dict(args) if args is not None else {}
    resource_type = get_resource_type(source_code)
    if resource_type == LIBRARY_PATH:
        # clone an existing library
        library_id = get_library_id(source_code)
        if library_id:
            check_resource(library_id, query_string=TINY_RESOURCE,
                           wait_time=wait_time, retries=retries,
                           raise_on_error=True, api=self)
            create_args["origin"] = library_id
    elif isinstance(source_code, basestring):
        try:
            if os.path.exists(source_code):
                # local file path: read the code from the file
                with open(source_code) as code_file:
                    source_code = code_file.read()
        except IOError:
            raise IOError("Could not open the source code file %s." %
                          source_code)
        create_args["source_code"] = source_code
    else:
        raise Exception("A library id or a valid source code"
                        " is needed to create a"
                        " library. %s found." % resource_type)
    return self._create(self.library_url, json.dumps(create_args))
def reify_python(self, alias=None):
    """REST call command line in python. See ``reify`` method.
    """

    def resource_alias(resource_id):
        """Returns the alias if found
        """
        if isinstance(resource_id, basestring):
            return alias.get(resource_id, '"%s"' % resource_id)
        if isinstance(resource_id, list):
            return repr([alias.get(each_id, '"%s"' % each_id)
                         for each_id in resource_id])

    def indented_pformat(structure):
        """Pretty-prints a structure indented as a call argument
        """
        return pprint.pformat(structure).replace("\n", "\n%s" % INDENT)

    resource_type = get_resource_type(self.resource_id)
    target_name = resource_alias(self.resource_id)
    method_suffix = RENAMED_RESOURCES.get(resource_type, resource_type)
    arguments = ", ".join(resource_alias(origin_id)
                          for origin_id in self.origins)
    if self.suffix:
        arguments = "%s%s" % (arguments, self.suffix)
    if self.input_data:
        arguments = "%s, \\\n%s%s" % (
            arguments, INDENT, indented_pformat(self.input_data))
    if self.args:
        # normalize list ordering inside the args before printing
        sort_lists(self.args)
        arguments = "%s, \\\n%s%s" % (
            arguments, INDENT, indented_pformat(self.args))
    return "%s = api.%s_%s(%s)\napi.ok(%s)\n\n" % (
        target_name, self.action, method_suffix, arguments, target_name)
def create_library(self, source_code=None, args=None,
                   wait_time=3, retries=10):
    """Creates a whizzml library from its source code.

    The `source_code` parameter can be a:
        {library ID}: the ID for an existing whizzml library
        {path}: the path to a file containing the source code
        {string} : the string containing the source code for the library
    """
    if source_code is None:
        raise Exception('A valid code string'
                        ' or a library id must be provided.')
    create_args = {}
    if args is not None:
        create_args.update(args)
    resource_type = get_resource_type(source_code)
    if resource_type == LIBRARY_PATH:
        # the new library is cloned from an existing one
        library_id = get_library_id(source_code)
        if library_id:
            check_resource(library_id,
                           query_string=TINY_RESOURCE,
                           wait_time=wait_time, retries=retries,
                           raise_on_error=True, api=self)
            create_args.update({"origin": library_id})
    elif isinstance(source_code, basestring):
        # a string: either a path to a source file or the code itself
        try:
            if os.path.exists(source_code):
                with open(source_code) as code_file:
                    source_code = code_file.read()
        except IOError:
            raise IOError("Could not open the source code file %s." %
                          source_code)
        create_args.update({"source_code": source_code})
    else:
        raise Exception("A library id or a valid source code"
                        " is needed to create a"
                        " library. %s found." % resource_type)
    body = json.dumps(create_args)
    return self._create(self.library_url, body)
def reify_source(self, resource_id):
    """Extracts the REST API arguments from the source JSON structure

    Builds the `create`/`update` options needed to regenerate the source
    referenced by `resource_id` and registers the resulting calls.
    """
    resource_type = get_resource_type(resource_id)
    child = self.get_resource(resource_id)
    opts = {"create": {}, "update": {}}

    # create options. Copy the defaults: the previous code mutated the
    # shared DEFAULTS/COMMON_DEFAULTS structures, so the "name" default
    # accumulated every previously seen file_name across calls.
    source_defaults = dict(DEFAULTS[resource_type].get("create", {}))
    source_defaults.update(COMMON_DEFAULTS.get("create", {}))

    # special case, sources can be named like uploaded files
    name_as_file = [child.get('file_name')]
    name_as_file.extend(source_defaults["name"])
    source_defaults["name"] = name_as_file

    for attribute, default_value in source_defaults.items():
        opts["create"].update(
            u.default_setting(child, attribute, *default_value))

    # data: prefer the remote URL, then the uploaded file name
    if child.get('remote') is not None:
        data = child['remote']
    elif child.get('file_name') is not None:
        data = child['file_name']
    else:
        data = "UNKNOWN-INLINE-DATA"

    # update options (copied for symmetry; iteration alone would not
    # mutate, but a copy keeps DEFAULTS read-only by construction)
    source_defaults = dict(DEFAULTS[resource_type].get("update", {}))
    for attribute, default_value in source_defaults.items():
        opts["update"].update(
            u.default_setting(child, attribute, *default_value))

    # We add the information for the updatable fields only when requested.
    if self.add_fields:
        opts["update"].update({"fields": u.get_fields_changes(child)})

    calls = u.build_calls(resource_id, [data], opts)
    self.add(resource_id, calls)
def create_prediction(self, model, input_data=None,
                      args=None, wait_time=3, retries=10):
    """Creates a new prediction.

    Accepts any supervised resource as `model`:

        - a simple tree model
        - a simple logistic regression model
        - an ensemble
        - a deepnet
        - a linear regression
        - a fusion

    Note that the old `by_name` argument has been deprecated.
    """
    resource_type = get_resource_type(model)
    if resource_type not in SUPERVISED_PATHS:
        raise Exception("A supervised model resource id is needed"
                        " to create a prediction. %s found." %
                        resource_type)
    model_id = get_resource_id(model)
    if model_id is not None:
        # wait for the model to be finished before predicting
        check_resource(model_id,
                       query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    create_args = {}
    if args is not None:
        create_args.update(args)
    create_args["input_data"] = input_data if input_data is not None else {}
    if model_id is not None:
        create_args["model"] = model_id
    body = json.dumps(create_args)
    return self._create(self.prediction_url, body,
                        verify=self.verify_prediction)
def get_fields(resource):
    """Returns the field information in a resource dictionary structure

    :param resource: resource dictionary (either the full API response
        or its inner `object` payload)
    :raises ValueError: when the argument is not a valid resource
        dictionary
    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure. Failed to find"
                         " a valid resource dictionary as argument.")

    if resource_type in RESOURCES_WITH_FIELDS:
        # unwrap the API envelope when present
        resource = resource.get('object', resource)
    # fields structure: some resource types nest their fields under a
    # parent attribute (see FIELDS_PARENT)
    if resource_type in FIELDS_PARENT:
        fields = resource[FIELDS_PARENT[resource_type]].get('fields', {})
    else:
        fields = resource.get('fields', {})
    if resource_type == SAMPLE_PATH:
        # samples return a list of fields; index it by field id
        fields = {field['id']: field for field in fields}
    return fields
def get_origin_info(resource):
    """Key and value that stores the origin resource id
    """
    resource_type = get_resource_type(resource)
    found_origins = []
    # each group of alternative origin keys contributes at most one pair
    for argument_origins in ORIGINS.get(resource_type, []):
        for origin_key in argument_origins:
            origin_value = resource.get(origin_key)
            if origin_value:
                if origin_key == 'ranges':
                    origin_value = origin_value.keys()
                found_origins.append((origin_key, origin_value))
                break
    if not found_origins:
        sys.exit("Failed to find the complete origin information.")
    # a single origin is returned bare; several as a list of pairs
    return found_origins[0] if len(found_origins) == 1 else found_origins
def create_projection(self, pca, input_data=None, args=None,
                      wait_time=3, retries=10):
    """Creates a new projection.
       The pca parameter can be a pca resource or ID
    """
    resource_type = get_resource_type(pca)
    if resource_type != PCA_PATH:
        raise Exception("A PCA resource id is needed"
                        " to create a projection. %s found." %
                        resource_type)
    pca_id = get_resource_id(pca)
    if pca_id is not None:
        # wait until the PCA resource is finished
        check_resource(pca_id,
                       query_string=TINY_RESOURCE,
                       wait_time=wait_time, retries=retries,
                       raise_on_error=True, api=self)
    payload = {} if args is None else dict(args)
    payload["input_data"] = input_data if input_data is not None else {}
    if pca_id is not None:
        payload["pca"] = pca_id
    return self._create(self.projection_url, json.dumps(payload),
                        verify=self.verify)
def create_statistical_test(self, dataset, args=None, wait_time=3, retries=10):
    """Creates a statistical test from a `dataset`.
    """
    resource_type = get_resource_type(dataset)
    if resource_type != DATASET_PATH:
        raise Exception("A dataset id is needed to create a"
                        " statistical test. %s found." % resource_type)
    dataset_id = get_dataset_id(dataset)
    # wait until the origin dataset is finished
    check_resource(dataset_id,
                   query_string=TINY_RESOURCE,
                   wait_time=wait_time, retries=retries,
                   raise_on_error=True, api=self)
    create_args = dict(args) if args is not None else {}
    create_args["dataset"] = dataset_id
    return self._create(self.statistical_test_url,
                        json.dumps(create_args))
def create_topic_distribution(self, topic_model, input_data=None,
                              args=None, wait_time=3, retries=10):
    """Creates a new topic distribution.
    """
    topic_model_id = get_topic_model_id(topic_model)
    if topic_model_id is None:
        resource_type = get_resource_type(topic_model)
        raise Exception("A topic model id is needed to create a"
                        " topic distribution. %s found." % resource_type)
    # wait until the topic model is finished
    check_resource(topic_model_id,
                   query_string=TINY_RESOURCE,
                   wait_time=wait_time, retries=retries,
                   raise_on_error=True, api=self)
    payload = dict(args) if args is not None else {}
    payload.update({
        "input_data": {} if input_data is None else input_data,
        "topicmodel": topic_model_id
    })
    return self._create(self.topic_distribution_url, json.dumps(payload),
                        verify=self.verify_prediction)
def reify_dataset(self, resource_id):
    """Extracts the REST API arguments from the dataset JSON structure

    Rebuilds the create/update call options needed to regenerate the
    dataset referenced by `resource_id`, comparing the child resource
    against its parent (and grandparent for two-step origins), then
    registers the calls in the chain.
    """
    child = self.get_resource(resource_id)
    origin, parent_id = u.get_origin_info(child)
    parent = self.get_resource(parent_id)
    opts = {"create": {}, "update": {}, "get": {}}

    # as two-steps result from a cluster or batch prediction, centroid
    # or anomaly score
    grandparent = parent
    if origin in ['origin_batch_resource', 'cluster']:
        if origin == "cluster":
            opts['create'].update({"centroid": child['centroid']})
        grandparents = u.get_origin_info(parent)
        # batch resources have two parents, choose the dataset
        if origin == "origin_batch_resource" and \
                isinstance(grandparents, list):
            for gp_origin, grandparent in grandparents:
                if gp_origin == "dataset":
                    break
        else:
            _, grandparent = grandparents
        grandparent = self.get_resource(grandparent)

    # options common to all model types; batch-generated datasets can only
    # be configured through update (see the suffix handling below)
    call = "update" if origin == "origin_batch_resource" else "create"
    u.common_dataset_opts(child, grandparent, opts, call=call)

    # update options
    dataset_defaults = DEFAULTS["dataset"].get("update", {})
    for attribute, default_value in dataset_defaults.items():
        opts["update"].update(
            u.default_setting(child, attribute, *default_value))

    # name, exclude automatic naming alternatives
    autonames = [u'']
    u.non_automatic_name(child, opts, autonames=autonames)

    # objective field
    resource_fields = Fields({
        'resource': child['resource'],
        'object': child
    })
    objective_id = child['objective_field']['id']
    preferred_fields = resource_fields.preferred_fields()
    # if there's no preferred fields, use the fields structure
    if len(preferred_fields.keys()) == 0:
        preferred_fields = resource_fields.fields
    # the implicit objective is the last non-text column; only record an
    # explicit objective_field when the child deviates from that default
    max_column = sorted([
        field['column_number'] for _, field in preferred_fields.items()
        if field['optype'] != "text"
    ], reverse=True)[0]
    objective_column = resource_fields.fields[objective_id][
        'column_number']
    if objective_column != max_column:
        opts['create'].update({"objective_field": {"id": objective_id}})

    if origin != "origin_batch_resource":
        # resize; size comparison is only meaningful against a source
        # parent, hence the guards
        if (child['size'] != grandparent['size'] and
                get_resource_type(parent) == 'source'):
            opts['create'].update({"size": child['size']})

    # generated fields: the API expects the generator under "field"
    if child.get('new_fields', None):
        new_fields = child['new_fields']
        for new_field in new_fields:
            new_field['field'] = new_field['generator']
            del new_field['generator']
        opts['create'].update({"new_fields": new_fields})

    u.range_opts(child, grandparent, opts)

    # for batch_predictions, batch_clusters, batch_anomalies generated
    # datasets, attributes cannot be set at creation time, so we
    # must update the resource instead
    suffix = None
    if origin == "origin_batch_resource":
        opts["update"].update(opts["create"])
        opts["create"] = {}
        suffix = "['object']['output_dataset_resource']"
    calls = u.build_calls(resource_id, [parent_id], opts, suffix=suffix)
    self.add(resource_id, calls)
def create_dataset(self, origin_resource, args=None,
                   wait_time=3, retries=10):
    """Creates a remote dataset.

    Uses a remote resource to create a new dataset using the
    arguments in `args`.
    The allowed remote resources can be:
        - source
        - dataset
        - list of datasets
        - cluster
    In the case of using cluster id as origin_resources, a centroid
    must also be provided in the args argument. The first centroid is
    used otherwise.
    If `wait_time` is higher than 0 then the dataset creation
    request is not sent until the `source` has been created successfuly.
    """
    create_args = {}
    if args is not None:
        create_args.update(args)

    if isinstance(origin_resource, list):
        # mutidatasets
        create_args = self._set_create_from_datasets_args(
            origin_resource, args=create_args, wait_time=wait_time,
            retries=retries, key="origin_datasets")
    else:
        resource_type = get_resource_type(origin_resource)
        if resource_type == SOURCE_PATH:
            # dataset from source
            source_id = get_source_id(origin_resource)
            if source_id:
                check_resource(source_id,
                               query_string=TINY_RESOURCE,
                               wait_time=wait_time, retries=retries,
                               raise_on_error=True, api=self)
                create_args["source"] = source_id
        elif resource_type == DATASET_PATH:
            # dataset from dataset
            create_args = self._set_create_from_datasets_args(
                origin_resource, args=create_args, wait_time=wait_time,
                retries=retries, key="origin_dataset")
        elif resource_type == CLUSTER_PATH:
            # dataset from cluster and centroid
            cluster_id = get_cluster_id(origin_resource)
            cluster = check_resource(cluster_id,
                                     query_string=TINY_RESOURCE,
                                     wait_time=wait_time, retries=retries,
                                     raise_on_error=True, api=self)
            if 'centroid' not in create_args:
                try:
                    centroid = cluster['object'][
                        'cluster_datasets_ids'].keys()[0]
                    create_args['centroid'] = centroid
                except KeyError:
                    raise KeyError("Failed to generate the dataset. A "
                                   "centroid id is needed in the args "
                                   "argument to generate a dataset from "
                                   "a cluster.")
            create_args['cluster'] = cluster_id
        else:
            raise Exception("A source, dataset, list of dataset ids"
                            " or cluster id plus centroid id are needed"
                            " to create a"
                            " dataset. %s found." % resource_type)

    return self._create(self.dataset_url, json.dumps(create_args))
def reify_dataset(self, resource_id):
    """Extracts the REST API arguments from the dataset JSON structure

    Rebuilds the create/update call options needed to regenerate the
    dataset referenced by `resource_id`, comparing the child resource
    against its parent (and grandparent for two-step origins), then
    registers the calls in the chain.
    """
    child = self.get_resource(resource_id)
    origin, parent_id = u.get_origin_info(child)
    parent = self.get_resource(parent_id)
    opts = {"create": {}, "update": {}}

    # as two-steps result from a cluster or batch prediction, centroid
    # or anomaly score
    if origin in ["origin_batch_resource", "cluster"]:
        if origin == "cluster":
            opts["create"].update({"centroid": child["centroid"]})
        _, grandparent = u.get_origin_info(parent)
        grandparent = self.get_resource(grandparent)
    else:
        grandparent = parent

    # options common to all model types
    u.common_dataset_opts(child, grandparent, opts)

    # update options
    dataset_defaults = DEFAULTS["dataset"].get("update", {})
    dataset_defaults.update(COMMON_DEFAULTS.get("update", {}))
    for attribute, default_value in dataset_defaults.items():
        opts["update"].update(u.default_setting(child, attribute,
                                                *default_value))
    # name, exclude automatic naming alternatives; these are the names the
    # platform generates automatically for derived datasets
    autonames = [u""]
    suffixes = [
        u"filtered",
        u"sampled",
        u"dataset",
        u"extended",
        u"- batchprediction",
        u"- batchanomalyscore",
        u"- batchcentroid",
        u"- merged",
    ]
    autonames.extend([u"%s %s" % (grandparent.get("name", ""), suffix)
                      for suffix in suffixes])
    autonames.append(
        u"%s's dataset" % ".".join(parent["name"].split(".")[0:-1]))
    autonames.append(
        u"%s' dataset" % ".".join(parent["name"].split(".")[0:-1]))
    # centroid ids are hexadecimal strings, hence base=16
    autonames.append(
        u"Cluster %s - %s" % (int(child.get("centroid", "0"), base=16),
                              parent["name"]))
    autonames.append(u"Dataset from %s model - segment" % parent["name"])
    u.non_automatic_name(child, opts, autonames=autonames)

    # objective field: only record an explicit objective_field when it is
    # not the last preferred column (the platform's implicit default)
    resource_fields = Fields(
        {"resource": child["resource"], "object": child})
    objective_id = child["objective_field"]["id"]
    preferred_fields = resource_fields.preferred_fields()
    max_column = sorted([field["column_number"]
                         for _, field in preferred_fields.items()],
                        reverse=True)[0]
    objective_column = resource_fields.fields[objective_id]["column_number"]
    if objective_column != max_column:
        opts["create"].update({"objective_field": {"id": objective_id}})

    # resize
    if child["size"] != grandparent["size"] and get_resource_type(parent) == "source":
        opts["create"].update({"size": child["size"]})

    # generated fields: the API expects the generator under "field"
    if child.get("new_fields", None):
        new_fields = child["new_fields"]
        for new_field in new_fields:
            new_field["field"] = new_field["generator"]
            del new_field["generator"]
        opts["create"].update({"new_fields": new_fields})

    u.range_opts(child, grandparent, opts)

    calls = u.build_calls(resource_id, [parent_id], opts)
    self.add(resource_id, calls)
def create_dataset(self, origin_resource, args=None,
                   wait_time=3, retries=10):
    """Creates a remote dataset.

    Uses a remote resource to create a new dataset using the
    arguments in `args`.
    The allowed remote resources can be:
        - source
        - dataset
        - list of datasets
        - cluster
    In the case of using cluster id as origin_resource, a centroid must
    also be provided in the args argument. The first centroid is used
    otherwise.
    If `wait_time` is higher than 0 then the dataset creation
    request is not sent until the `source` has been created successfully.
    """
    create_args = {}
    if args is not None:
        create_args.update(args)

    if isinstance(origin_resource, list):
        # multidatasets
        create_args = self._set_create_from_datasets_args(
            origin_resource, args=create_args, wait_time=wait_time,
            retries=retries, key="origin_datasets")
    else:
        # dataset from source
        resource_type = get_resource_type(origin_resource)
        if resource_type == SOURCE_PATH:
            source_id = get_source_id(origin_resource)
            if source_id:
                check_resource(source_id,
                               query_string=TINY_RESOURCE,
                               wait_time=wait_time,
                               retries=retries,
                               raise_on_error=True, api=self)
                create_args.update({"source": source_id})
        # dataset from dataset
        elif resource_type == DATASET_PATH:
            create_args = self._set_create_from_datasets_args(
                origin_resource, args=create_args, wait_time=wait_time,
                retries=retries, key="origin_dataset")
        # dataset from cluster and centroid
        elif resource_type == CLUSTER_PATH:
            cluster_id = get_cluster_id(origin_resource)
            cluster = check_resource(cluster_id,
                                     query_string=TINY_RESOURCE,
                                     wait_time=wait_time,
                                     retries=retries,
                                     raise_on_error=True, api=self)
            # PEP 8: membership test written as `not in`
            if 'centroid' not in create_args:
                try:
                    # default to the first centroid; list() keeps the
                    # indexing valid on both Python 2 and Python 3,
                    # where dict.keys() returns a view
                    centroid = list(cluster['object'][
                        'cluster_datasets_ids'].keys())[0]
                    create_args.update({'centroid': centroid})
                except KeyError:
                    raise KeyError("Failed to generate the dataset. A "
                                   "centroid id is needed in the args "
                                   "argument to generate a dataset from "
                                   "a cluster.")
            create_args.update({'cluster': cluster_id})
        else:
            raise Exception("A source, dataset, list of dataset ids"
                            " or cluster id plus centroid id are needed"
                            " to create a"
                            " dataset. %s found." % resource_type)

    body = json.dumps(create_args)
    return self._create(self.dataset_url, body)
def reify_dataset(self, resource_id):
    """Extracts the REST API arguments from the dataset JSON structure

    Rebuilds the create/update call options needed to regenerate the
    dataset referenced by `resource_id` and registers the calls.
    """
    child = self.get_resource(resource_id)
    origin, parent_id = u.get_origin_info(child)
    parent = self.get_resource(parent_id)
    opts = {"create": {}, "update": {}, "get": {}}

    # as two-steps result from a cluster or batch prediction, centroid
    # or anomaly score
    grandparent = parent
    if origin in ['origin_batch_resource', 'cluster']:
        if origin == "cluster":
            opts['create'].update({"centroid": child['centroid']})
        grandparents = u.get_origin_info(parent)
        # batch resources have two parents, choose the dataset
        if origin == "origin_batch_resource" and \
                isinstance(grandparents, list):
            for gp_origin, grandparent in grandparents:
                if gp_origin == "dataset":
                    break
        else:
            _, grandparent = grandparents
        grandparent = self.get_resource(grandparent)

    # options common to all model types; batch-generated datasets can only
    # be configured via update (see suffix handling at the end)
    call = "update" if origin == "origin_batch_resource" else "create"
    u.common_dataset_opts(child, grandparent, opts, call=call)

    # update options
    dataset_defaults = DEFAULTS["dataset"].get("update", {})
    for attribute, default_value in dataset_defaults.items():
        opts["update"].update(
            u.default_setting(child, attribute, *default_value))

    # name, exclude automatic naming alternatives
    autonames = [u'']
    u.non_automatic_name(child, opts, autonames=autonames)

    # objective field
    resource_fields = Fields(
        {'resource': child['resource'], 'object': child})
    objective_id = child['objective_field']['id']
    preferred_fields = resource_fields.preferred_fields()
    # if there's no preferred fields, use the fields structure
    if len(preferred_fields.keys()) == 0:
        preferred_fields = resource_fields.fields
    # the implicit objective is the last non-text column; only record an
    # explicit objective_field when the child deviates from that default
    max_column = sorted([field['column_number']
                         for _, field in preferred_fields.items()
                         if field['optype'] != "text"],
                        reverse=True)[0]
    objective_column = resource_fields.fields[objective_id][
        'column_number']
    if objective_column != max_column:
        opts['create'].update({"objective_field": {"id": objective_id}})

    if origin != "origin_batch_resource":
        # resize; size comparison is only meaningful against a source
        # parent, hence the guards
        if (child['size'] != grandparent['size'] and
                get_resource_type(parent) == 'source'):
            opts['create'].update({"size": child['size']})

    # generated fields: the API expects the generator under "field"
    if child.get('new_fields', None):
        new_fields = child['new_fields']
        for new_field in new_fields:
            new_field['field'] = new_field['generator']
            del new_field['generator']
        opts['create'].update({"new_fields": new_fields})

    u.range_opts(child, grandparent, opts)

    # for batch_predictions, batch_clusters, batch_anomalies generated
    # datasets, attributes cannot be set at creation time, so we
    # must update the resource instead
    suffix = None
    if origin == "origin_batch_resource":
        opts["update"].update(opts["create"])
        opts["create"] = {}
        suffix = "['object']['output_dataset_resource']"
    calls = u.build_calls(resource_id, [parent_id], opts, suffix=suffix)
    self.add(resource_id, calls)