Example #1
def get_fields_structure(resource, errors=False):
    """Returns the field structure for a resource, its locale and
       missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    field_errors = None
    if resource_type in RESOURCES_WITH_FIELDS:
        resource = resource.get('object', resource)
        # locale and missing tokens
        if resource_type == SOURCE_PATH:
            resource_locale = resource['source_parser']['locale']
            missing_tokens = resource[
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource.get('locale', DEFAULT_LOCALE)
            missing_tokens = resource.get('missing_tokens',
                                          DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in [MODEL_PATH, ANOMALY_PATH]:
            fields = resource['model']['fields']
        elif resource_type == CLUSTER_PATH:
            fields = resource['clusters']['fields']
        elif resource_type == CORRELATION_PATH:
            fields = resource['correlations']['fields']
        elif resource_type == STATISTICAL_TEST_PATH:
            fields = resource['statistical_tests']['fields']
        elif resource_type == LOGISTIC_REGRESSION_PATH:
            fields = resource['logistic_regression']['fields']
        elif resource_type == ASSOCIATION_PATH:
            fields = resource['associations']['fields']
        elif resource_type == TOPIC_MODEL_PATH:
            fields = resource['topic_model']['fields']
        elif resource_type == SAMPLE_PATH:
            fields = dict([(field['id'], field) for field in
                           resource['sample']['fields']])
        else:
            fields = resource['fields']
        # Check whether there's an objective id
        objective_column = None
        if resource_type == DATASET_PATH:
            objective_column = resource.get( \
                'objective_field', {}).get('id')
            if errors:
                field_errors = resource.get("status", {}).get("field_errors")
        elif resource_type in [MODEL_PATH, LOGISTIC_REGRESSION_PATH]:
            objective_id = resource.get( \
                'objective_fields', [None])[0]
            objective_column = fields.get( \
                objective_id, {}).get('column_number')
        result = fields, resource_locale, missing_tokens, objective_column
        if errors:
            result = result + (field_errors,)
        return result
    else:
        return (None, None, None, None, None) if errors else \
            (None, None, None, None)
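A minimal usage sketch for the variant above, assuming a resource dict (for instance, a model already downloaded through the API bindings) is in scope; the variable names are placeholders rather than part of the original code:

# Hypothetical usage: with errors=True the function returns a five-element
# tuple; `model` stands for a previously fetched resource dict.
fields, locale, missing_tokens, objective_column, field_errors = \
    get_fields_structure(model, errors=True)

if fields is not None:
    print("Locale: %s, objective column: %s" % (locale, objective_column))
    print("Number of fields: %s" % len(fields))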
Example #2
def get_fields_structure(resource):
    """Returns the field structure for a resource, its locale and
       missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")

    if resource_type in RESOURCES_WITH_FIELDS:
        # locale and missing tokens
        if resource_type == SOURCE_TYPE:
            resource_locale = resource['object']['source_parser']['locale']
            missing_tokens = resource['object'][
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource['object'].get('locale', DEFAULT_LOCALE)
            missing_tokens = resource['object'].get('missing_tokens',
                                                    DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in [MODEL_TYPE, ANOMALY_TYPE]:
            fields = resource['object']['model']['fields']
        elif resource_type == CLUSTER_TYPE:
            fields = resource['object']['clusters']['fields']
        else:
            fields = resource['object']['fields']
        return fields, resource_locale, missing_tokens
    else:
        return None, None, None
Example #3
def get_fields_structure(resource):
    """Returns the field structure for a resource, its locale and
       missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")

    if resource_type in RESOURCES_WITH_FIELDS:
        if resource_type == SOURCE_TYPE:
            resource_locale = resource['object']['source_parser']['locale']
            missing_tokens = resource['object'][
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource['object']['locale']
            missing_tokens = resource['object']['missing_tokens']
        if resource_type == MODEL_TYPE:
            fields = resource['object']['model']['fields']
        else:
            fields = resource['object']['fields']
        return fields, resource_locale, missing_tokens
    else:
        return None, None, None
Example #4
def retrieve_resource(api,
                      resource_id,
                      query_string=ONLY_MODEL,
                      no_check_fields=False):
    """ Retrieves resource info either from a local repo or
        from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep,
                                          resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            # we check that the stored resource has enough fields information
            # for local predictions to work. Otherwise we should retrieve it.
            if no_check_fields or check_model_fields(resource):
                return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON")
        except IOError:
            pass
    if api.auth == '?username=;api_key=;':
        raise ValueError("The credentials information is missing. This"
                         " information is needed to download resource %s"
                         " for the first time and store it locally for further"
                         " use. Please export BIGML_USERNAME"
                         " and BIGML_API_KEY." % resource_id)
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource
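A hedged usage sketch for this helper, assuming the BigML bindings are installed, BIGML_USERNAME and BIGML_API_KEY are exported, and that `retrieve_resource` and the ONLY_MODEL constant used as its default are importable from the same module as the example; the storage path and resource id are placeholders:

from bigml.api import BigML

# Sketch only: a connection created with local storage lets the first call
# download the model and later calls reuse the cached JSON file.
api = BigML(storage="./storage")
model_id = "model/0123456789abcdef01234567"   # placeholder id
model = retrieve_resource(api, model_id, query_string=ONLY_MODEL)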
Example #5
def retrieve_resource(api,
                      resource_id,
                      query_string='limit=-1;',
                      no_check_fields=False):
    """ Retrieves resource info either from a local repo or
        from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep,
                                          resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            # we check that the stored resource has enough fields information
            # for local predictions to work. Otherwise we should retrieve it.
            if no_check_fields or check_model_fields(resource):
                return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON")
        except IOError:
            pass
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource
Example #6
def get_fields_structure(resource):
    """Returns the field structure for a resource, its locale and
       missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")

    if resource_type in RESOURCES_WITH_FIELDS:
        # locale and missing tokens
        if resource_type == SOURCE_TYPE:
            resource_locale = resource['object']['source_parser']['locale']
            missing_tokens = resource['object'][
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource['object'].get('locale', DEFAULT_LOCALE)
            missing_tokens = resource['object'].get('missing_tokens',
                                                    DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in [MODEL_TYPE, ANOMALY_TYPE]:
            fields = resource['object']['model']['fields']
        elif resource_type == CLUSTER_TYPE:
            fields = resource['object']['clusters']['fields']
        elif resource_type == SAMPLE_TYPE:
            fields = dict([(field['id'], field) for field in
                          resource['object']['sample']['fields']])
        else:
            fields = resource['object']['fields']
        return fields, resource_locale, missing_tokens
    else:
        return None, None, None
Example #7
    def __init__(self, model, api=None):

        resource_id, model = extract_id(model)
        resource_type = get_resource_type(resource_id)
        local_model = COMPONENT_CLASSES[resource_type](model, api=api)
        self.__class__.__bases__ = local_model.__class__.__bases__
        for attr, value in local_model.__dict__.items():
            setattr(self, attr, value)
        self.local_model = local_model
Example #8
    def __init__(self, resource_or_fields, missing_tokens=None,
                 data_locale=None, verbose=False,
                 objective_field=None, objective_field_present=False,
                 include=None, errors=None):

        # The constructor can be instantiated with resources or a fields
        # structure. The structure is checked and fields structure is returned
        # if a resource type is matched.
        try:
            self.resource_type = get_resource_type(resource_or_fields)
            resource_info = get_fields_structure(resource_or_fields, True)
            (self.fields,
             resource_locale,
             resource_missing_tokens,
             objective_column,
             resource_errors) = resource_info
            if data_locale is None:
                data_locale = resource_locale
            if missing_tokens is None:
                if resource_missing_tokens:
                    missing_tokens = resource_missing_tokens
            if errors is None:
                errors = resource_errors
        except ValueError:
            # If the resource structure is not in the expected set, fields
            # structure is assumed
            self.fields = resource_or_fields
            if data_locale is None:
                data_locale = DEFAULT_LOCALE
            if missing_tokens is None:
                missing_tokens = DEFAULT_MISSING_TOKENS
            objective_column = None
        if self.fields is None:
            raise ValueError("No fields structure was found.")
        self.fields_by_name = invert_dictionary(self.fields, 'name')
        self.fields_by_column_number = invert_dictionary(self.fields,
                                                         'column_number')
        find_locale(data_locale, verbose)
        self.missing_tokens = missing_tokens
        self.fields_columns = sorted(self.fields_by_column_number.keys())
        # Ids of the fields to be included
        self.filtered_fields = (list(self.fields.keys()) if include is None
                                else include)
        # To be updated in update_objective_field
        self.row_ids = None
        self.headers = None
        self.objective_field = None
        self.objective_field_present = None
        self.filtered_indexes = None
        self.field_errors = errors
        # if the objective field is not set by the user
        # use the one extracted from the resource info
        if objective_field is None and objective_column is not None:
            objective_field = objective_column
            objective_field_present = True
        self.update_objective_field(objective_field, objective_field_present)
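A brief sketch of the two construction paths the constructor comment describes; the class name `Fields`, the `api` connection and the dataset id are assumptions used only for illustration:

# Hypothetical: built from a full resource, the locale, missing tokens and
# objective column are extracted from the resource info.
dataset = api.get_dataset("dataset/0123456789abcdef01234567")  # placeholder
fields_from_resource = Fields(dataset)

# Built from a bare fields structure, the defaults are used instead.
raw_fields = {"000000": {"name": "sepal length", "optype": "numeric",
                         "column_number": 0}}
fields_from_structure = Fields(raw_fields, objective_field=0)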
Example #9
def share_resource(api, resource):
    """Creates a secret link to share the resource.

    """
    resource_type = get_resource_type(resource)
    resource = get_updater(api, resource_type)(resource, {"shared": True})
    if api.ok(resource) and is_shared(resource):
        return ("https://bigml.com/shared/%s/%s" %
                (resource_type, resource['object']['shared_hash']))
    else:
        sys.exit("Failed to share the resource: %s" % resource['resource'])
Example #11
    def __init__(self, model, api=None):

        self.api = get_api_connection(api)
        resource_id, model = extract_id(model, api)
        resource_type = get_resource_type(resource_id)
        kwargs = {"api": self.api}
        local_model = COMPONENT_CLASSES[resource_type](model, **kwargs)
        self.__class__.__bases__ = local_model.__class__.__bases__
        for attr, value in local_model.__dict__.items():
            setattr(self, attr, value)
        self.local_model = local_model
Example #12
    def __init__(self, model, api=None):

        if api is None:
            api = BigML(storage=STORAGE)
        resource_id, model = extract_id(model, api)
        resource_type = get_resource_type(resource_id)
        kwargs = {"api": api}
        local_model = COMPONENT_CLASSES[resource_type](model, **kwargs)
        self.__class__.__bases__ = local_model.__class__.__bases__
        for attr, value in local_model.__dict__.items():
            setattr(self, attr, value)
        self.local_model = local_model
Example #14
def get_fields_structure(resource, errors=False):
    """Returns the field structure for a resource, its locale and
       missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    field_errors = None
    if resource_type in RESOURCES_WITH_FIELDS:
        resource = resource.get('object', resource)
        # locale and missing tokens
        if resource_type == SOURCE_PATH:
            resource_locale = resource['source_parser']['locale']
            missing_tokens = resource[
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource.get('locale', DEFAULT_LOCALE)
            missing_tokens = resource.get('missing_tokens',
                                          DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in FIELDS_PARENT.keys():
            fields = resource[FIELDS_PARENT[resource_type]].get('fields', {})
        else:
            fields = resource.get('fields', {})

        if resource_type == SAMPLE_PATH:
            fields = dict([(field['id'], field) for field in
                           fields])
        # Check whether there's an objective id
        objective_column = None
        if resource_type == DATASET_PATH:
            objective_column = resource.get( \
                'objective_field', {}).get('id')
            if errors:
                field_errors = resource.get("status", {}).get("field_errors")
        elif resource_type in [MODEL_PATH, LOGISTIC_REGRESSION_PATH]:
            objective_id = resource.get( \
                'objective_fields', [None])[0]
            objective_column = fields.get( \
                objective_id, {}).get('column_number')
        result = fields, resource_locale, missing_tokens, objective_column
        if errors:
            result = result + (field_errors,)
        return result
    else:
        return (None, None, None, None, None) if errors else \
            (None, None, None, None)
Example #16
def get_fields_structure(resource):
    """Returns the field structure for a resource, its locale and
       missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")

    if resource_type in RESOURCES_WITH_FIELDS:
        # locale and missing tokens
        if resource_type == SOURCE_TYPE:
            resource_locale = resource['object']['source_parser']['locale']
            missing_tokens = resource['object'][
                'source_parser']['missing_tokens']
        else:
            resource_locale = resource['object'].get('locale', DEFAULT_LOCALE)
            missing_tokens = resource['object'].get('missing_tokens',
                                                    DEFAULT_MISSING_TOKENS)
        # fields structure
        if resource_type in [MODEL_TYPE, ANOMALY_TYPE]:
            fields = resource['object']['model']['fields']
        elif resource_type == CLUSTER_TYPE:
            fields = resource['object']['clusters']['fields']
        elif resource_type == CORRELATION_TYPE:
            fields = resource['object']['correlations']['fields']
        elif resource_type == STATISTICAL_TEST_TYPE:
            fields = resource['object']['statistical_tests']['fields']
        elif resource_type == SAMPLE_TYPE:
            fields = dict([(field['id'], field) for field in
                           resource['object']['sample']['fields']])
        else:
            fields = resource['object']['fields']
        # Check whether there's an objective id
        objective_column = None
        if resource_type == DATASET_TYPE:
            objective_column = resource['object'].get( \
                'objective_field', {}).get('id')
        elif resource_type == MODEL_TYPE:
            objective_id = resource['object'].get( \
                'objective_fields', [None])[0]
            objective_column = fields.get( \
                objective_id, {}).get('column_number')
        return fields, resource_locale, missing_tokens, objective_column
    else:
        return None, None, None, None
Example #17
def retrieve_resource(api, resource_id, query_string=""):
    """ Retrieves resource info either from a local repo or
        from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep, resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON")
        except IOError:
            pass
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource
Example #18
def retrieve_resource(api, resource_id, query_string=''):
    """ Retrieves resource info either from a local repo or
        from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep,
                                          resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON")
        except IOError:
            pass
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource
Example #19
def delete(object_list):
    """Deletes the objects in object_list using the api delete method

    """

    for obj_id in object_list:
        counter = 0
        delete_method = world.api.deleters[get_resource_type(obj_id)]
        result = delete_method(obj_id)
        while result['code'] != HTTP_NO_CONTENT and counter < MAX_RETRIES:
            print ("Failed to delete %s with code %s. Retrying." %
                   (obj_id, result['code']))
            time.sleep(3)
            counter += 1
            result = delete_method(obj_id)
        if counter == MAX_RETRIES:
            print ("Retries to delete the created resources are exhausted."
                   " Failed to delete.")
    object_list = []
Example #21
def get_fields_structure(resource, errors=False):
    """Returns the field structure for a resource, its locale and
       missing_tokens

    """
    try:
        resource_type = get_resource_type(resource)
    except ValueError:
        raise ValueError("Unknown resource structure")
    field_errors = None
    resource = resource.get('object', resource)
    # locale and missing tokens
    if resource_type == SOURCE_PATH:
        resource_locale = resource['source_parser']['locale']
        missing_tokens = resource[
            'source_parser']['missing_tokens']
    else:
        resource_locale = resource.get('locale', DEFAULT_LOCALE)
        missing_tokens = resource.get('missing_tokens',
                                      DEFAULT_MISSING_TOKENS)

    fields = get_fields(resource)
    if resource_type in RESOURCES_WITH_FIELDS:
        # Check whether there's an objective id
        objective_column = None
        if resource_type == DATASET_PATH:
            objective_column = resource.get( \
                'objective_field', {}).get('id')
            if errors:
                field_errors = resource.get("status", {}).get("field_errors")
        elif resource_type in SUPERVISED_PATHS and \
                resource_type != FUSION_PATH:
            objective_id = resource.get( \
                'objective_fields', [None])[0]
            objective_column = fields.get( \
                objective_id, {}).get('column_number')
        result = fields, resource_locale, missing_tokens, objective_column
        if errors:
            result = result + (field_errors,)
        return result
    return (None, None, None, None, None) if errors else \
        (None, None, None, None)
Example #22
def retrieve_resource(api, resource_id, query_string=ONLY_MODEL,
                      no_check_fields=False):
    """ Retrieves resource info either from a local repo or
        from the remote server

    """
    if api.storage is not None:
        try:
            stored_resource = "%s%s%s" % (api.storage, os.sep,
                                          resource_id.replace("/", "_"))
            with open(stored_resource) as resource_file:
                resource = json.loads(resource_file.read())
            # we check that the stored resource has enough fields information
            # for local predictions to work. Otherwise we should retrieve it.
            if no_check_fields or check_model_fields(resource):
                return resource
        except ValueError:
            raise ValueError("The file %s contains no JSON")
        except IOError:
            pass
    api_getter = api.getters[get_resource_type(resource_id)]
    resource = check_resource(resource_id, api_getter, query_string)
    return resource
Example #23
    def predict_probability(self, input_data,
                            missing_strategy=LAST_PREDICTION,
                            compact=False):

        """For classification models, Predicts a probability for
        each possible output class, based on input values.  The input
        fields must be a dictionary keyed by field name or field ID.

        For regressions, the output is a single element list
        containing the prediction.

        :param input_data: Input data to be predicted
        :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                                 for missing fields
        :param compact: If False, the prediction is returned as a list of
                        maps, one per class, with the keys "category" and
                        "probability" mapped to the class name and its
                        probability, respectively. If True, returns a list of
                        probabilities ordered by the sorted class names.
        """
        votes = MultiVoteList([])
        if not self.missing_numerics:
            check_no_missing_numerics(input_data, self.fields)

        for models_split in self.models_splits:
            models = []
            for model in models_split:
                if get_resource_type(model) == "fusion":
                    models.append(Fusion(model, api=self.api))
                else:
                    models.append(SupervisedModel(model, api=self.api))
            votes_split = []
            for model in models:
                try:
                    prediction = model.predict_probability( \
                        input_data,
                        missing_strategy=missing_strategy,
                        compact=True)

                except ValueError:
                    # logistic regressions can raise this error if they
                    # have missing_numerics=False and some numeric missings
                    # are found
                    continue
                if self.regression:
                    prediction = prediction[0]
                    if self.weights is not None:
                        prediction = self.weigh(prediction, model.resource_id)
                else:
                    if self.weights is not None:
                        prediction = self.weigh( \
                            prediction, model.resource_id)
                    # we need to check that all classes in the fusion
                    # are also in the composing model
                    if not self.regression and \
                            self.class_names != model.class_names:
                        try:
                            prediction = rearrange_prediction( \
                                model.class_names,
                                self.class_names,
                                prediction)
                        except AttributeError:
                            # class_names should be defined, but just in case
                            pass
                votes_split.append(prediction)

            votes.extend(votes_split)
        if self.regression:
            total_weight = len(votes.predictions) if self.weights is None \
                else sum(self.weights)
            prediction = sum([prediction for prediction in \
                votes.predictions]) / float(total_weight)
            if compact:
                output = [prediction]
            else:
                output = {"prediction": prediction}

        else:
            output = votes.combine_to_distribution(normalize=True)
            if not compact:
                output = [{'category': class_name,
                           'probability': probability}
                          for class_name, probability in
                          zip(self.class_names, output)]

        return output
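An illustration of the `compact` flag for a classification fusion, assuming `fusion` is an instance of the class this method belongs to; the field names and the output values in the comments are placeholders:

input_data = {"petal length": 4.5, "petal width": 1.5}   # placeholder fields

# compact=False (default): one map per class, e.g.
# [{"category": "Iris-versicolor", "probability": 0.93}, ...]
detailed = fusion.predict_probability(input_data)

# compact=True: bare probabilities in the sorted order of the class names,
# e.g. [0.02, 0.93, 0.05]
probabilities = fusion.predict_probability(input_data, compact=True)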
Example #24
    def __init__(self, fusion, api=None, max_models=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.resource_id = None
        self.model_ids = None
        self.objective_id = None
        self.distribution = None
        self.models_splits = []
        self.cache_get = None
        self.regression = False
        self.fields = None
        self.class_names = None
        self.importance = {}

        self.resource_id, fusion = get_resource_dict( \
            fusion, "fusion", api=self.api)

        if 'object' in fusion:
            fusion = fusion.get('object', {})
        self.model_ids, self.weights = get_models_weight( \
            fusion['models'])
        model_types = [get_resource_type(model) for model in self.model_ids]

        for model_type in model_types:
            if model_type not in LOCAL_SUPERVISED:
                raise ValueError("The resource %s has not an allowed"
                                 " supervised model type.")
        self.importance = fusion.get('importance', [])
        self.missing_numerics = fusion.get('missing_numerics', True)
        if fusion.get('fusion'):
            self.fields = fusion.get( \
                'fusion', {}).get("fields")
            self.objective_id = fusion.get("objective_field")

        number_of_models = len(self.model_ids)

        # Downloading the model information to cache it
        if self.api.storage is not None:
            for model_id in self.model_ids:
                if get_resource_type(model_id) == "fusion":
                    Fusion(model_id, api=self.api)
                else:
                    SupervisedModel(model_id, api=self.api)

        if max_models is None:
            self.models_splits = [self.model_ids]
        else:
            self.models_splits = [self.model_ids[index:(index + max_models)]
                                  for index
                                  in range(0, number_of_models, max_models)]

        if self.fields:
            summary = self.fields[self.objective_id]['summary']
            if 'bins' in summary:
                distribution = summary['bins']
            elif 'counts' in summary:
                distribution = summary['counts']
            elif 'categories' in summary:
                distribution = summary['categories']
            else:
                distribution = []
            self.distribution = distribution

        self.regression = \
            self.fields[self.objective_id].get('optype') == 'numeric'

        if not self.regression:
            objective_field = self.fields[self.objective_id]
            categories = objective_field['summary']['categories']
            classes = [category[0] for category in categories]
            self.class_names = sorted(classes)
            self.objective_categories = [category for \
                category, _ in self.fields[self.objective_id][ \
               "summary"]["categories"]]

        ModelFields.__init__( \
            self, self.fields,
            objective_id=self.objective_id)
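The `max_models` argument only chunks the list of component model ids into fixed-size batches; the slicing above can be checked in isolation with made-up ids:

model_ids = ["model/aaa", "model/bbb", "model/ccc", "model/ddd", "model/eee"]
max_models = 2
models_splits = [model_ids[index:(index + max_models)]
                 for index in range(0, len(model_ids), max_models)]
print(models_splits)
# [['model/aaa', 'model/bbb'], ['model/ccc', 'model/ddd'], ['model/eee']]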
Example #25
def retrain_model(args, api, command, session_file=None):
    """Retrieve or create the retrain script for a model and
    execute it with the newly provided data

    """

    retrain_file = os.path.join(BIGMLER_SCRIPTS_DIRECTORY,
                                "retrain",
                                "scripts")
    try:
        os.remove(UPGRADE_FILE)
        reify_script = None
        try:
            shutil.rmtree(BIGMLER_SCRIPTS_DIRECTORY)
        except OSError:
            pass
    except OSError:
        # look for the script that creates the rebuild script.
        reify_script = get_script_id(retrain_file)

    if reify_script is None:
        # new bigmler command: creating the scriptify scripts
        whizzml_command = ['whizzml',
                           '--package-dir', INCREMENTAL_PACKAGE_PATH,
                           '--output-dir', BIGMLER_SCRIPTS_DIRECTORY]
        add_api_context(whizzml_command, args)
        whizzml_dispatcher(args=whizzml_command)
        reify_script = get_script_id(retrain_file)

    # retrieve the modeling resource to be retrained by tag or id
    if args.resource_id:
        resource_id = args.resource_id
        reference_tag = "retrain:%s" % resource_id
    else:
        for model_type in MODEL_TYPES:
            if hasattr(args, "%s_tag" % model_type) and \
                    getattr(args, "%s_tag" % model_type) is not None:
                tag = getattr(args, "%s_tag" % model_type)
                query_string = "tags=%s" % tag
                resource_id = get_first_resource( \
                    model_type.replace("_", ""),
                    api=api,
                    query_string=query_string)
                if resource_id is None:
                    sys.exit("Failed to find the %s with tag %s. "
                             "Please, check the tag and"
                             " the connection info (domain and credentials)." %
                             (model_type.replace("_", " "), tag))
                reference_tag = tag
                break
    # updating the dataset that generated the model with the reference tag
    model = api.getters[get_resource_type(resource_id)](resource_id)
    dataset_id = model["object"]["dataset"]
    dataset = api.get_dataset(dataset_id)
    tags = dataset["object"]["tags"]
    if reference_tag not in tags:
        tags.append(reference_tag)
        api.update_dataset(dataset_id, {"tags": tags})

    # if --upgrade, we force rebuilding the scriptified script
    if args.upgrade:
        script_id = None
    else:
        # check for the last script used to retrain the model
        query_string = "tags=%s" % reference_tag
        script_id = get_last_resource( \
            "script",
            api=api,
            query_string=query_string)

    if script_id is None:
        # if the script to retrain does not exist:

        # check whether the model exists
        try:
            bigml.api.check_resource(resource_id, raise_on_error=True, api=api)
        except Exception:
            sys.exit("Failed to find the model %s. Please, check its ID and"
                     " the connection info (domain and credentials)." %
                     resource_id)

        # new bigmler command: creating the retrain script
        execute_command = ['execute',
                           '--script', reify_script,
                           '--tag', reference_tag,
                           '--output-dir', args.output_dir]
        command.propagate(execute_command)
        command_args, _, _, exe_session_file, _ = get_context(execute_command,
                                                              EXE_SETTINGS)
        command_args.arguments_ = [["model-resource", resource_id]]
        command_args.inputs = json.dumps(command_args.arguments_)

        # process the command
        execute_whizzml(command_args, api, session_file)
        script_id = extract_retrain_id(command_args, api, session_file)