def __init__(self, anomaly, api=None):

    self.resource_id = None
    self.sample_size = None
    self.input_fields = None
    self.mean_depth = None
    self.expected_mean_depth = None
    self.iforest = None
    self.top_anomalies = None
    self.id_fields = []
    if not (isinstance(anomaly, dict) and 'resource' in anomaly and
            anomaly['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_anomaly_id(anomaly)
        if self.resource_id is None:
            raise Exception(api.error_message(anomaly,
                                              resource_type='anomaly',
                                              method='get'))
        query_string = ONLY_MODEL
        anomaly = retrieve_resource(api, self.resource_id,
                                    query_string=query_string)
    else:
        self.resource_id = get_anomaly_id(anomaly)
    if 'object' in anomaly and isinstance(anomaly['object'], dict):
        anomaly = anomaly['object']
        self.sample_size = anomaly.get('sample_size')
        self.input_fields = anomaly.get('input_fields')
        self.id_fields = anomaly.get('id_fields', [])
    if 'model' in anomaly and isinstance(anomaly['model'], dict):
        ModelFields.__init__(self, anomaly['model'].get('fields'))
        if ('top_anomalies' in anomaly['model'] and
                isinstance(anomaly['model']['top_anomalies'], list)):
            self.mean_depth = anomaly['model'].get('mean_depth')
            status = get_status(anomaly)
            if 'code' in status and status['code'] == FINISHED:
                self.expected_mean_depth = None
                if self.mean_depth is None or self.sample_size is None:
                    raise Exception("The anomaly data is not complete."
                                    " Score will not be available.")
                else:
                    default_depth = (
                        2 * (DEPTH_FACTOR +
                             math.log(self.sample_size - 1) -
                             (float(self.sample_size - 1) /
                              self.sample_size)))
                    self.expected_mean_depth = min(self.mean_depth,
                                                   default_depth)
                iforest = anomaly['model'].get('trees', [])
                if iforest:
                    self.iforest = [
                        AnomalyTree(anomaly_tree['root'], self.fields)
                        for anomaly_tree in iforest]
                self.top_anomalies = anomaly['model']['top_anomalies']
            else:
                raise Exception("The anomaly isn't finished yet")
        else:
            raise Exception("Cannot create the Anomaly instance. Could not"
                            " find the 'top_anomalies' key in the"
                            " resource:\n\n%s" % anomaly['model'].keys())
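# Hedged usage sketch (not part of the original source): once built, the
# local Anomaly scores inputs offline. The resource id and the input field
# name below are hypothetical placeholders.
from bigml.api import BigML
from bigml.anomaly import Anomaly

local_anomaly = Anomaly("anomaly/5126965515526876630001b2", api=BigML())
# returns a score in [0, 1]; higher values are more anomalous
print(local_anomaly.anomaly_score({"src_bytes": 350}))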
def get_fusion_resource(self, fusion):
    """Extracts the fusion resource info. The fusion argument can be
       - a path to a local file
       - a fusion id
    """
    # the string can be a path to a JSON file
    if isinstance(fusion, basestring):
        try:
            with open(fusion) as fusion_file:
                fusion = json.load(fusion_file)
                self.resource_id = get_fusion_id(fusion)
                if self.resource_id is None:
                    raise ValueError("The JSON file does not seem"
                                     " to contain a valid BigML fusion"
                                     " representation.")
        except IOError:
            # if it is not a path, it can be a fusion id
            self.resource_id = get_fusion_id(fusion)
            if self.resource_id is None:
                if fusion.find('fusion/') > -1:
                    raise Exception(
                        self.api.error_message(fusion,
                                               resource_type='fusion',
                                               method='get'))
                else:
                    raise IOError("Failed to open the expected JSON file"
                                  " at %s" % fusion)
        except ValueError:
            raise ValueError("Failed to interpret %s."
                             " JSON file expected." % fusion)
    if not isinstance(fusion, dict):
        fusion = retrieve_resource(self.api, self.resource_id,
                                   no_check_fields=False)
    return fusion
def __init__(self, cluster, api=None):

    if not (isinstance(cluster, dict) and 'resource' in cluster and
            cluster['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_cluster_id(cluster)
        if self.resource_id is None:
            raise Exception(api.error_message(cluster,
                                              resource_type='cluster',
                                              method='get'))
        query_string = ONLY_MODEL
        cluster = retrieve_resource(api, self.resource_id,
                                    query_string=query_string)
    if 'object' in cluster and isinstance(cluster['object'], dict):
        cluster = cluster['object']
    if 'clusters' in cluster and isinstance(cluster['clusters'], dict):
        status = get_status(cluster)
        if 'code' in status and status['code'] == FINISHED:
            clusters = cluster['clusters']['clusters']
            self.centroids = [Centroid(centroid) for centroid in clusters]
            self.scales = {}
            self.scales.update(cluster['scales'])
            self.term_forms = {}
            self.tag_clouds = {}
            self.term_analysis = {}
            fields = cluster['clusters']['fields']
            for field_id, field in fields.items():
                if field['optype'] == 'text':
                    self.term_forms[field_id] = {}
                    self.term_forms[field_id].update(
                        field['summary']['term_forms'])
                    self.tag_clouds[field_id] = {}
                    self.tag_clouds[field_id].update(
                        field['summary']['tag_cloud'])
                    self.term_analysis[field_id] = {}
                    self.term_analysis[field_id].update(
                        field['term_analysis'])
            ModelFields.__init__(self, fields)
            if not all([field_id in self.fields for
                        field_id in self.scales]):
                raise Exception("Some fields are missing"
                                " to generate a local cluster."
                                " Please, provide a cluster with"
                                " the complete list of fields.")
        else:
            raise Exception("The cluster isn't finished yet")
    else:
        raise Exception("Cannot create the Cluster instance. Could not"
                        " find the 'clusters' key in the resource:\n\n%s" %
                        cluster)
def get_resource(self, resource_id):
    """Auxiliary method to retrieve resources. The query string ensures
       low bandwidth usage and the full fields structure.
    """
    if (resource_id and not isinstance(resource_id, basestring) and
            isinstance(resource_id, list)):
        resource_id = resource_id[0]
    try:
        resource = retrieve_resource(self.api, resource_id,
                                     query_string=GET_QS).get('object')
        return resource
    except ValueError:
        sys.exit("We could not reify the resource. Failed to find"
                 " information for %s in the"
                 " creation chain." % resource_id)
def __init__(self, model, api=None):

    if not (isinstance(model, dict) and "resource" in model and
            model["resource"] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_model_id(model)
        if self.resource_id is None:
            raise Exception(api.error_message(model,
                                              resource_type="model",
                                              method="get"))
        query_string = ONLY_MODEL
        model = retrieve_resource(api, self.resource_id,
                                  query_string=query_string)
    BaseModel.__init__(self, model, api=api)
    if "object" in model and isinstance(model["object"], dict):
        model = model["object"]
    if "model" in model and isinstance(model["model"], dict):
        status = get_status(model)
        if "code" in status and status["code"] == FINISHED:
            distribution = model["model"]["distribution"]["training"]
            self.ids_map = {}
            self.tree = Tree(
                model["model"]["root"],
                self.fields,
                objective_field=self.objective_id,
                root_distribution=distribution,
                parent_id=None,
                ids_map=self.ids_map,
            )
            self.terms = {}
        else:
            raise Exception("The model isn't finished yet")
    else:
        raise Exception(
            "Cannot create the Model instance. Could not"
            " find the 'model' key in the resource:\n\n%s" % model
        )
    if self.tree.regression:
        try:
            import numpy
            import scipy

            self.regression_ready = True
        except ImportError:
            self.regression_ready = False
def __init__(self, cluster, api=None):

    self.resource_id = None
    self.centroids = None
    self.cluster_global = None
    self.total_ss = None
    self.within_ss = None
    self.between_ss = None
    self.ratio_ss = None
    self.critical_value = None
    self.k = None
    self.scales = {}
    self.term_forms = {}
    self.tag_clouds = {}
    self.term_analysis = {}
    self.item_analysis = {}
    self.items = {}
    if not (isinstance(cluster, dict) and 'resource' in cluster and
            cluster['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_cluster_id(cluster)
        if self.resource_id is None:
            raise Exception(api.error_message(cluster,
                                              resource_type='cluster',
                                              method='get'))
        query_string = ONLY_MODEL
        cluster = retrieve_resource(api, self.resource_id,
                                    query_string=query_string)
    else:
        self.resource_id = get_cluster_id(cluster)
    if 'object' in cluster and isinstance(cluster['object'], dict):
        cluster = cluster['object']
    if 'clusters' in cluster and isinstance(cluster['clusters'], dict):
        status = get_status(cluster)
        if 'code' in status and status['code'] == FINISHED:
            the_clusters = cluster['clusters']
            cluster_global = the_clusters.get('global')
            clusters = the_clusters['clusters']
            self.centroids = [Centroid(centroid) for centroid in clusters]
            self.cluster_global = cluster_global
            if cluster_global:
                self.cluster_global = Centroid(cluster_global)
                # "global" has no "name" and "count", so we set them
                self.cluster_global.name = GLOBAL_CLUSTER_LABEL
                self.cluster_global.count = \
                    self.cluster_global.distance['population']
            self.total_ss = the_clusters.get('total_ss')
            self.within_ss = the_clusters.get('within_ss')
            if not self.within_ss:
                self.within_ss = sum(centroid.distance['sum_squares'] for
                                     centroid in self.centroids)
            self.between_ss = the_clusters.get('between_ss')
            self.ratio_ss = the_clusters.get('ratio_ss')
            self.critical_value = cluster.get('critical_value', None)
            self.k = cluster.get('k')
            self.scales.update(cluster['scales'])
            self.term_forms = {}
            self.tag_clouds = {}
            self.term_analysis = {}
            fields = cluster['clusters']['fields']
            summary_fields = cluster['summary_fields']
            for field_id in summary_fields:
                del fields[field_id]
            for field_id, field in fields.items():
                if field['optype'] == 'text':
                    self.term_forms[field_id] = {}
                    self.term_forms[field_id].update(
                        field['summary']['term_forms'])
                    self.tag_clouds[field_id] = {}
                    self.tag_clouds[field_id].update(
                        field['summary']['tag_cloud'])
                    self.term_analysis[field_id] = {}
                    self.term_analysis[field_id].update(
                        field['term_analysis'])
                if field['optype'] == 'items':
                    self.items[field_id] = {}
                    self.items[field_id].update(
                        dict(field['summary']['items']))
                    self.item_analysis[field_id] = {}
                    self.item_analysis[field_id].update(
                        field['item_analysis'])
            ModelFields.__init__(self, fields)
            if not all([field_id in self.fields for
                        field_id in self.scales]):
                raise Exception("Some fields are missing"
                                " to generate a local cluster."
                                " Please, provide a cluster with"
                                " the complete list of fields.")
        else:
            raise Exception("The cluster isn't finished yet")
    else:
        raise Exception("Cannot create the Cluster instance. Could not"
                        " find the 'clusters' key in the resource:\n\n%s" %
                        cluster)
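# Hedged usage sketch (assumed, not from the original source): the local
# Cluster assigns the closest centroid offline. The resource id and the
# field names below are hypothetical placeholders.
from bigml.api import BigML
from bigml.cluster import Cluster

local_cluster = Cluster("cluster/5126965515526876630001b2", api=BigML())
# returns the closest centroid (and its distance) for the input data
print(local_cluster.centroid({"petal length": 4.4, "petal width": 1.3,
                              "sepal length": 5.1, "sepal width": 2.1,
                              "species": "Iris-versicolor"}))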
def __init__(self, topic_model, api=None):

    self.resource_id = None
    self.stemmer = None
    self.seed = None
    self.case_sensitive = False
    self.bigrams = False
    self.ntopics = None
    self.temp = None
    self.phi = None
    self.term_to_index = None
    self.topics = []
    if not (isinstance(topic_model, dict) and 'resource' in topic_model and
            topic_model['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_topic_model_id(topic_model)
        if self.resource_id is None:
            raise Exception(api.error_message(topic_model,
                                              resource_type='topicmodel',
                                              method='get'))
        query_string = ONLY_MODEL
        topic_model = retrieve_resource(api, self.resource_id,
                                        query_string=query_string)
    else:
        self.resource_id = get_topic_model_id(topic_model)
    if 'object' in topic_model and isinstance(topic_model['object'], dict):
        topic_model = topic_model['object']
    if 'topic_model' in topic_model \
            and isinstance(topic_model['topic_model'], dict):
        status = get_status(topic_model)
        if 'code' in status and status['code'] == FINISHED:
            model = topic_model['topic_model']
            self.topics = model['topics']
            if 'language' in model and model['language'] is not None:
                lang = model['language']
                if lang in CODE_TO_NAME:
                    self.stemmer = Stemmer.Stemmer(CODE_TO_NAME[lang])
            self.term_to_index = {self.stem(term): index
                                  for index, term
                                  in enumerate(model['termset'])}
            self.seed = abs(model['hashed_seed'])
            self.case_sensitive = model['case_sensitive']
            self.bigrams = model['bigrams']
            self.ntopics = len(model['term_topic_assignments'][0])
            self.alpha = model['alpha']
            self.ktimesalpha = self.ntopics * self.alpha
            self.temp = [0] * self.ntopics
            assignments = model['term_topic_assignments']
            beta = model['beta']
            nterms = len(self.term_to_index)
            sums = [sum(n[index] for n in assignments) for index
                    in range(self.ntopics)]
            self.phi = [[0 for _ in range(nterms)]
                        for _ in range(self.ntopics)]
            for k in range(self.ntopics):
                norm = sums[k] + nterms * beta
                for w in range(nterms):
                    self.phi[k][w] = (assignments[w][k] + beta) / norm
            ModelFields.__init__(self, model['fields'])
        else:
            raise Exception("The topic model isn't finished yet")
    else:
        raise Exception("Cannot create the topic model instance. Could not"
                        " find the 'topic_model' key in the"
                        " resource:\n\n%s" % topic_model)
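# Hedged usage sketch (assumed): with the phi matrix built above, the local
# TopicModel computes topic probabilities for new text without an API call.
# The resource id and the field id below are hypothetical placeholders.
from bigml.api import BigML
from bigml.topicmodel import TopicModel

local_topic_model = TopicModel("topicmodel/5126965515526876630001b2",
                               api=BigML())
# returns one probability per topic for the given input text
print(local_topic_model.distribution({"000001": "hello, world"}))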
def __init__(self, logistic_regression, api=None):

    self.resource_id = None
    self.class_names = None
    self.input_fields = []
    self.term_forms = {}
    self.tag_clouds = {}
    self.term_analysis = {}
    self.items = {}
    self.item_analysis = {}
    self.categories = {}
    self.coefficients = {}
    self.data_field_types = {}
    self.field_codings = {}
    self.numeric_fields = {}
    self.bias = None
    self.missing_numerics = None
    self.c = None
    self.eps = None
    self.lr_normalize = None
    self.balance_fields = None
    self.regularization = None
    old_coefficients = False
    # checks whether the information needed for local predictions is in
    # the first argument
    if isinstance(logistic_regression, dict) and \
            not check_model_fields(logistic_regression):
        # if the fields used by the logistic regression are not
        # available, use only ID to retrieve it again
        logistic_regression = get_logistic_regression_id( \
            logistic_regression)
        self.resource_id = logistic_regression
    if not (isinstance(logistic_regression, dict) and
            'resource' in logistic_regression and
            logistic_regression['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_logistic_regression_id(logistic_regression)
        if self.resource_id is None:
            raise Exception(
                api.error_message(logistic_regression,
                                  resource_type='logistic_regression',
                                  method='get'))
        query_string = ONLY_MODEL
        logistic_regression = retrieve_resource(
            api, self.resource_id, query_string=query_string)
    else:
        self.resource_id = get_logistic_regression_id(logistic_regression)
    if 'object' in logistic_regression and \
            isinstance(logistic_regression['object'], dict):
        logistic_regression = logistic_regression['object']
    try:
        self.input_fields = logistic_regression.get("input_fields", [])
        self.dataset_field_types = logistic_regression.get(
            "dataset_field_types", {})
        objective_field = logistic_regression['objective_fields'] if \
            logistic_regression['objective_fields'] else \
            logistic_regression['objective_field']
    except KeyError:
        raise ValueError("Failed to find the logistic regression expected"
                         " JSON structure. Check your arguments.")
    if 'logistic_regression' in logistic_regression and \
            isinstance(logistic_regression['logistic_regression'], dict):
        status = get_status(logistic_regression)
        if 'code' in status and status['code'] == FINISHED:
            logistic_regression_info = logistic_regression[ \
                'logistic_regression']
            fields = logistic_regression_info.get('fields', {})
            if not self.input_fields:
                # `fields` is used here: `self.fields` is only set later by
                # ModelFields.__init__
                self.input_fields = [ \
                    field_id for field_id, _ in
                    sorted(fields.items(),
                           key=lambda x: x[1].get("column_number"))]
            self.coefficients.update(logistic_regression_info.get( \
                'coefficients', []))
            if not isinstance(self.coefficients.values()[0][0], list):
                old_coefficients = True
            self.bias = logistic_regression_info.get('bias', True)
            self.c = logistic_regression_info.get('c')
            self.eps = logistic_regression_info.get('eps')
            self.lr_normalize = logistic_regression_info.get('normalize')
            self.balance_fields = logistic_regression_info.get( \
                'balance_fields')
            self.regularization = logistic_regression_info.get( \
                'regularization')
            # old models have no such attribute, so we set it to False in
            # this case
            self.missing_numerics = logistic_regression_info.get( \
                'missing_numerics', False)
            objective_id = extract_objective(objective_field)
            ModelFields.__init__(
                self, fields,
                objective_id=objective_id, terms=True, categories=True,
                numerics=True)
            self.field_codings = logistic_regression_info.get( \
                'field_codings', {})
            self.format_field_codings()
            # iterate over a copy of the keys: the dict is modified inside
            for field_id in self.field_codings.keys():
                if field_id not in fields and \
                        field_id in self.inverted_fields:
                    self.field_codings.update( \
                        {self.inverted_fields[field_id]: \
                         self.field_codings[field_id]})
                    del self.field_codings[field_id]
            if old_coefficients:
                self.map_coefficients()
            categories = self.fields[self.objective_id].get( \
                "summary", {}).get('categories')
            if len(self.coefficients.keys()) > len(categories):
                self.class_names = [""]
            else:
                self.class_names = []
            self.class_names.extend(
                sorted([category[0] for category in categories]))
        else:
            raise Exception("The logistic regression isn't finished yet")
    else:
        raise Exception("Cannot create the LogisticRegression instance."
                        " Could not find the 'logistic_regression' key"
                        " in the resource:\n\n%s" % logistic_regression)
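# Hedged usage sketch (assumed): the local LogisticRegression predicts
# offline from the coefficients loaded above. The resource id and the field
# names are hypothetical placeholders.
from bigml.api import BigML
from bigml.logistic import LogisticRegression

local_log_regression = LogisticRegression(
    "logisticregression/5126965515526876630001b2", api=BigML())
print(local_log_regression.predict({"petal length": 2, "petal width": 0.5,
                                    "sepal length": 1.5,
                                    "sepal width": 0.7}))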
def __init__(self, time_series, api=None):

    self.resource_id = None
    self.input_fields = []
    self.objective_fields = []
    self.all_numeric_objectives = False
    self.period = 1
    self.ets_models = {}
    self.error = None
    self.damped_trend = None
    self.seasonality = None
    self.trend = None
    self.time_range = {}
    self.field_parameters = {}
    self._forecast = {}
    # checks whether the information needed for local predictions is in
    # the first argument
    if isinstance(time_series, dict) and \
            not check_model_fields(time_series):
        # if the fields used by the time series are not
        # available, use only ID to retrieve it again
        time_series = get_time_series_id( \
            time_series)
        self.resource_id = time_series
    if not (isinstance(time_series, dict) and 'resource' in time_series and
            time_series['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_time_series_id(time_series)
        if self.resource_id is None:
            raise Exception(
                api.error_message(time_series,
                                  resource_type='time_series',
                                  method='get'))
        query_string = ONLY_MODEL
        time_series = retrieve_resource(api, self.resource_id,
                                        query_string=query_string)
    else:
        self.resource_id = get_time_series_id(time_series)
    if 'object' in time_series and \
            isinstance(time_series['object'], dict):
        time_series = time_series['object']
    try:
        self.input_fields = time_series.get("input_fields", [])
        self._forecast = time_series.get("forecast")
        self.objective_fields = time_series.get("objective_fields", [])
        objective_field = time_series['objective_field'] if \
            time_series.get('objective_field') else \
            time_series['objective_fields']
    except KeyError:
        raise ValueError("Failed to find the time series expected "
                         "JSON structure. Check your arguments.")
    if 'time_series' in time_series and \
            isinstance(time_series['time_series'], dict):
        status = get_status(time_series)
        if 'code' in status and status['code'] == FINISHED:
            time_series_info = time_series['time_series']
            fields = time_series_info.get('fields', {})
            self.fields = fields
            if not self.input_fields:
                self.input_fields = [ \
                    field_id for field_id, _ in
                    sorted(self.fields.items(),
                           key=lambda x: x[1].get("column_number"))]
            self.all_numeric_objectives = time_series_info.get( \
                'all_numeric_objectives')
            self.period = time_series_info.get('period', 1)
            self.ets_models = time_series_info.get('ets_models', {})
            self.error = time_series_info.get('error')
            self.damped_trend = time_series_info.get('damped_trend')
            self.seasonality = time_series_info.get('seasonality')
            self.trend = time_series_info.get('trend')
            self.time_range = time_series_info.get('time_range')
            self.field_parameters = time_series_info.get( \
                'field_parameters', {})
            objective_id = extract_objective(objective_field)
            ModelFields.__init__(
                self, fields,
                objective_id=objective_id)
        else:
            raise Exception("The time series isn't finished yet")
    else:
        raise Exception("Cannot create the TimeSeries instance."
                        " Could not find the 'time_series' key"
                        " in the resource:\n\n%s" %
                        time_series)
    model, csv_properties, fields = u.read_local_resource(
        args.model_file,
        csv_properties=csv_properties)
    models = [model]
    model_ids = [model['resource']]
    ensemble_ids = []
elif args.ensemble_file:
    # model is retrieved from the contents of the given local JSON file
    ensemble, csv_properties, fields = u.read_local_resource(
        args.ensemble_file,
        csv_properties=csv_properties)
    model_ids = ensemble['object']['models'][:]
    ensemble_ids = [ensemble['resource']]
    models = model_ids[:]
    model = retrieve_resource(bigml.api.BigML(storage='./storage'),
                              models[0],
                              query_string=r.ALL_FIELDS_QS)
    models[0] = model
else:
    # model is retrieved from the remote object
    models, model_ids, ensemble_ids, resume = pm.models_processing(
        datasets, models, model_ids,
        api, args, resume, fields=fields,
        session_file=session_file, path=path, log=log,
        labels=labels, multi_label_data=multi_label_data,
        other_label=other_label)

if models:
    model = models[0]
    single_model = len(models) == 1
# If multi-label flag is set and no training_set was provided, label
# info is extracted from the user_metadata. If models belong to an
def __init__(self, ensemble, model_fns_dir, api=None):

    if api is None:
        self.api = BigML(storage=STORAGE)
    else:
        self.api = api
    self.resource_id = None  # to be deprecated
    self.ensemble_id = None
    self.objective_id = None
    self.distributions = None
    self.distribution = None
    self.models_splits = []
    self.multi_model = None
    self.boosting = None
    self.boosting_offsets = None
    self.regression = False
    self.fields = None
    self.class_names = None
    self.importance = {}
    self.predict_functions = []
    ensemble = self.get_ensemble_resource(ensemble)
    self.resource_id = get_ensemble_id(ensemble)
    self.ensemble_id = self.resource_id
    if lacks_info(ensemble, inner_key="ensemble"):
        # avoid checking fields because of old ensembles
        ensemble = retrieve_resource(self.api, self.resource_id,
                                     no_check_fields=True)
    if ensemble['object'].get('type') == BOOSTING:
        self.boosting = ensemble['object'].get('boosting')
    models = ensemble['object']['models']
    self.distributions = ensemble['object'].get('distributions', [])
    self.importance = ensemble['object'].get('importance', [])
    self.model_ids = models
    # new ensembles have the fields structure
    if ensemble['object'].get('ensemble'):
        self.fields = ensemble['object'].get( \
            'ensemble', {}).get("fields")
        self.objective_id = ensemble['object'].get("objective_field")
        self.input_fields = ensemble['object'].get("input_fields")
    if model_fns_dir:
        self.get_model_fns(model_fns_dir)
    else:
        raise ValueError("The EnsemblePredictor object expects as"
                         " argument the directory where the models'"
                         " predict functions are stored. To generate"
                         " them, please check the 'bigmler export'"
                         " command.")
    if self.fields:
        summary = self.fields[self.objective_id]['summary']
        if 'bins' in summary:
            distribution = summary['bins']
        elif 'counts' in summary:
            distribution = summary['counts']
        elif 'categories' in summary:
            distribution = summary['categories']
        else:
            distribution = []
        self.distribution = distribution
        self.regression = \
            self.fields[self.objective_id].get('optype') == 'numeric'
    if self.boosting:
        self.boosting_offsets = ensemble['object'].get('initial_offset', 0) \
            if self.regression else dict(ensemble['object'].get( \
            'initial_offsets', []))
    if not self.regression and self.boosting is None:
        try:
            objective_field = self.fields[self.objective_id]
            categories = objective_field['summary']['categories']
            classes = [category[0] for category in categories]
        except (AttributeError, KeyError):
            classes = set()
            for distribution in self.distributions:
                for category in distribution['training']['categories']:
                    classes.add(category[0])
        self.class_names = sorted(classes)
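# Hedged usage sketch (assumed): the EnsemblePredictor combines the
# per-model predict functions exported by `bigmler export`. The ensemble id
# and the functions directory are hypothetical placeholders.
from bigml.api import BigML
from bigml.ensemblepredictor import EnsemblePredictor

local_ensemble = EnsemblePredictor("ensemble/5126965515526876630001b2",
                                   "./model_fns_dir", api=BigML())
print(local_ensemble.predict({"petal length": 3, "petal width": 1}))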
def __init__(self, association, api=None):

    self.resource_id = None
    self.complement = None
    self.discretization = {}
    self.field_discretizations = {}
    self.items = []
    self.max_k = None
    self.max_lhs = None
    self.min_confidence = None
    self.min_leverage = None
    self.min_support = None
    self.min_lift = None
    self.search_strategy = DEFAULT_SEARCH_STRATEGY
    self.rules = []
    self.significance_level = None
    if not (isinstance(association, dict) and 'resource' in association and
            association['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_association_id(association)
        if self.resource_id is None:
            raise Exception(api.error_message(association,
                                              resource_type='association',
                                              method='get'))
        query_string = ONLY_MODEL
        association = retrieve_resource(api, self.resource_id,
                                        query_string=query_string)
    else:
        self.resource_id = get_association_id(association)
    if 'object' in association and isinstance(association['object'], dict):
        association = association['object']
    if 'associations' in association and \
            isinstance(association['associations'], dict):
        status = get_status(association)
        if 'code' in status and status['code'] == FINISHED:
            associations = association['associations']
            fields = associations['fields']
            ModelFields.__init__(self, fields)
            self.complement = associations.get('complement', False)
            self.discretization = associations.get('discretization', {})
            self.field_discretizations = associations.get(
                'field_discretizations', {})
            self.items = [Item(index, item, fields) for index, item in
                          enumerate(associations.get('items', []))]
            self.max_k = associations.get('max_k', 100)
            self.max_lhs = associations.get('max_lhs', 4)
            self.min_confidence = associations.get('min_confidence', 0)
            self.min_leverage = associations.get('min_leverage', -1)
            self.min_support = associations.get('min_support', 0)
            self.min_lift = associations.get('min_lift', 0)
            self.search_strategy = associations.get('search_strategy', \
                DEFAULT_SEARCH_STRATEGY)
            self.rules = [AssociationRule(rule) for rule in
                          associations.get('rules', [])]
            self.significance_level = associations.get(
                'significance_level', 0.05)
        else:
            raise Exception("The association isn't finished yet")
    else:
        raise Exception("Cannot create the Association instance. Could not"
                        " find the 'associations' key in the "
                        "resource:\n\n%s" % association)
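# Hedged usage sketch (assumed): the local Association can list the rules
# whose antecedents match some input data. The resource id and the field id
# and value are hypothetical placeholders.
from bigml.api import BigML
from bigml.association import Association

local_association = Association("association/5126965515526876630001b2",
                                api=BigML())
# returns the consequents of the rules matching the input items
print(local_association.association_set({"000000": "cat food"}))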
def __init__(self, time_series, api=None):

    self.resource_id = None
    self.input_fields = []
    self.objective_fields = []
    self.all_numeric_objectives = False
    self.period = 1
    self.ets_models = {}
    self.error = None
    self.damped_trend = None
    self.seasonality = None
    self.trend = None
    self.time_range = {}
    self.field_parameters = {}
    self._forecast = []
    # checks whether the information needed for local predictions is in
    # the first argument
    if isinstance(time_series, dict) and \
            not check_model_fields(time_series):
        # if the fields used by the time series are not
        # available, use only ID to retrieve it again
        time_series = get_time_series_id( \
            time_series)
        self.resource_id = time_series
    if not (isinstance(time_series, dict) and 'resource' in time_series and
            time_series['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_time_series_id(time_series)
        if self.resource_id is None:
            raise Exception(
                api.error_message(time_series,
                                  resource_type='time_series',
                                  method='get'))
        query_string = ONLY_MODEL
        time_series = retrieve_resource(
            api, self.resource_id, query_string=query_string)
    else:
        self.resource_id = get_time_series_id(time_series)
    if 'object' in time_series and \
            isinstance(time_series['object'], dict):
        time_series = time_series['object']
    try:
        self.input_fields = time_series.get("input_fields", [])
        self._forecast = time_series.get("forecast")
        self.objective_fields = time_series.get(
            "objective_fields", [])
        objective_field = time_series['objective_field'] if \
            time_series.get('objective_field') else \
            time_series['objective_fields']
    except KeyError:
        raise ValueError("Failed to find the time series expected "
                         "JSON structure. Check your arguments.")
    if 'time_series' in time_series and \
            isinstance(time_series['time_series'], dict):
        status = get_status(time_series)
        if 'code' in status and status['code'] == FINISHED:
            time_series_info = time_series['time_series']
            fields = time_series_info.get('fields', {})
            self.fields = fields
            if not self.input_fields:
                self.input_fields = [ \
                    field_id for field_id, _ in
                    sorted(self.fields.items(),
                           key=lambda x: x[1].get("column_number"))]
            self.all_numeric_objectives = time_series_info.get( \
                'all_numeric_objectives')
            self.period = time_series_info.get('period', 1)
            self.ets_models = time_series_info.get('ets_models', {})
            self.error = time_series_info.get('error')
            self.damped_trend = time_series_info.get('damped_trend')
            self.seasonality = time_series_info.get('seasonality')
            self.trend = time_series_info.get('trend')
            self.time_range = time_series_info.get('time_range')
            self.field_parameters = time_series_info.get( \
                'field_parameters', {})
            objective_id = extract_objective(objective_field)
            ModelFields.__init__(
                self, fields, objective_id=objective_id)
        else:
            raise Exception("The time series isn't finished yet")
    else:
        raise Exception("Cannot create the TimeSeries instance."
                        " Could not find the 'time_series' key"
                        " in the resource:\n\n%s" %
                        time_series)
def __init__(self, model, api=None, fields=None):
    """The Model constructor can be given as first argument:
       - a model structure
       - a model id
       - a path to a JSON file containing a model structure
    """
    self.resource_id = None
    self.ids_map = {}
    self.terms = {}
    self.regression = False
    self.boosting = None
    self.class_names = None
    if not hasattr(self, 'tree_class'):
        self.tree_class = Tree
    # the string can be a path to a JSON file
    if isinstance(model, basestring):
        try:
            with open(model) as model_file:
                model = json.load(model_file)
                self.resource_id = get_model_id(model)
                if self.resource_id is None:
                    raise ValueError("The JSON file does not seem"
                                     " to contain a valid BigML model"
                                     " representation.")
        except IOError:
            # if it is not a path, it can be a model id
            self.resource_id = get_model_id(model)
            if self.resource_id is None:
                if model.find('model/') > -1:
                    raise Exception(
                        api.error_message(model,
                                          resource_type='model',
                                          method='get'))
                else:
                    raise IOError("Failed to open the expected JSON file"
                                  " at %s" % model)
        except ValueError:
            raise ValueError("Failed to interpret %s."
                             " JSON file expected." % model)
    # checks whether the information needed for local predictions is in
    # the first argument
    if isinstance(model, dict) and \
            not fields and \
            not check_model_fields(model):
        # if the fields used by the model are not
        # available, use only ID to retrieve it again
        model = get_model_id(model)
        self.resource_id = model
    if not (isinstance(model, dict) and 'resource' in model and
            model['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        if fields is not None and isinstance(fields, dict):
            query_string = EXCLUDE_FIELDS
        else:
            query_string = ONLY_MODEL
        model = retrieve_resource(api, self.resource_id,
                                  query_string=query_string)
    else:
        self.resource_id = get_model_id(model)
    BaseModel.__init__(self, model, api=api, fields=fields)
    if 'object' in model and isinstance(model['object'], dict):
        model = model['object']
    if 'model' in model and isinstance(model['model'], dict):
        status = get_status(model)
        if 'code' in status and status['code'] == FINISHED:
            # boosting models are to be handled using the BoostedTree
            # class
            if model.get("boosted_ensemble"):
                self.boosting = model.get('boosting', False)
            if self.boosting == {}:
                self.boosting = False
            self.regression = \
                not self.boosting and \
                self.fields[self.objective_id]['optype'] == 'numeric' \
                or (self.boosting and \
                self.boosting.get("objective_class") is None)
            if self.boosting:
                self.tree = BoostedTree(
                    model['model']['root'],
                    self.fields,
                    objective_field=self.objective_id)
            else:
                distribution = model['model']['distribution']['training']
                # will store global information in the tree: regression and
                # max_bins number
                tree_info = {'max_bins': 0}
                self.tree = self.tree_class(
                    model['model']['root'],
                    self.fields,
                    objective_field=self.objective_id,
                    root_distribution=distribution,
                    parent_id=None,
                    ids_map=self.ids_map,
                    tree_info=tree_info)
                self.tree.regression = tree_info['regression']
                if self.tree.regression:
                    try:
                        import numpy
                        import scipy
                        self._max_bins = tree_info['max_bins']
                        self.regression_ready = True
                    except ImportError:
                        self.regression_ready = False
                else:
                    root_dist = self.tree.distribution
                    self.class_names = sorted([category[0]
                                               for category in root_dist])
        else:
            raise Exception("The model isn't finished yet")
    else:
        raise Exception("Cannot create the Model instance. Could not"
                        " find the 'model' key in the resource:\n\n%s" %
                        model)
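# Hedged usage sketch (assumed): once the tree is built, predictions run
# locally with no further API calls. The resource id and the field names
# are hypothetical placeholders.
from bigml.api import BigML
from bigml.model import Model

local_model = Model("model/5126965515526876630001b2", api=BigML())
print(local_model.predict({"petal length": 2.45, "sepal length": 5}))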
def __init__(self, deepnet, api=None):
    """The Deepnet constructor can be given as first argument:
       - a deepnet structure
       - a deepnet id
       - a path to a JSON file containing a deepnet structure
    """
    self.resource_id = None
    self.regression = False
    self.network = None
    self.networks = None
    self.input_fields = []
    self.class_names = []
    self.preprocess = []
    self.optimizer = None
    self.missing_numerics = False
    # the string can be a path to a JSON file
    if isinstance(deepnet, basestring):
        try:
            with open(deepnet) as deepnet_file:
                deepnet = json.load(deepnet_file)
                self.resource_id = get_deepnet_id(deepnet)
                if self.resource_id is None:
                    raise ValueError("The JSON file does not seem"
                                     " to contain a valid BigML deepnet"
                                     " representation.")
        except IOError:
            # if it is not a path, it can be a deepnet id
            self.resource_id = get_deepnet_id(deepnet)
            if self.resource_id is None:
                if deepnet.find('deepnet/') > -1:
                    raise Exception(
                        api.error_message(deepnet,
                                          resource_type='deepnet',
                                          method='get'))
                else:
                    raise IOError("Failed to open the expected JSON file"
                                  " at %s" % deepnet)
        except ValueError:
            raise ValueError("Failed to interpret %s."
                             " JSON file expected." % deepnet)
    # checks whether the information needed for local predictions is in
    # the first argument
    if isinstance(deepnet, dict) and \
            not check_model_fields(deepnet):
        # if the fields used by the deepnet are not
        # available, use only ID to retrieve it again
        deepnet = get_deepnet_id(deepnet)
        self.resource_id = deepnet
    if not (isinstance(deepnet, dict) and 'resource' in deepnet and
            deepnet['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        query_string = ONLY_MODEL
        deepnet = retrieve_resource(api, self.resource_id,
                                    query_string=query_string)
    else:
        self.resource_id = get_deepnet_id(deepnet)
    if 'object' in deepnet and isinstance(deepnet['object'], dict):
        deepnet = deepnet['object']
    self.input_fields = deepnet['input_fields']
    if 'deepnet' in deepnet and isinstance(deepnet['deepnet'], dict):
        status = get_status(deepnet)
        objective_field = deepnet['objective_fields']
        deepnet = deepnet['deepnet']
        if 'code' in status and status['code'] == FINISHED:
            self.fields = deepnet['fields']
            ModelFields.__init__(
                self, self.fields,
                objective_id=extract_objective(objective_field),
                terms=True, categories=True)
            self.regression = \
                self.fields[self.objective_id]['optype'] == NUMERIC
            if not self.regression:
                self.class_names = [category for category, _ in \
                    self.fields[self.objective_id][ \
                    'summary']['categories']]
                self.class_names.sort()
            self.missing_numerics = deepnet.get('missing_numerics', False)
            if 'network' in deepnet:
                network = deepnet['network']
                self.network = network
                self.networks = network.get('networks', [])
                self.preprocess = network.get('preprocess')
                self.optimizer = network.get('optimizer', {})
        else:
            raise Exception("The deepnet isn't finished yet")
    else:
        raise Exception("Cannot create the Deepnet instance. Could not"
                        " find the 'deepnet' key in the resource:\n\n%s" %
                        deepnet)
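# Hedged usage sketch (assumed): the local Deepnet evaluates its networks
# offline once built. The resource id and the field names are hypothetical
# placeholders.
from bigml.api import BigML
from bigml.deepnet import Deepnet

local_deepnet = Deepnet("deepnet/5126965515526876630001b2", api=BigML())
print(local_deepnet.predict({"petal length": 2.45, "sepal length": 5}))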
def __init__(self, model, api=None):
    """The Model constructor can be given as first argument:
       - a model structure
       - a model id
       - a path to a JSON file containing a model structure
    """
    self.resource_id = None
    self.ids_map = {}
    self.terms = {}
    # the string can be a path to a JSON file
    if isinstance(model, basestring):
        try:
            with open(model) as model_file:
                model = json.load(model_file)
                self.resource_id = get_model_id(model)
                if self.resource_id is None:
                    raise ValueError(
                        "The JSON file does not seem"
                        " to contain a valid BigML model"
                        " representation."
                    )
        except IOError:
            # if it is not a path, it can be a model id
            self.resource_id = get_model_id(model)
            if self.resource_id is None:
                if model.find("model/") > -1:
                    raise Exception(api.error_message(model, resource_type="model", method="get"))
                else:
                    raise IOError("Failed to open the expected JSON file" " at %s" % model)
        except ValueError:
            raise ValueError("Failed to interpret %s." " JSON file expected." % model)
    # checks whether the information needed for local predictions is in
    # the first argument
    if isinstance(model, dict) and not check_model_fields(model):
        # if the fields used by the model are not
        # available, use only ID to retrieve it again
        model = get_model_id(model)
        self.resource_id = model
    if not (isinstance(model, dict) and "resource" in model and model["resource"] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        query_string = ONLY_MODEL
        model = retrieve_resource(api, self.resource_id, query_string=query_string)
    else:
        self.resource_id = get_model_id(model)
    BaseModel.__init__(self, model, api=api)
    if "object" in model and isinstance(model["object"], dict):
        model = model["object"]
    if "model" in model and isinstance(model["model"], dict):
        status = get_status(model)
        if "code" in status and status["code"] == FINISHED:
            distribution = model["model"]["distribution"]["training"]
            # will store global information in the tree: regression and
            # max_bins number
            tree_info = {"max_bins": 0}
            self.tree = Tree(
                model["model"]["root"],
                self.fields,
                objective_field=self.objective_id,
                root_distribution=distribution,
                parent_id=None,
                ids_map=self.ids_map,
                tree_info=tree_info,
            )
            self.tree.regression = tree_info["regression"]
            if self.tree.regression:
                self._max_bins = tree_info["max_bins"]
        else:
            raise Exception("The model isn't finished yet")
    else:
        raise Exception(
            "Cannot create the Model instance. Could not"
            " find the 'model' key in the resource:\n\n%s" % model
        )
    if self.tree.regression:
        try:
            import numpy
            import scipy

            self.regression_ready = True
        except ImportError:
            self.regression_ready = False
def __init__(self, model, api=None):
    """The Model constructor can be given as first argument:
       - a model structure
       - a model id
       - a path to a JSON file containing a model structure
    """
    self.resource_id = None
    self.ids_map = {}
    self.terms = {}
    # the string can be a path to a JSON file
    if isinstance(model, basestring):
        try:
            with open(model) as model_file:
                model = json.load(model_file)
                self.resource_id = get_model_id(model)
                if self.resource_id is None:
                    raise ValueError("The JSON file does not seem"
                                     " to contain a valid BigML model"
                                     " representation.")
        except IOError:
            # if it is not a path, it can be a model id
            self.resource_id = get_model_id(model)
            if self.resource_id is None:
                if model.find('model/') > -1:
                    raise Exception(
                        api.error_message(model,
                                          resource_type='model',
                                          method='get'))
                else:
                    raise IOError("Failed to open the expected JSON file"
                                  " at %s" % model)
        except ValueError:
            raise ValueError("Failed to interpret %s."
                             " JSON file expected." % model)
    if not (isinstance(model, dict) and 'resource' in model and
            model['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        query_string = ONLY_MODEL
        model = retrieve_resource(api, self.resource_id,
                                  query_string=query_string)
    else:
        self.resource_id = get_model_id(model)
    BaseModel.__init__(self, model, api=api)
    if 'object' in model and isinstance(model['object'], dict):
        model = model['object']
    if 'model' in model and isinstance(model['model'], dict):
        status = get_status(model)
        if 'code' in status and status['code'] == FINISHED:
            distribution = model['model']['distribution']['training']
            self.tree = Tree(
                model['model']['root'],
                self.fields,
                objective_field=self.objective_id,
                root_distribution=distribution,
                parent_id=None,
                ids_map=self.ids_map)
        else:
            raise Exception("The model isn't finished yet")
    else:
        raise Exception("Cannot create the Model instance. Could not"
                        " find the 'model' key in the resource:\n\n%s" %
                        model)
    if self.tree.regression:
        try:
            import numpy
            import scipy
            self.regression_ready = True
        except ImportError:
            self.regression_ready = False
def __init__(self, anomaly, api=None):

    self.resource_id = None
    self.sample_size = None
    self.input_fields = None
    self.mean_depth = None
    self.expected_mean_depth = None
    self.iforest = None
    self.top_anomalies = None
    if not (isinstance(anomaly, dict) and 'resource' in anomaly and
            anomaly['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_anomaly_id(anomaly)
        if self.resource_id is None:
            raise Exception(
                api.error_message(anomaly,
                                  resource_type='anomaly',
                                  method='get'))
        query_string = ONLY_MODEL
        anomaly = retrieve_resource(api, self.resource_id,
                                    query_string=query_string)
    else:
        self.resource_id = get_anomaly_id(anomaly)
    if 'object' in anomaly and isinstance(anomaly['object'], dict):
        anomaly = anomaly['object']
        self.sample_size = anomaly.get('sample_size')
        self.input_fields = anomaly.get('input_fields')
    if 'model' in anomaly and isinstance(anomaly['model'], dict):
        ModelFields.__init__(self, anomaly['model'].get('fields'))
        if ('top_anomalies' in anomaly['model'] and
                isinstance(anomaly['model']['top_anomalies'], list)):
            self.mean_depth = anomaly['model'].get('mean_depth')
            status = get_status(anomaly)
            if 'code' in status and status['code'] == FINISHED:
                self.expected_mean_depth = None
                if self.mean_depth is None or self.sample_size is None:
                    raise Exception("The anomaly data is not complete."
                                    " Score will not be available.")
                else:
                    # isolation forest normalization: 0.5772156649 is the
                    # Euler-Mascheroni constant used in the average path
                    # length approximation
                    default_depth = (
                        2 * (0.5772156649 +
                             math.log(self.sample_size - 1) -
                             (float(self.sample_size - 1) /
                              self.sample_size)))
                    self.expected_mean_depth = min(self.mean_depth,
                                                   default_depth)
                iforest = anomaly['model'].get('trees', [])
                if iforest:
                    self.iforest = [
                        AnomalyTree(anomaly_tree['root'], self.fields)
                        for anomaly_tree in iforest]
                self.top_anomalies = anomaly['model']['top_anomalies']
            else:
                raise Exception("The anomaly isn't finished yet")
        else:
            raise Exception("Cannot create the Anomaly instance. Could not"
                            " find the 'top_anomalies' key in the"
                            " resource:\n\n%s" % anomaly['model'].keys())
def __init__(self, logistic_regression, api=None):

    self.resource_id = None
    self.input_fields = []
    self.term_forms = {}
    self.tag_clouds = {}
    self.term_analysis = {}
    self.items = {}
    self.item_analysis = {}
    self.categories = {}
    self.coefficients = {}
    self.data_field_types = {}
    self.numeric_fields = {}
    self.bias = None
    self.missing_numerics = None
    self.c = None
    self.eps = None
    self.lr_normalize = None
    self.regularization = None
    if not (isinstance(logistic_regression, dict) and
            'resource' in logistic_regression and
            logistic_regression['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        self.resource_id = get_logistic_regression_id(logistic_regression)
        if self.resource_id is None:
            raise Exception(
                api.error_message(logistic_regression,
                                  resource_type='logistic_regression',
                                  method='get'))
        query_string = ONLY_MODEL
        logistic_regression = retrieve_resource(
            api, self.resource_id, query_string=query_string)
    else:
        self.resource_id = get_logistic_regression_id(logistic_regression)
    if 'object' in logistic_regression and \
            isinstance(logistic_regression['object'], dict):
        logistic_regression = logistic_regression['object']
    try:
        self.input_fields = logistic_regression.get("input_fields", [])
        self.dataset_field_types = logistic_regression.get(
            "dataset_field_types", {})
        objective_field = logistic_regression['objective_fields'] if \
            logistic_regression['objective_fields'] else \
            logistic_regression['objective_field']
    except KeyError:
        raise ValueError("Failed to find the logistic regression expected "
                         "JSON structure. Check your arguments.")
    if 'logistic_regression' in logistic_regression and \
            isinstance(logistic_regression['logistic_regression'], dict):
        status = get_status(logistic_regression)
        if 'code' in status and status['code'] == FINISHED:
            logistic_regression_info = logistic_regression[ \
                'logistic_regression']
            fields = logistic_regression_info.get('fields', {})
            if not self.input_fields:
                # `fields` is used here: `self.fields` is only set later
                # by ModelFields.__init__
                self.input_fields = [ \
                    field_id for field_id, _ in
                    sorted(fields.items(),
                           key=lambda x: x[1].get("column_number"))]
            self.coefficients.update(logistic_regression_info.get( \
                'coefficients', []))
            self.bias = logistic_regression_info.get('bias', 0)
            self.c = logistic_regression_info.get('c')
            self.eps = logistic_regression_info.get('eps')
            self.lr_normalize = logistic_regression_info.get('normalize')
            self.regularization = logistic_regression_info.get( \
                'regularization')
            # old models have no such attribute, so we set it to False in
            # this case
            self.missing_numerics = logistic_regression_info.get( \
                'missing_numerics', False)
            objective_id = extract_objective(objective_field)
            for field_id, field in fields.items():
                if field['optype'] == 'text':
                    self.term_forms[field_id] = {}
                    self.term_forms[field_id].update(
                        field['summary']['term_forms'])
                    self.tag_clouds[field_id] = [
                        tag for [tag, _] in field['summary']['tag_cloud']]
                    self.term_analysis[field_id] = {}
                    self.term_analysis[field_id].update(
                        field['term_analysis'])
                if field['optype'] == 'items':
                    self.items[field_id] = [
                        item for item, _ in field['summary']['items']]
                    self.item_analysis[field_id] = {}
                    self.item_analysis[field_id].update(
                        field['item_analysis'])
                if field['optype'] == 'categorical':
                    self.categories[field_id] = [
                        category for [category, _] in
                        field['summary']['categories']]
                if self.missing_numerics and field['optype'] == 'numeric':
                    self.numeric_fields[field_id] = True
            ModelFields.__init__(
                self, fields, objective_id=objective_id)
            self.map_coefficients()
        else:
            raise Exception("The logistic regression isn't finished yet")
    else:
        raise Exception("Cannot create the LogisticRegression instance."
                        " Could not find the 'logistic_regression' key"
                        " in the resource:\n\n%s" % logistic_regression)
def compute_output(api, args): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ source = None dataset = None model = None models = None fields = None other_label = OTHER ensemble_ids = [] multi_label_data = None multi_label_fields = [] #local_ensemble = None test_dataset = None datasets = None # variables from command-line options resume = args.resume_ model_ids = args.model_ids_ output = args.output dataset_fields = args.dataset_fields_ check_args_coherence(args) path = u.check_dir(output) session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # labels to be used in multi-label expansion labels = (None if args.labels is None else [ label.strip() for label in args.labels.split(args.args_separator) ]) if labels is not None: labels = sorted([label for label in labels]) # multi_label file must be preprocessed to obtain a new extended file if args.multi_label and args.training_set is not None: (args.training_set, multi_label_data) = ps.multi_label_expansion( args.training_set, args.train_header, args, path, labels=labels, session_file=session_file) args.train_header = True args.objective_field = multi_label_data["objective_name"] all_labels = l.get_all_labels(multi_label_data) if not labels: labels = all_labels else: all_labels = labels if args.objective_field: csv_properties.update({'objective_field': args.objective_field}) if args.source_file: # source is retrieved from the contents of the given local JSON file source, csv_properties, fields = u.read_local_resource( args.source_file, csv_properties=csv_properties) else: # source is retrieved from the remote object source, resume, csv_properties, fields = ps.source_processing( api, args, resume, csv_properties=csv_properties, multi_label_data=multi_label_data, session_file=session_file, path=path, log=log) if source is not None: args.source = bigml.api.get_source_id(source) if args.multi_label and source: multi_label_data = l.get_multi_label_data(source) (args.objective_field, labels, all_labels, multi_label_fields) = l.multi_label_sync(args.objective_field, labels, multi_label_data, fields, multi_label_fields) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) if args.dataset_file: # dataset is retrieved from the contents of the given local JSON file model_dataset, csv_properties, fields = u.read_local_resource( args.dataset_file, csv_properties=csv_properties) if not args.datasets: datasets = [model_dataset] dataset = model_dataset else: datasets = u.read_datasets(args.datasets) if not datasets: # dataset is retrieved from the remote object datasets, resume, csv_properties, fields = pd.dataset_processing( source, api, args, resume, fields=fields, csv_properties=csv_properties, multi_label_data=multi_label_data, session_file=session_file, path=path, log=log) if datasets: dataset = datasets[0] if args.to_csv is not None: resume = pd.export_dataset(dataset, api, args, resume, session_file=session_file, path=path) # Now we have a dataset, let's check if there's an objective_field # given by the user and update it in the fields structure args.objective_id_ = get_objective_id(args, fields) # If test_split is used, split the dataset in a training and a test dataset # according to the given split 
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args, api, resume,
                    session_file=session_file, path=path, log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    if args.new_fields or (args.sample_rate != 1 and args.no_model) or \
            (args.lisp_filter or args.json_filter) and not has_source(args):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
            args.objective_id_ = get_objective_id(args, fields)
            args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({'objective_field': args.objective_name_,
                               'objective_field_present': True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)

    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label', other_label)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    if args.model_file:
        # model is retrieved from the contents of the given local JSON file
        model, csv_properties, fields = u.read_local_resource(
            args.model_file, csv_properties=csv_properties)
        models = [model]
        model_ids = [model['resource']]
        ensemble_ids = []
    elif args.ensemble_file:
        # model is retrieved from the contents of the given local JSON file
        ensemble, csv_properties, fields = u.read_local_resource(
            args.ensemble_file, csv_properties=csv_properties)
        model_ids = ensemble['object']['models'][:]
        ensemble_ids = [ensemble['resource']]
        models = model_ids[:]
        model = retrieve_resource(args.retrieve_api_, models[0],
                                  query_string=r.ALL_FIELDS_QS)
        models[0] = model
    else:
        # model is retrieved from the remote object
        models, model_ids, ensemble_ids, resume = pm.models_processing(
            datasets, models, model_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log,
            labels=labels, multi_label_data=multi_label_data,
            other_label=other_label)

    if models:
        model = models[0]
        single_model = len(models) == 1

    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if ensemble_ids and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = rens.get_ensemble(ensemble_id, api=api,
                                         verbosity=args.verbosity,
                                         session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
            if not args.evaluate and not a.has_train(args) and \
                    not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif not args.test_header:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS)
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
            models[0] = model
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(rmod.set_publish_model_args(args))
            if model_args:
                model = rmod.update_model(model, model_args, args,
                                          api=api, path=path,
                                          session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and (a.has_test(args) or
                                        args.export_fields):
        # if we are using boosted ensembles to predict, activate boosting
        if model['object'].get('boosted_ensemble'):
            args.boosting = True
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if ensemble_ids:
                ensemble_id = ensemble_ids[0]
                args.ensemble_ids_ = ensemble_ids
            else:
                ensemble_id = get_ensemble_id(model)
        fields = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data)
        # Free memory after getting fields
        # local_ensemble = None
        gc.collect()

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)

    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label', other_label)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if (models and (a.has_test(args) or (test_dataset and args.remote))
            and not args.evaluate):
        models_per_label = 1
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            test_set = ps.multi_label_expansion(
                args.test_set, args.test_header, args, path,
                labels=labels, session_file=session_file,
                input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and args.method != COMBINATION):
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, session_file=session_file,
                    path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = rds.set_basic_dataset_args(args,
                                                          name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)

            if args.to_dataset and args.dataset_off:
                model = api.check_resource(model['resource'],
                                           query_string=r.ALL_FIELDS_QS)
                model_fields = Fields(model)
                objective_field_name = model_fields.field_name(
                    model_fields.objective_field)
                if objective_field_name in test_fields.fields_by_name.keys():
                    args.prediction_name = "%s (predicted)" % \
                        objective_field_name
            batch_prediction_args = rbp.set_batch_prediction_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_predict(model, test_dataset, batch_prediction_args,
                           args, api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and ensemble_ids
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                models_per_label = len(models) / len(ensemble_ids)
            predict(models, fields, args, api=api, log=log, resume=resume,
                    session_file=session_file, labels=labels,
                    models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if args.votes_files_:
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', args.votes_files_[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(args.votes_files_, local_model.to_prediction,
                      output, method=args.method)
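
# --- Example: recovering a model id from a votes file name ------------------
# A minimal sketch, not part of BigMLer itself, illustrating the re.sub()
# call in the combine_votes branch above. The votes file path is a
# hypothetical placeholder that follows the "model_<24 hex chars>__
# predictions.csv" naming the regular expression expects.
import re

votes_file = "./dir1/model_532db2b637203f3f1a000000__predictions.csv"
model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                  r'\1', votes_file).replace("_", "/")
# The underscore in the captured "model_..." token becomes a slash,
# yielding the canonical resource id: "model/532db2b637203f3f1a000000"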
def compute_output(api, args):
    """Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the
    `test_set`.

    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    # local_ensemble = None
    test_dataset = None
    datasets = None

    # variables from command-line options
    resume = args.resume_
    model_ids = args.model_ids_
    output = args.predictions
    dataset_fields = args.dataset_fields_

    check_args_coherence(args)

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = None if args.labels is None else [
        label.strip() for label in args.labels.split(args.args_separator)]
    if labels is not None:
        labels = sorted([label for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set, args.train_header, args, path,
            labels=labels, session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({"objective_field": args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file, csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume, csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data,
             fields, multi_label_fields)
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume, fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)

    # Now we have a dataset, let's check if there's an objective_field
    # given by the user and update it in the fields structure
    args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args, api, resume,
                    session_file=session_file, path=path, log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    if args.new_fields or (args.sample_rate != 1 and args.no_model) or \
            (args.lisp_filter or args.json_filter) and not has_source(args):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
            args.objective_id_ = get_objective_id(args, fields)
            args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({"objective_field": args.objective_name_,
                               "objective_field_present": True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data,
             fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, "max_categories",
                                           args.max_categories)
        other_label = get_metadata(dataset, "other_label", other_label)

    if args.model_file:
        # model is retrieved from the contents of the given local JSON file
        model, csv_properties, fields = u.read_local_resource(
            args.model_file, csv_properties=csv_properties)
        models = [model]
        model_ids = [model["resource"]]
        ensemble_ids = []
    elif args.ensemble_file:
        # model is retrieved from the contents of the given local JSON file
        ensemble, csv_properties, fields = u.read_local_resource(
            args.ensemble_file, csv_properties=csv_properties)
        model_ids = ensemble["object"]["models"][:]
        ensemble_ids = [ensemble["resource"]]
        models = model_ids[:]
        model = retrieve_resource(bigml.api.BigML(storage="./storage"),
                                  models[0], query_string=r.ALL_FIELDS_QS)
        models[0] = model
    else:
        # model is retrieved from the remote object
        models, model_ids, ensemble_ids, resume = pm.models_processing(
            datasets, models, model_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log,
            labels=labels, multi_label_data=multi_label_data,
            other_label=other_label)

    if models:
        model = models[0]
        single_model = len(models) == 1

    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)["code"] != bigml.api.FINISHED):
            if not args.evaluate and not a.has_train(args):
                query_string = MINIMUM_MODEL
            elif not args.test_header:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS)
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
            models[0] = model
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args, api=api,
                                       path=path, session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and args.test_set:
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
        fields = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data)
        # Free memory after getting fields
        # local_ensemble = None
        gc.collect()

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data,
             fields, multi_label_fields)

    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, "max_categories",
                                           args.max_categories)
        other_label = get_metadata(model, "other_label", other_label)

    # If predicting
    if (models and (a.has_test(args) or (test_dataset and args.remote))
            and not args.evaluate):
        models_per_label = 1
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            test_set = ps.multi_label_expansion(
                args.test_set, args.test_header, args, path,
                labels=labels, session_file=session_file,
                input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and args.method not in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, session_file=session_file,
                    path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_predict(model, test_dataset, batch_prediction_args,
                           args, api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                models_per_label = len(models) / len(ensemble_ids)
            predict(models, fields, args, api=api, log=log, resume=resume,
                    session_file=session_file, labels=labels,
                    models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if args.votes_files_:
        model_id = re.sub(r".*(model_[a-f0-9]{24})__predictions\.csv$",
                          r"\1", args.votes_files_[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(args.votes_files_, local_model.to_prediction,
                      output, method=args.method)
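
# --- Example: remote batch vs. local prediction branching -------------------
# A minimal sketch, not part of BigMLer, that mirrors the condition guarding
# the remote-prediction branch above: batch (remote) predictions are used
# unless --no-batch, multi-label predictions or a threshold/combination vote
# method force the local path. THRESHOLD_CODE and COMBINATION stand for the
# combiner codes imported by the real module; the numeric values used here
# are hypothetical placeholders.
THRESHOLD_CODE = 3
COMBINATION = 4

def uses_remote_batch(remote, no_batch, multi_label, method):
    """True when predictions should run as a remote batch prediction."""
    return (remote and not no_batch and not multi_label
            and method not in [THRESHOLD_CODE, COMBINATION])

assert uses_remote_batch(True, False, False, 0)
assert not uses_remote_batch(True, True, False, 0)    # --no-batch set
assert not uses_remote_batch(True, False, True, 0)    # multi-label case
assert not uses_remote_batch(True, False, False, COMBINATION)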
def __init__(self, model, api=None):
    """The Model constructor can be given as first argument:
        - a model structure
        - a model id
        - a path to a JSON file containing a model structure
    """
    self.resource_id = None
    self.ids_map = {}
    self.terms = {}
    # the string can be a path to a JSON file
    if isinstance(model, basestring):
        try:
            with open(model) as model_file:
                model = json.load(model_file)
            self.resource_id = get_model_id(model)
            if self.resource_id is None:
                raise ValueError("The JSON file does not seem"
                                 " to contain a valid BigML model"
                                 " representation.")
        except IOError:
            # if it is not a path, it can be a model id
            self.resource_id = get_model_id(model)
            if self.resource_id is None:
                if model.find('model/') > -1:
                    # api may still be None at this point; build a default
                    # connection, as done below, before using it
                    if api is None:
                        api = BigML(storage=STORAGE)
                    raise Exception(
                        api.error_message(model,
                                          resource_type='model',
                                          method='get'))
                else:
                    raise IOError("Failed to open the expected JSON file"
                                  " at %s" % model)
        except ValueError:
            raise ValueError("Failed to interpret %s."
                             " JSON file expected." % model)

    # checks whether the information needed for local predictions is in
    # the first argument
    if isinstance(model, dict) and \
            not check_model_fields(model):
        # if the fields used by the model are not
        # available, use only ID to retrieve it again
        model = get_model_id(model)
        self.resource_id = model
    if not (isinstance(model, dict) and 'resource' in model and
            model['resource'] is not None):
        if api is None:
            api = BigML(storage=STORAGE)
        query_string = ONLY_MODEL
        model = retrieve_resource(api, self.resource_id,
                                  query_string=query_string)
    else:
        self.resource_id = get_model_id(model)
    BaseModel.__init__(self, model, api=api)
    if 'object' in model and isinstance(model['object'], dict):
        model = model['object']

    if 'model' in model and isinstance(model['model'], dict):
        status = get_status(model)
        if 'code' in status and status['code'] == FINISHED:
            distribution = model['model']['distribution']['training']
            # will store global information in the tree: regression and
            # max_bins number
            tree_info = {'max_bins': 0}
            self.tree = Tree(
                model['model']['root'],
                self.fields,
                objective_field=self.objective_id,
                root_distribution=distribution,
                parent_id=None,
                ids_map=self.ids_map,
                tree_info=tree_info)
            self.tree.regression = tree_info['regression']
            if self.tree.regression:
                self._max_bins = tree_info['max_bins']
        else:
            raise Exception("The model isn't finished yet")
    else:
        raise Exception("Cannot create the Model instance. Could not"
                        " find the 'model' key in the resource:\n\n%s" %
                        model)
    if self.tree.regression:
        # numpy and scipy are only needed for regression models; record
        # their availability instead of failing at import time
        try:
            import numpy
            import scipy
            self.regression_ready = True
        except ImportError:
            self.regression_ready = False
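
# --- Example: typical use of the local Model class --------------------------
# A minimal sketch of how this constructor is meant to be used; the model id
# and the input datum below are placeholders. Once the model structure is
# cached locally (or fetched once through the api connection), predictions
# run without further calls to bigml.com.
from bigml.api import BigML
from bigml.model import Model

api = BigML()  # credentials read from BIGML_USERNAME / BIGML_API_KEY
local_model = Model('model/532db2b637203f3f1a000000', api=api)
prediction = local_model.predict({'petal length': 4.2})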