def main(args):
    """Best-first feature subset selection over BigML cross-validation.

    Builds one train/test dataset pair per cross-validation fold, then
    runs a best-first search over feature subsets, scoring each unseen
    candidate with ``evaluate`` until the best score has been stale for
    ``args.staleness`` expansions.

    args: parsed command-line namespace providing username, apikey,
          filename, nfolds, tag, objective_field, sequential, penalty
          and staleness.
    """
    print('initialize BigML API')
    if args.username and args.apikey:
        api = BigML(args.username, args.apikey)
    else:
        api = BigML()

    print('generate cross validation splits')
    cv_files = generate_cross_validation(args.filename, args.nfolds)

    cv_datasets = []
    params = {'tags': [args.tag]}
    if args.objective_field >= 0:
        # BigML identifies fields by zero-padded hex column ids
        params['objective_field'] = {'id': '%06x' % args.objective_field}

    for (train_file, test_file) in cv_files:
        if args.sequential:
            # wait for the train source/dataset before creating the test pair
            train_source = api.create_source(train_file, params)
            train_dataset = api.create_dataset(train_source, params)
            if api.ok(train_dataset):
                test_source = api.create_source(test_file, params)
                test_dataset = api.create_dataset(test_source, params)
        else:
            # upload sources in parallel and create datasets in parallel
            train_source = api.create_source(train_file, params)
            test_source = api.create_source(test_file, params)
            train_dataset = api.create_dataset(train_source, params)
            test_dataset = api.create_dataset(test_source, params)
        cv_datasets.append((train_dataset, test_dataset))

    # don't pass the objective field on to the models; pop() with a
    # default avoids the KeyError the bare del raised whenever no
    # objective field had been set above (args.objective_field < 0)
    params.pop('objective_field', None)

    # wait for dataset creation to finish so we can find out the number
    # of features
    dataset_res = api.check_resource(cv_datasets[0][0], api.get_dataset)
    dataset_obj = dataset_res['object']

    # initial feature set: every field but the objective, all disabled;
    # list() keeps .remove() working on Python 3's dict views too
    field_ids = list(dataset_obj['fields'].keys())
    field_ids.remove(dataset_obj['objective_field']['id'])
    initial_state = [False] * len(field_ids)

    # best-first search over the lattice of feature subsets
    done = False
    open_list = [(initial_state, 0)]
    closed_list = []
    best_accuracy = -1
    best_unchanged_count = 0
    while not done:
        (v, fv) = find_max_state(open_list)
        v_ids = [field_ids[i] for (i, val) in enumerate(v) if val]
        print('Max state is: %s\n Accuracy = %f' % (v_ids, fv))
        closed_list.append((v, fv))
        open_list.remove((v, fv))
        if (fv - EPSILON) > best_accuracy:
            best_state = v
            best_accuracy = fv
            best_unchanged_count = 0
            print('new best state')
        else:
            best_unchanged_count += 1

        children = expand_state(v)
        for child in children:
            # only evaluate subsets never seen on either list
            if (child not in [pair[0] for pair in open_list]
                    and child not in [pair[0] for pair in closed_list]):
                input_fields = [fid for (i, fid) in enumerate(field_ids)
                                if child[i]]
                print('Evaluating %s' % input_fields)
                params['input_fields'] = input_fields
                score = evaluate(cv_datasets, params, api,
                                 args.penalty, args.sequential)
                open_list.append((child, score))

        if best_unchanged_count >= args.staleness:
            done = True

    best_features = [field_ids[i] for (i, val) in enumerate(best_state)
                     if val]
    print('The best feature subset is: %s \n Accuracy = %0.2f%%' %
          (best_features, best_accuracy * 100))
    print('Evaluated %d/%d feature subsets' %
          ((len(open_list) + len(closed_list)), 2 ** len(field_ids)))
class Cluster(ModelFields):
    """A lightweight, local wrapper around a BigML cluster model.

    Uses a remote BigML cluster resource to build a local version that
    can compute centroid predictions without further API calls.
    """

    def __init__(self, cluster, api=None):
        """Builds the local cluster from a resource dict or resource id.

        cluster: (dict|str) cluster resource dict or id
        api: (BigML) optional connection; a storage-backed one is
             created when missing
        """
        self.resource_id = None
        self.centroids = None
        self.cluster_global = None
        self.total_ss = None
        self.within_ss = None
        self.between_ss = None
        self.ratio_ss = None
        self.critical_value = None
        self.default_numeric_value = None
        self.k = None
        self.summary_fields = []
        self.scales = {}
        self.term_forms = {}
        self.tag_clouds = {}
        self.term_analysis = {}
        self.item_analysis = {}
        self.items = {}
        self.datasets = {}
        self.api = api if api is not None else BigML(storage=STORAGE)
        # NOTE(review): the (possibly None) `api` argument is forwarded
        # here rather than self.api -- presumably get_resource_dict
        # builds its own connection in that case; confirm.
        self.resource_id, cluster = get_resource_dict(
            cluster, "cluster", api=api)
        if 'object' in cluster and isinstance(cluster['object'], dict):
            cluster = cluster['object']
        if 'clusters' not in cluster or \
                not isinstance(cluster['clusters'], dict):
            raise Exception("Cannot create the Cluster instance. Could not"
                            " find the 'clusters' key in the"
                            " resource:\n\n%s" % cluster)
        status = get_status(cluster)
        if 'code' not in status or status['code'] != FINISHED:
            raise Exception("The cluster isn't finished yet")
        self.default_numeric_value = cluster.get("default_numeric_value")
        self.summary_fields = cluster.get("summary_fields", [])
        self.datasets = cluster.get("cluster_datasets", {})
        clusters_info = cluster['clusters']
        self.centroids = [Centroid(centroid_info)
                          for centroid_info in clusters_info['clusters']]
        self.cluster_global = clusters_info.get('global')
        if self.cluster_global:
            self.cluster_global = Centroid(self.cluster_global)
            # the "global" pseudo-centroid has no "name"/"count": set them
            self.cluster_global.name = GLOBAL_CLUSTER_LABEL
            self.cluster_global.count = \
                self.cluster_global.distance['population']
        self.total_ss = clusters_info.get('total_ss')
        self.within_ss = clusters_info.get('within_ss')
        if not self.within_ss:
            # fall back to the sum of the per-centroid sums of squares
            self.within_ss = sum(centroid.distance['sum_squares']
                                 for centroid in self.centroids)
        self.between_ss = clusters_info.get('between_ss')
        self.ratio_ss = clusters_info.get('ratio_ss')
        self.critical_value = cluster.get('critical_value', None)
        self.k = cluster.get('k')
        self.scales.update(cluster['scales'])
        fields = cluster['clusters']['fields']
        for field_id in cluster['summary_fields']:
            try:
                del fields[field_id]
            except KeyError:
                # clusters retrieved from the API only contain
                # model fields
                pass
        for field_id, field in fields.items():
            if field['optype'] == 'text':
                self.term_forms[field_id] = dict(
                    field['summary']['term_forms'])
                self.tag_clouds[field_id] = dict(
                    field['summary']['tag_cloud'])
                self.term_analysis[field_id] = dict(
                    field['term_analysis'])
            if field['optype'] == 'items':
                self.items[field_id] = dict(field['summary']['items'])
                self.item_analysis[field_id] = dict(
                    field['item_analysis'])
        ModelFields.__init__(self, fields)
        if not all(field_id in self.fields
                   for field_id in self.scales):
            raise Exception("Some fields are missing"
                            " to generate a local cluster."
                            " Please, provide a cluster with"
                            " the complete list of fields.")

    def centroid(self, input_data):
        """Returns id, name and distance of the nearest centroid."""
        clean_input_data, unique_terms = self._prepare_for_distance(
            input_data)
        nearest = {'centroid_id': None,
                   'centroid_name': None,
                   'distance': float('inf')}
        for candidate in self.centroids:
            # distance2 returns None when the running minimum is beaten
            distance2 = candidate.distance2(
                clean_input_data, unique_terms, self.scales,
                stop_distance2=nearest['distance'])
            if distance2 is not None:
                nearest = {'centroid_id': candidate.centroid_id,
                           'centroid_name': candidate.name,
                           'distance': distance2}
        nearest['distance'] = math.sqrt(nearest['distance'])
        return nearest

    @property
    def is_g_means(self):
        """True when the cluster was built using g-means (it then
        carries a critical value)."""
        return self.critical_value is not None

    def fill_numeric_defaults(self, input_data, average="mean"):
        """Fills numeric fields absent from input_data with the summary
        statistic named by ``average`` ("zero" maps to 0).

        Raises ValueError when ``average`` is not an allowed default.
        """
        for field_id, field in self.fields.items():
            if (field_id not in self.summary_fields and
                    field['optype'] == NUMERIC and
                    field_id not in input_data):
                if average not in NUMERIC_DEFAULTS:
                    raise ValueError("The available defaults are: %s" %
                                     ", ".join(NUMERIC_DEFAULTS))
                input_data[field_id] = 0 if average == "zero" \
                    else field['summary'].get(average)
        return input_data

    def get_unique_terms(self, input_data):
        """Replaces the text and items fields in input_data by their
        lists of unique terms and returns those lists per field."""
        unique_terms = {}
        for field_id in self.term_forms:
            if field_id not in input_data:
                continue
            input_data_field = input_data.get(field_id, '')
            if isinstance(input_data_field, basestring):
                case_sensitive = self.term_analysis[field_id].get(
                    'case_sensitive', True)
                token_mode = self.term_analysis[field_id].get(
                    'token_mode', 'all')
                terms = [] if token_mode == TM_FULL_TERM else \
                    parse_terms(input_data_field,
                                case_sensitive=case_sensitive)
                if token_mode != TM_TOKENS:
                    # full-field value counts as a term too
                    terms.append(input_data_field if case_sensitive
                                 else input_data_field.lower())
                unique_terms[field_id] = get_unique_terms(
                    terms, self.term_forms[field_id],
                    self.tag_clouds.get(field_id, []))
            else:
                unique_terms[field_id] = input_data_field
            del input_data[field_id]
        # items fields get the same treatment, split by their separator
        for field_id in self.item_analysis:
            if field_id not in input_data:
                continue
            input_data_field = input_data.get(field_id, '')
            if isinstance(input_data_field, basestring):
                separator = self.item_analysis[field_id].get(
                    'separator', ' ')
                regexp = self.item_analysis[field_id].get(
                    'separator_regexp')
                if regexp is None:
                    regexp = u'%s' % re.escape(separator)
                terms = parse_items(input_data_field, regexp)
                unique_terms[field_id] = get_unique_terms(
                    terms, {}, self.items.get(field_id, []))
            else:
                unique_terms[field_id] = input_data_field
            del input_data[field_id]
        return unique_terms

    def centroids_distance(self, to_centroid):
        """Distance statistics from ``to_centroid`` to the rest of the
        centroids, one [measure, value] pair per measure."""
        unique_terms = self.get_unique_terms(to_centroid.center)
        distances = [math.sqrt(
            centroid.distance2(to_centroid.center, unique_terms,
                               self.scales))
                     for centroid in self.centroids
                     if centroid.centroid_id != to_centroid.centroid_id]
        return [[measure, function(distances)]
                for measure, function in INTERCENTROID_MEASURES]

    def cluster_global_distance(self):
        """Fills the intercentroid distance columns of the CSV report
        with 'N/A' (real distances are deliberately not computed)."""
        return [[measure, 'N/A']
                for measure, _ in INTERCENTROID_MEASURES]

    def _prepare_for_distance(self, input_data):
        """Cleans, fills and casts input_data so distance2 can run;
        returns the clean data plus its unique terms."""
        # keep only the fields the model knows about
        clean_input_data = self.filter_input_data(input_data)
        try:
            self.fill_numeric_defaults(clean_input_data,
                                       self.default_numeric_value)
        except ValueError:
            raise Exception("Missing values in input data. Input"
                            " data must contain values for all "
                            "numeric fields to compute a distance.")
        # strip affixes and cast to the final field types
        cast(clean_input_data, self.fields)
        unique_terms = self.get_unique_terms(clean_input_data)
        return clean_input_data, unique_terms

    def distances2_to_point(self, reference_point, list_of_points):
        """Squared distances from each point to ``reference_point``.

        reference_point: (dict) field values of the reference point
        list_of_points: (dict|Centroid) field values or Centroid
                        objects holding them
        """
        reference_point, _ = self._prepare_for_distance(reference_point)
        # mimic the centroid structure to reuse the distance code
        reference = Centroid({"center": reference_point})
        distances = []
        for point in list_of_points:
            centroid_id = None
            if isinstance(point, Centroid):
                centroid_id = point.centroid_id
                point = point.center
            clean_point, unique_terms = self._prepare_for_distance(point)
            if clean_point == reference_point:
                # the reference itself is skipped
                continue
            result = {"data": point,
                      "distance": reference.distance2(
                          clean_point, unique_terms, self.scales)}
            if centroid_id is not None:
                result["centroid_id"] = centroid_id
            distances.append(result)
        return distances

    def points_in_cluster(self, centroid_id):
        """Returns the data points of one centroid's dataset, creating
        and downloading that dataset when needed."""
        centroid_dataset = self.datasets.get(centroid_id)
        if centroid_dataset in [None, ""]:
            centroid_dataset = self.api.create_dataset(
                self.resource_id, {"centroid": centroid_id})
            self.api.ok(centroid_dataset)
        else:
            centroid_dataset = self.api.check_resource(
                "dataset/%s" % centroid_dataset)
        # download the dataset to compute local predictions
        downloaded_data = self.api.download_dataset(
            centroid_dataset["resource"])
        if PY3:
            downloaded_data = codecs.getreader("utf-8")(downloaded_data)
        return list(csv.DictReader(downloaded_data))

    def closest_in_cluster(self, reference_point,
                           number_of_points=None, centroid_id=None):
        """Data points closest to a reference point, sorted by distance.

        When no centroid_id is given, points come from the cluster the
        reference point itself falls in. ``number_of_points`` truncates
        the result. Returns a dict with the centroid id, the reference
        and the sorted point list.
        """
        if centroid_id is not None and centroid_id not in \
                [centroid.centroid_id for centroid in self.centroids]:
            raise AttributeError(
                "Failed to find the provided centroid_id: %s" %
                centroid_id)
        if centroid_id is None:
            # use the cluster the reference point belongs to
            centroid_id = self.centroid(reference_point)["centroid_id"]
        points = self.points_in_cluster(centroid_id)
        points = self.distances2_to_point(reference_point, points)
        points.sort(key=lambda point: point["distance"])
        if number_of_points is not None:
            points = points[:number_of_points]
        for point in points:
            point["distance"] = math.sqrt(point["distance"])
        return {"centroid_id": centroid_id,
                "reference": reference_point,
                "closest": points}

    def sorted_centroids(self, reference_point):
        """Centroids sorted by their distance to ``reference_point``."""
        close_centroids = self.distances2_to_point(
            reference_point, self.centroids)
        for centroid in close_centroids:
            centroid["distance"] = math.sqrt(centroid["distance"])
            centroid["center"] = centroid.pop("data")
        return {"reference": reference_point,
                "centroids": sorted(close_centroids,
                                    key=lambda x: x["distance"])}

    def centroid_features(self, centroid, field_ids, encode=True):
        """Ordered centroid values for ``field_ids``; strings are utf-8
        encoded when ``encode`` is set."""
        features = []
        for field_id in field_ids:
            value = centroid.center[field_id]
            if encode and isinstance(value, basestring):
                value = value.encode('utf-8')
            features.append(value)
        return features

    def get_data_distribution(self):
        """Training data distribution as [name, count] pairs sorted by
        centroid name."""
        return sorted(([centroid.name, centroid.count]
                       for centroid in self.centroids),
                      key=lambda x: x[0])

    def print_global_distribution(self, out=sys.stdout):
        """Prints the Global: 100% (<total> instances) line."""
        output = u""
        if self.cluster_global:
            output += (u" %s: 100%% (%d instances)\n" % (
                self.cluster_global.name,
                self.cluster_global.count))
        out.write(output)
        out.flush()

    def print_ss_metrics(self, out=sys.stdout):
        """Prints the block of *_ss sums-of-squares metrics."""
        ss_metrics = [
            ("total_ss (Total sum of squares)", self.total_ss),
            ("within_ss (Total within-cluster sum of the sum "
             "of squares)", self.within_ss),
            ("between_ss (Between sum of squares)", self.between_ss),
            ("ratio_ss (Ratio of sum of squares)", self.ratio_ss)]
        output = u""
        for name, value in ss_metrics:
            if value:
                output += (u"%s%s: %5f\n" % (INDENT, name, value))
        out.write(output)
        out.flush()

    def statistics_csv(self, file_name=None):
        """Cluster statistics as CSV rows; returns them when no
        file_name is given, writes them to the file otherwise."""
        rows = []
        field_ids = self.centroids[0].center.keys()
        headers = [u"Centroid_name"]
        headers.extend([u"%s" % self.fields[field_id]["name"]
                        for field_id in field_ids])
        headers.extend([u"Instances"])
        intercentroids = False
        header_complete = False
        for centroid in sorted(self.centroids, key=lambda x: x.name):
            row = [centroid.name]
            row.extend(self.centroid_features(centroid, field_ids,
                                              encode=False))
            row.append(centroid.count)
            if len(self.centroids) > 1:
                for measure, result in self.centroids_distance(centroid):
                    if not intercentroids:
                        headers.append(u"%s intercentroid distance" %
                                       measure.title())
                    row.append(result)
                    intercentroids = True
            for measure, result in centroid.distance.items():
                if measure in CSV_STATISTICS:
                    if not header_complete:
                        headers.append(u"Distance %s" %
                                       measure.lower().replace("_", " "))
                    row.append(result)
            if not header_complete:
                rows.append(headers)
                header_complete = True
            rows.append(row)
        if self.cluster_global:
            row = [u"%s" % self.cluster_global.name]
            row.extend(self.centroid_features(self.cluster_global,
                                              field_ids, encode=False))
            row.append(self.cluster_global.count)
            if len(self.centroids) > 1:
                for measure, result in self.cluster_global_distance():
                    row.append(result)
            for measure, result in self.cluster_global.distance.items():
                if measure in CSV_STATISTICS:
                    row.append(result)
            # the header is already rows[0]; global goes right after it
            rows.insert(1, row)
        if file_name is None:
            return rows
        with UnicodeWriter(file_name) as writer:
            writer.writerows(rows)

    def summarize(self, out=sys.stdout):
        """Prints a human-readable summary of the cluster info."""
        if self.is_g_means:
            report_header = \
                u'G-means Cluster (critical_value=%d)' % self.critical_value
        else:
            report_header = u'K-means Cluster (k=%d)' % self.k
        out.write(report_header + ' with %d centroids\n\n' %
                  len(self.centroids))
        out.write(u"Data distribution:\n")
        # "Global" is printed as the first entry
        self.print_global_distribution(out=out)
        print_distribution(self.get_data_distribution(), out=out)
        out.write(u"\n")
        centroids_list = [self.cluster_global] if self.cluster_global else []
        centroids_list.extend(sorted(self.centroids, key=lambda x: x.name))
        out.write(u"Cluster metrics:\n")
        self.print_ss_metrics(out=out)
        out.write(u"\n")
        out.write(u"Centroids:\n")
        for centroid in centroids_list:
            out.write(utf8(u"\n%s%s: " % (INDENT, centroid.name)))
            connector = ""
            for field_id, value in centroid.center.items():
                if isinstance(value, basestring):
                    value = u"\"%s\"" % value
                out.write(utf8(u"%s%s: %s" %
                               (connector,
                                self.fields[field_id]['name'],
                                value)))
                connector = ", "
        out.write(u"\n\n")
        out.write(u"Distance distribution:\n\n")
        for centroid in centroids_list:
            centroid.print_statistics(out=out)
        out.write(u"\n")
        if len(self.centroids) > 1:
            out.write(u"Intercentroid distance:\n\n")
            centroids_list = (centroids_list[1:] if self.cluster_global
                              else centroids_list)
            for centroid in centroids_list:
                out.write(utf8(u"%sTo centroid: %s\n" %
                               (INDENT, centroid.name)))
                for measure, result in self.centroids_distance(centroid):
                    out.write(u"%s%s: %s\n" %
                              (INDENT * 2, measure, result))
                out.write(u"\n")
class Cluster(ModelFields):
    """A lightweight, local wrapper around a BigML cluster model.

    Uses a remote BigML cluster resource to build a local version that
    can compute centroid predictions without further API calls.
    """

    def __init__(self, cluster, api=None):
        """Builds the local cluster from a resource dict or resource id.

        cluster: (dict|str) cluster resource dict or id
        api: (BigML) optional connection used to (re-)download the
             resource when the given dict lacks the model fields
        """
        self.resource_id = None
        self.centroids = None
        self.cluster_global = None
        self.total_ss = None
        self.within_ss = None
        self.between_ss = None
        self.ratio_ss = None
        self.critical_value = None
        self.default_numeric_value = None
        self.k = None
        self.summary_fields = []
        self.scales = {}
        self.term_forms = {}
        self.tag_clouds = {}
        self.term_analysis = {}
        self.item_analysis = {}
        self.items = {}
        self.datasets = {}
        self.api = api
        # when the dict lacks the fields needed for local predictions,
        # keep only the id so the full resource is downloaded below
        if isinstance(cluster, dict) and not check_model_fields(cluster):
            cluster = get_cluster_id(cluster)
            self.resource_id = cluster
        if not (isinstance(cluster, dict) and 'resource' in cluster and
                cluster['resource'] is not None):
            if api is None:
                api = BigML(storage=STORAGE)
            self.api = api
            self.resource_id = get_cluster_id(cluster)
            if self.resource_id is None:
                raise Exception(api.error_message(cluster,
                                                  resource_type='cluster',
                                                  method='get'))
            cluster = retrieve_resource(api, self.resource_id,
                                        query_string=ONLY_MODEL)
        else:
            self.resource_id = get_cluster_id(cluster)
        if 'object' in cluster and isinstance(cluster['object'], dict):
            cluster = cluster['object']
        if 'clusters' not in cluster or \
                not isinstance(cluster['clusters'], dict):
            raise Exception("Cannot create the Cluster instance. Could not"
                            " find the 'clusters' key in the"
                            " resource:\n\n%s" % cluster)
        status = get_status(cluster)
        if 'code' not in status or status['code'] != FINISHED:
            raise Exception("The cluster isn't finished yet")
        self.default_numeric_value = cluster.get("default_numeric_value")
        self.summary_fields = cluster.get("summary_fields", [])
        self.datasets = cluster.get("cluster_datasets", {})
        clusters_info = cluster['clusters']
        self.centroids = [Centroid(centroid_info)
                          for centroid_info in clusters_info['clusters']]
        self.cluster_global = clusters_info.get('global')
        if self.cluster_global:
            self.cluster_global = Centroid(self.cluster_global)
            # the "global" pseudo-centroid has no "name"/"count": set them
            self.cluster_global.name = GLOBAL_CLUSTER_LABEL
            self.cluster_global.count = \
                self.cluster_global.distance['population']
        self.total_ss = clusters_info.get('total_ss')
        self.within_ss = clusters_info.get('within_ss')
        if not self.within_ss:
            # fall back to the sum of the per-centroid sums of squares
            self.within_ss = sum(centroid.distance['sum_squares']
                                 for centroid in self.centroids)
        self.between_ss = clusters_info.get('between_ss')
        self.ratio_ss = clusters_info.get('ratio_ss')
        self.critical_value = cluster.get('critical_value', None)
        self.k = cluster.get('k')
        self.scales.update(cluster['scales'])
        fields = cluster['clusters']['fields']
        # summary fields are not model inputs
        for field_id in cluster['summary_fields']:
            del fields[field_id]
        for field_id, field in fields.items():
            if field['optype'] == 'text':
                self.term_forms[field_id] = dict(
                    field['summary']['term_forms'])
                self.tag_clouds[field_id] = dict(
                    field['summary']['tag_cloud'])
                self.term_analysis[field_id] = dict(
                    field['term_analysis'])
            if field['optype'] == 'items':
                self.items[field_id] = dict(field['summary']['items'])
                self.item_analysis[field_id] = dict(
                    field['item_analysis'])
        ModelFields.__init__(self, fields)
        if not all(field_id in self.fields
                   for field_id in self.scales):
            raise Exception("Some fields are missing"
                            " to generate a local cluster."
                            " Please, provide a cluster with"
                            " the complete list of fields.")

    def centroid(self, input_data, by_name=True):
        """Returns id, name and distance of the nearest centroid.

        by_name: (boolean) True when input_data is keyed by field name
        """
        clean_input_data, unique_terms = self._prepare_for_distance(
            input_data, by_name=by_name)
        nearest = {'centroid_id': None,
                   'centroid_name': None,
                   'distance': float('inf')}
        for candidate in self.centroids:
            # distance2 returns None when the running minimum is beaten
            distance2 = candidate.distance2(
                clean_input_data, unique_terms, self.scales,
                stop_distance2=nearest['distance'])
            if distance2 is not None:
                nearest = {'centroid_id': candidate.centroid_id,
                           'centroid_name': candidate.name,
                           'distance': distance2}
        nearest['distance'] = math.sqrt(nearest['distance'])
        return nearest

    @property
    def is_g_means(self):
        """True when the cluster was built using g-means (it then
        carries a critical value)."""
        return self.critical_value is not None

    def fill_numeric_defaults(self, input_data, average="mean"):
        """Fills non-optional fields absent from input_data with the
        summary statistic named by ``average`` ("zero" maps to 0).

        Raises ValueError when ``average`` is not an allowed default.
        """
        for field_id, field in self.fields.items():
            if (field_id not in self.summary_fields and
                    field['optype'] not in OPTIONAL_FIELDS and
                    field_id not in input_data):
                if average not in NUMERIC_DEFAULTS:
                    raise ValueError("The available defaults are: %s" %
                                     ", ".join(NUMERIC_DEFAULTS))
                input_data[field_id] = 0 if average == "zero" \
                    else field['summary'].get(average)
        return input_data

    def get_unique_terms(self, input_data):
        """Replaces the text and items fields in input_data by their
        lists of unique terms and returns those lists per field."""
        unique_terms = {}
        for field_id in self.term_forms:
            if field_id not in input_data:
                continue
            input_data_field = input_data.get(field_id, '')
            if isinstance(input_data_field, basestring):
                case_sensitive = self.term_analysis[field_id].get(
                    'case_sensitive', True)
                token_mode = self.term_analysis[field_id].get(
                    'token_mode', 'all')
                terms = [] if token_mode == TM_FULL_TERM else \
                    parse_terms(input_data_field,
                                case_sensitive=case_sensitive)
                if token_mode != TM_TOKENS:
                    # the full-field value counts as a term too
                    terms.append(input_data_field if case_sensitive
                                 else input_data_field.lower())
                unique_terms[field_id] = get_unique_terms(
                    terms, self.term_forms[field_id],
                    self.tag_clouds.get(field_id, []))
            else:
                unique_terms[field_id] = input_data_field
            del input_data[field_id]
        # items fields get the same treatment, split by their separator
        for field_id in self.item_analysis:
            if field_id not in input_data:
                continue
            input_data_field = input_data.get(field_id, '')
            if isinstance(input_data_field, basestring):
                separator = self.item_analysis[field_id].get(
                    'separator', ' ')
                regexp = self.item_analysis[field_id].get(
                    'separator_regexp')
                if regexp is None:
                    regexp = u'%s' % re.escape(separator)
                terms = parse_items(input_data_field, regexp)
                unique_terms[field_id] = get_unique_terms(
                    terms, {}, self.items.get(field_id, []))
            else:
                unique_terms[field_id] = input_data_field
            del input_data[field_id]
        return unique_terms

    def centroids_distance(self, to_centroid):
        """Distance statistics from ``to_centroid`` to the rest of the
        centroids, one [measure, value] pair per measure."""
        unique_terms = self.get_unique_terms(to_centroid.center)
        distances = [math.sqrt(
            centroid.distance2(to_centroid.center, unique_terms,
                               self.scales))
                     for centroid in self.centroids
                     if centroid.centroid_id != to_centroid.centroid_id]
        return [[measure, function(distances)]
                for measure, function in INTERCENTROID_MEASURES]

    def cluster_global_distance(self):
        """Fills the intercentroid distance columns of the CSV report
        with 'N/A' (real distances are deliberately not computed)."""
        return [[measure, 'N/A']
                for measure, _ in INTERCENTROID_MEASURES]

    def _prepare_for_distance(self, input_data, by_name=True):
        """Cleans, fills and casts input_data so distance2 can run;
        returns the clean data plus its unique terms."""
        # keep only the fields the model knows about
        clean_input_data = self.filter_input_data(input_data,
                                                  by_name=by_name)
        try:
            self.fill_numeric_defaults(clean_input_data,
                                       self.default_numeric_value)
        except ValueError:
            raise Exception("Missing values in input data. Input"
                            " data must contain values for all "
                            "numeric fields to compute a distance.")
        # strip affixes and cast to the final field types
        cast(clean_input_data, self.fields)
        unique_terms = self.get_unique_terms(clean_input_data)
        return clean_input_data, unique_terms

    def distances2_to_point(self, reference_point, list_of_points,
                            by_name=True):
        """Squared distances from each point to ``reference_point``.

        reference_point: (dict) field values of the reference point
        list_of_points: (dict|Centroid) field values or Centroid
                        objects holding them
        by_name: (boolean) True when dicts are keyed by field name,
                 False when keyed by field id
        """
        reference_point, _ = self._prepare_for_distance(
            reference_point, by_name=by_name)
        # mimic the centroid structure to reuse the distance code
        reference = Centroid({"center": reference_point})
        distances = []
        for point in list_of_points:
            centroid_id = None
            if isinstance(point, Centroid):
                centroid_id = point.centroid_id
                point = point.center
                # centroid centers are keyed by field id
                by_name = False
            clean_point, unique_terms = self._prepare_for_distance(
                point, by_name=by_name)
            if clean_point != reference_point:
                result = {"data": point,
                          "distance": reference.distance2(
                              clean_point, unique_terms, self.scales)}
                if centroid_id is not None:
                    result["centroid_id"] = centroid_id
                distances.append(result)
        return distances

    def points_in_cluster(self, centroid_id):
        """Returns the data points of one centroid's dataset, creating
        and downloading that dataset when needed."""
        centroid_dataset = self.datasets.get(centroid_id)
        if self.api is None:
            self.api = BigML(storage=STORAGE)
        if centroid_dataset in [None, ""]:
            centroid_dataset = self.api.create_dataset(
                self.resource_id, {"centroid": centroid_id})
            self.api.ok(centroid_dataset)
        else:
            centroid_dataset = self.api.check_resource(
                "dataset/%s" % centroid_dataset)
        # download the dataset to compute local predictions
        downloaded_data = self.api.download_dataset(
            centroid_dataset["resource"])
        if PY3:
            downloaded_data = codecs.getreader("utf-8")(downloaded_data)
        return list(csv.DictReader(downloaded_data))

    def closest_in_cluster(self, reference_point,
                           number_of_points=None, centroid_id=None,
                           by_name=True):
        """Data points closest to a reference point, sorted by distance.

        When no centroid_id is given, points come from the cluster the
        reference point itself falls in. ``number_of_points`` truncates
        the result. Returns a dict with the centroid id, the reference
        and the sorted point list.
        """
        if centroid_id is not None and centroid_id not in \
                [centroid.centroid_id for centroid in self.centroids]:
            raise AttributeError(
                "Failed to find the provided centroid_id: %s" %
                centroid_id)
        if centroid_id is None:
            # NOTE(review): by_name is hard-coded to True here instead
            # of forwarding the method's parameter -- confirm intent
            centroid_info = self.centroid(reference_point, by_name=True)
            centroid_id = centroid_info["centroid_id"]
        points = self.points_in_cluster(centroid_id)
        points = self.distances2_to_point(reference_point, points)
        points.sort(key=lambda point: point["distance"])
        if number_of_points is not None:
            points = points[:number_of_points]
        for point in points:
            point["distance"] = math.sqrt(point["distance"])
        return {"centroid_id": centroid_id,
                "reference": reference_point,
                "closest": points}

    def sorted_centroids(self, reference_point, by_name=True):
        """Centroids sorted by their distance to ``reference_point``."""
        close_centroids = self.distances2_to_point(
            reference_point, self.centroids, by_name=by_name)
        for centroid in close_centroids:
            centroid["distance"] = math.sqrt(centroid["distance"])
            centroid["center"] = centroid.pop("data")
        return {"reference": reference_point,
                "centroids": sorted(close_centroids,
                                    key=lambda x: x["distance"])}

    def centroid_features(self, centroid, field_ids):
        """Ordered centroid values for ``field_ids``; strings are
        utf-8 encoded."""
        features = []
        for field_id in field_ids:
            value = centroid.center[field_id]
            if isinstance(value, basestring):
                value = value.encode('utf-8')
            features.append(value)
        return features

    def get_data_distribution(self):
        """Training data distribution as [name, count] pairs sorted by
        centroid name."""
        return sorted(([centroid.name, centroid.count]
                       for centroid in self.centroids),
                      key=lambda x: x[0])

    def print_global_distribution(self, out=sys.stdout):
        """Prints the Global: 100% (<total> instances) line."""
        output = u""
        if self.cluster_global:
            output += (u" %s: 100%% (%d instances)\n" % (
                self.cluster_global.name,
                self.cluster_global.count))
        out.write(output)
        out.flush()

    def print_ss_metrics(self, out=sys.stdout):
        """Prints the block of *_ss sums-of-squares metrics."""
        ss_metrics = [
            ("total_ss (Total sum of squares)", self.total_ss),
            ("within_ss (Total within-cluster sum of the sum "
             "of squares)", self.within_ss),
            ("between_ss (Between sum of squares)", self.between_ss),
            ("ratio_ss (Ratio of sum of squares)", self.ratio_ss)]
        output = u""
        for name, value in ss_metrics:
            if value:
                output += (u"%s%s: %5f\n" % (INDENT, name, value))
        out.write(output)
        out.flush()

    def statistics_csv(self, file_name=None):
        """Cluster statistics as CSV rows; returns them when no
        file_name is given, writes them to the file otherwise."""
        rows = []
        field_ids = self.centroids[0].center.keys()
        headers = [u"Centroid_name"]
        headers.extend([u"%s" % self.fields[field_id]["name"]
                        for field_id in field_ids])
        headers.extend([u"Instances"])
        intercentroids = False
        header_complete = False
        for centroid in sorted(self.centroids, key=lambda x: x.name):
            row = [centroid.name]
            row.extend(self.centroid_features(centroid, field_ids))
            row.append(centroid.count)
            if len(self.centroids) > 1:
                for measure, result in self.centroids_distance(centroid):
                    if not intercentroids:
                        headers.append(u"%s intercentroid distance" %
                                       measure.title())
                    row.append(result)
                    intercentroids = True
            for measure, result in centroid.distance.items():
                if measure in CSV_STATISTICS:
                    if not header_complete:
                        headers.append(u"Distance %s" %
                                       measure.lower().replace("_", " "))
                    row.append(result)
            if not header_complete:
                rows.append(headers)
                header_complete = True
            rows.append(row)
        if self.cluster_global:
            row = [u"%s" % self.cluster_global.name]
            row.extend(self.centroid_features(self.cluster_global,
                                              field_ids))
            row.append(self.cluster_global.count)
            if len(self.centroids) > 1:
                for measure, result in self.cluster_global_distance():
                    row.append(result)
            for measure, result in self.cluster_global.distance.items():
                if measure in CSV_STATISTICS:
                    row.append(result)
            # the header is already rows[0]; global goes right after it
            rows.insert(1, row)
        if file_name is None:
            return rows
        with UnicodeWriter(file_name) as writer:
            for row in rows:
                writer.writerow([item if not isinstance(item, basestring)
                                 else item.encode("utf-8")
                                 for item in row])

    def summarize(self, out=sys.stdout):
        """Prints a human-readable summary of the cluster info."""
        if self.is_g_means:
            report_header = \
                u'G-means Cluster (critical_value=%d)' % self.critical_value
        else:
            report_header = u'K-means Cluster (k=%d)' % self.k
        out.write(report_header + ' with %d centroids\n\n' %
                  len(self.centroids))
        out.write(u"Data distribution:\n")
        # "Global" is printed as the first entry
        self.print_global_distribution(out=out)
        print_distribution(self.get_data_distribution(), out=out)
        out.write(u"\n")
        centroids_list = [self.cluster_global] if self.cluster_global else []
        centroids_list.extend(sorted(self.centroids, key=lambda x: x.name))
        out.write(u"Cluster metrics:\n")
        self.print_ss_metrics(out=out)
        out.write(u"\n")
        out.write(u"Centroids:\n")
        for centroid in centroids_list:
            out.write(utf8(u"\n%s%s: " % (INDENT, centroid.name)))
            connector = ""
            for field_id, value in centroid.center.items():
                if isinstance(value, basestring):
                    value = u"\"%s\"" % value
                out.write(utf8(u"%s%s: %s" %
                               (connector,
                                self.fields[field_id]['name'],
                                value)))
                connector = ", "
        out.write(u"\n\n")
        out.write(u"Distance distribution:\n\n")
        for centroid in centroids_list:
            centroid.print_statistics(out=out)
        out.write(u"\n")
        if len(self.centroids) > 1:
            out.write(u"Intercentroid distance:\n\n")
            centroids_list = (centroids_list[1:] if self.cluster_global
                              else centroids_list)
            for centroid in centroids_list:
                out.write(utf8(u"%sTo centroid: %s\n" %
                               (INDENT, centroid.name)))
                for measure, result in self.centroids_distance(centroid):
                    out.write(u"%s%s: %s\n" %
                              (INDENT * 2, measure, result))
                out.write(u"\n")