def remote_predict_models(models, test_reader, prediction_file, api, args,
                          resume=False, output_path=None, session_file=None,
                          log=None, exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to file

    """
    predictions_files = []
    prediction_args = {
        "tags": args.tag
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(),
                                 debug=args.debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            message_logged = True
            with UnicodeWriter(predictions_file) as predictions_file:
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(model, input_data_dict,
                                                       by_name=test_set_header,
                                                       wait_time=0,
                                                       args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    prediction_row = prediction_to_row(prediction)
                    predictions_file.writerow(prediction_row)
                    if single_model:
                        write_prediction(prediction_row[0:2], prediction_file,
                                         args.prediction_info, input_data,
                                         exclude)
    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files,
                      Model(models[0]).to_prediction,
                      prediction_file, args.method,
                      args.prediction_info, raw_input_data_list, exclude)

def combine_votes(votes_files, to_prediction, to_file, method=0,
                  prediction_info=NORMAL_FORMAT, input_data_list=None,
                  exclude=None):
    """Combines the votes found in the votes' files and stores predictions.

       votes_files: should contain the list of file names
       to_prediction: is the Model method that casts prediction to numeric
                      type if needed
       to_file: is the name of the final output file.
    """
    votes = read_votes(votes_files, to_prediction)

    u.check_dir(to_file)
    with UnicodeWriter(to_file) as output:
        number_of_tests = len(votes)
        if input_data_list is None or len(input_data_list) != number_of_tests:
            input_data_list = None
        for index in range(0, number_of_tests):
            multivote = votes[index]
            input_data = (None if input_data_list is None
                          else input_data_list[index])
            write_prediction(multivote.combine(method, True), output,
                             prediction_info, input_data, exclude)

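# Usage sketch (not part of the module): combining the per-model votes files
# produced by remote_predict_models into a single predictions CSV. The votes
# file names and the model id are hypothetical; `Model.to_prediction` is the
# casting method used above when a single local Model is available.
from bigml.model import Model

votes_files = ["model_50c0de043b563519830001c2__predictions.csv",
               "model_50c0de043b563519830001c3__predictions.csv"]
local_model = Model("model/50c0de043b563519830001c2")
combine_votes(votes_files, local_model.to_prediction,
              "combined_predictions.csv", method=0)
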
def topic_distribution(topic_models, fields, args, session_file=None):
    """Computes a topic distribution for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude, headers = use_prediction_headers(test_reader, fields, args)

        # Local topic distributions: Topic distributions are computed
        # locally using topic models' method
        message = u.dated("Creating local topic distributions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_topic_distribution(topic_models, test_reader, output, args,
                                 exclude=exclude, headers=headers)
    test_reader.close()

def best_first_search(datasets_file, api, args, command_obj,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    features_writer = UnicodeWriter(features_file).open_writer()
    features_header = FEATURES_HEADER
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_PENALTY
    # retrieving the first dataset in the file
    try:
        with open(datasets_file, u.open_mode("r")) as datasets_handler:
            dataset_id = datasets_handler.readline().strip()
    except IOError as exc:
        sys.exit("Could not read the generated datasets file: %s" %
                 str(exc))

def tree_CSV(self, file_name=None, leaves_only=False):
    """Outputs the node structure to a CSV file or array

    """
    headers_names = []
    if self.tree.regression:
        headers_names.append(self.fields[self.tree.objective_id]['name'])
        headers_names.append("error")
        for index in range(0, self._max_bins):
            headers_names.append("bin%s_value" % index)
            headers_names.append("bin%s_instances" % index)
    else:
        headers_names.append(self.fields[self.tree.objective_id]['name'])
        headers_names.append("confidence")
        headers_names.append("impurity")
        for category, _ in self.tree.distribution:
            headers_names.append(category)

    nodes_generator = self.get_nodes_info(headers_names,
                                          leaves_only=leaves_only)
    if file_name is not None:
        with UnicodeWriter(file_name) as writer:
            writer.writerow([header.encode("utf-8")
                             for header in headers_names])
            for row in nodes_generator:
                writer.writerow([item if not isinstance(item, basestring)
                                 else item.encode("utf-8")
                                 for item in row])
    else:
        rows = []
        rows.append(headers_names)
        for row in nodes_generator:
            rows.append(row)
        return rows

def anomaly_score(anomalies, fields, args, session_file=None):
    """Computes an anomaly score for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args)

        # Local anomaly scores: Anomaly scores are computed locally using
        # the local anomaly detector method
        message = u.dated("Creating local anomaly scores.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_anomaly_score(anomalies, test_reader, output, args,
                            exclude=exclude)
    test_reader.close()

def prediction(models, fields, args, session_file=None):
    """Computes a supervised model prediction for each entry in the
       `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args,
            args.objective_field, quality="probability")

        # Local predictions: Predictions are computed locally
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_prediction(models, test_reader, output, args,
                         exclude=exclude)
    test_reader.close()

def sample_file(sample, fields, args, api, path=None, session_file=None):
    """Creates a file for each sample with the sample rows.

    """
    query_string = sample_query_string(args, fields)
    sample = r.get_samples([sample], args, api, session_file=session_file,
                           query_string=query_string)[0][0]
    output = args.predictions
    with UnicodeWriter(output, lineterminator="\n") as output:
        headers = [field['name']
                   for field in sample['object']['sample']['fields']]
        if args.sample_header:
            if args.row_index or args.occurrence:
                new_headers = []
                if args.row_index:
                    new_headers.append("index")
                if args.occurrence:
                    new_headers.append("occurrences")
                new_headers.extend(headers)
                headers = new_headers
            output.writerow(headers)
        for row in sample['object']['sample']['rows']:
            output.writerow(row)
    if args.stat_field or args.stat_fields:
        stat_info = {}
        sample_obj = sample['object']['sample']
        for key in STAT_KEYS:
            if key in sample_obj:
                stat_info[key] = sample_obj[key]
        with open(os.path.join(path, "stat_info.json"), "w") as stat_file:
            json.dump(stat_info, stat_file)

def statistics_csv(self, file_name=None):
    """Clusters statistic information in CSV format

    """
    rows = []
    writer = None
    field_ids = self.centroids[0].center.keys()
    headers = [u"Centroid_name"]
    headers.extend([u"%s" % self.fields[field_id]["name"]
                    for field_id in field_ids])
    headers.extend([u"Instances"])
    intercentroids = False
    header_complete = False

    centroids_list = sorted(self.centroids, key=lambda x: x.name)
    for centroid in centroids_list:
        row = [centroid.name]
        row.extend(self.centroid_features(centroid, field_ids,
                                          encode=False))
        row.append(centroid.count)
        if len(self.centroids) > 1:
            for measure, result in self.centroids_distance(centroid):
                if not intercentroids:
                    headers.append(u"%s intercentroid distance" % \
                        measure.title())
                row.append(result)
                intercentroids = True
        for measure, result in centroid.distance.items():
            if measure in CSV_STATISTICS:
                if not header_complete:
                    headers.append(u"Distance %s" %
                                   measure.lower().replace("_", " "))
                row.append(result)
        if not header_complete:
            rows.append(headers)
            header_complete = True
        rows.append(row)

    if self.cluster_global:
        row = [u"%s" % self.cluster_global.name]
        row.extend(self.centroid_features(self.cluster_global, field_ids,
                                          encode=False))
        row.append(self.cluster_global.count)
        if len(self.centroids) > 1:
            for measure, result in self.cluster_global_distance():
                row.append(result)
        for measure, result in self.cluster_global.distance.items():
            if measure in CSV_STATISTICS:
                row.append(result)
        # header is already in rows then insert cluster_global after it
        rows.insert(1, row)

    if file_name is None:
        return rows
    with UnicodeWriter(file_name) as writer:
        writer.writerows(rows)

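# Usage sketch (assumption: this is a method of the local Cluster object in
# the bigml bindings; the cluster id is hypothetical). Writes the per-centroid
# statistics to a CSV file, or returns them as rows when no file is given:
from bigml.cluster import Cluster

local_cluster = Cluster("cluster/53b1f71437203f5ac30004ed")
local_cluster.statistics_csv("centroids_statistics.csv")  # write to file
rows = local_cluster.statistics_csv()                     # or get the rows
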
def rules_csv(self, file_name, **kwargs):
    """Stores the rules in CSV format in the user-given file. The rules
       can be previously selected using the arguments in get_rules

    """
    rules = self.get_rules(**kwargs)
    rules = [self.describe(rule.to_csv()) for rule in rules]
    if file_name is None:
        raise ValueError("A valid file name is required to store the "
                         "rules.")
    with UnicodeWriter(file_name, quoting=csv.QUOTE_NONNUMERIC) as writer:
        writer.writerow(RULE_HEADERS)
        for rule in rules:
            writer.writerow([item if not isinstance(item, str)
                             else item.encode("utf-8")
                             for item in rule])

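# Usage sketch (assumption: this is a method of the local Association object
# in the bigml bindings; the association id is hypothetical). Any keyword
# arguments are forwarded to get_rules to filter the rules before writing:
from bigml.association import Association

local_association = Association("association/5af06df6b95b392a12000001")
local_association.rules_csv("association_rules.csv")
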
def statistics_csv(self, file_name=None):
    """Clusters statistic information in CSV format

    """
    rows = []
    writer = None
    field_ids = self.centroids[0].center.keys()
    headers = [u"centroid_name"]
    headers.extend([u"%s" % self.fields[field_id]["name"]
                    for field_id in field_ids])
    headers.extend([u"Instances"])
    intercentroids = False
    header_complete = False
    for centroid in self.centroids:
        row = [centroid.name]
        row.extend(self.centroid_features(centroid, field_ids))
        row.append(centroid.count)
        if len(self.centroids) > 1:
            for measure, result in self.centroids_distance(centroid):
                if not intercentroids:
                    headers.append(u"Intercentroids %s" % measure.lower())
                row.append(result)
                intercentroids = True
        for measure, result in centroid.distance.items():
            if measure in CSV_STATISTICS:
                if not header_complete:
                    headers.append(u"Data %s" %
                                   measure.lower().replace("_", " "))
                row.append(result)
        if not header_complete:
            rows.append(headers)
            header_complete = True
        rows.append(row)
    if file_name is None:
        return rows
    with UnicodeWriter(file_name) as writer:
        for row in rows:
            writer.writerow([item if not isinstance(item, basestring)
                             else item.encode("utf-8")
                             for item in row])

def write_forecasts(forecast, output):
    """Writes the final forecast to the required output

       The function creates a new file per field used in the forecast input
       data. The id of the field will be appended to the name provided in
       the `output` parameter.
    """
    for objective_id, forecast_value in forecast.items():
        headers = [f["model"] for f in forecast_value]
        points = []
        if not forecast_value:
            sys.exit("No forecasts available")
        for index in range(len(forecast_value[0]["point_forecast"])):
            points.append([f["point_forecast"][index]
                           for f in forecast_value])
        output_file = "%s_%s.csv" % (output, objective_id)
        with UnicodeWriter(output_file, lineterminator="\n") as out_handler:
            out_handler.writerow(headers)
            for row in points:
                out_handler.writerow(row)

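# Usage sketch: the `forecast` structure expected by the loop above, a dict
# keyed by objective field id with one entry per ETS model and its point
# forecasts. The field id, model names and values are made up for
# illustration; this call would write a "forecast_000001.csv" file with one
# column per model and one row per forecast point.
forecast = {"000001": [
    {"model": "A,A,N", "point_forecast": [68.53, 68.53, 68.53]},
    {"model": "M,N,N", "point_forecast": [68.29, 68.29, 68.29]}]}
write_forecasts(forecast, "forecast")
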
def remote_predict_ensemble(ensemble_id, test_reader, prediction_file, api,
                            args, resume=False, output_path=None,
                            session_file=None, log=None, exclude=None):
    """Retrieve predictions remotely and save predictions to file

    """
    prediction_args = {"tags": args.tag, "combiner": args.method}
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)

    if (not resume or
            not c.checkpoint(c.are_predictions_created, prediction_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)[0]):
        message = u.dated("Creating remote predictions.")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)

        with UnicodeWriter(prediction_file) as predictions_file:
            for input_data in test_reader:
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(ensemble_id,
                                                   input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                prediction = u.check_resource(prediction,
                                              api.get_prediction)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction,
                                                   args.prediction_info)
                write_prediction(prediction_row, predictions_file,
                                 args.prediction_info, input_data, exclude)

def centroid(clusters, fields, args, session_file=None):
    """Computes a centroid for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args)

        # Local centroids: Centroids are computed locally using clusters'
        # centroids distances
        message = u.dated("Creating local centroids.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_centroid(clusters, test_reader, output, args,
                       exclude=exclude)
    test_reader.close()

def tree_csv(model, file_name=None, leaves_only=False):
    """Outputs the node structure to a CSV file or array

    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    headers_names = []
    if model.regression:
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("error")
        max_bins = get_node(model.tree)[model.offsets["max_bins"]]
        for index in range(0, max_bins):
            headers_names.append("bin%s_value" % index)
            headers_names.append("bin%s_instances" % index)
    else:
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("confidence")
        headers_names.append("impurity")
        node = get_node(model.tree)
        for category, _ in node[model.offsets["distribution"]]:
            headers_names.append(category)

    nodes_generator = get_nodes_info(model, headers_names,
                                     leaves_only=leaves_only)
    if file_name is not None:
        with UnicodeWriter(file_name) as writer:
            writer.writerow([utf8(header) for header in headers_names])
            for row in nodes_generator:
                writer.writerow([item if not isinstance(item, str)
                                 else utf8(item)
                                 for item in row])
        return file_name
    rows = []
    rows.append(headers_names)
    for row in nodes_generator:
        rows.append(row)
    return rows

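# Usage sketch (the model id is hypothetical and the local Model class is
# assumed to come from the bigml bindings; tree_csv is the helper defined
# above). Dumps one row per node, or only the leaves, to a CSV file:
from bigml.model import Model

local_model = Model("model/50c0de043b563519830001c2")
tree_csv(local_model, file_name="model_nodes.csv", leaves_only=True)
rows = tree_csv(local_model)  # or get the rows as a list instead
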
def projection(pca, fields, args, session_file=None):
    """Computes the projection for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.projections
    test_reader = TestReader(test_set, test_set_header, fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        local_pca, kwargs = _local_pca(pca, args)
        pca_headers = ["PC%s" % (i + 1) for i in \
            range(0, len(local_pca.projection({})))]
        # columns to exclude if input_data is added to the projections field
        exclude = use_projection_headers(
            args.projection_header, output, test_reader, fields, args,
            pca_headers)
        # Local projection: Projections are computed locally
        message = u.dated("Creating local projections.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_projection(local_pca, kwargs, test_reader, output, args,
                         exclude=exclude)
    test_reader.close()

def summary_csv(self, filename=None):
    """Summary of the contents of the fields

    """
    summary = []
    writer = None
    if filename is not None:
        writer = UnicodeWriter(filename).open_writer()
        writer.writerow(SUMMARY_HEADERS)
    else:
        summary.append(SUMMARY_HEADERS)

    for field_column in self.fields_columns:
        field_id = self.field_id(field_column)
        field = self.fields.get(field_id)
        field_summary = []
        field_summary.append(field.get('column_number'))
        field_summary.append(field_id)
        field_summary.append(field.get('name'))
        field_summary.append(field.get('label'))
        field_summary.append(field.get('description'))
        field_summary.append(field.get('optype'))
        field_summary_value = field.get('summary', {})

        if not field_summary_value:
            field_summary.append("")  # no preferred info
            field_summary.append("")  # no missing info
            field_summary.append("")  # no error info
            field_summary.append("")  # no content summary
            field_summary.append("")  # no error summary
        else:
            field_summary.append(json.dumps(field.get('preferred')))
            field_summary.append(field_summary_value.get("missing_count"))
            if self.field_errors and field_id in self.field_errors.keys():
                errors = self.field_errors.get(field_id)
                field_summary.append(errors.get("total"))
            else:
                field_summary.append("0")
            if field['optype'] == 'numeric':
                field_summary.append("[%s, %s], mean: %s" % \
                    (field_summary_value.get("minimum"),
                     field_summary_value.get("maximum"),
                     field_summary_value.get("mean")))
            elif field['optype'] == 'categorical':
                categories = field_summary_value.get("categories")
                field_summary.append( \
                    attribute_summary(categories, u"categories",
                                      limit=LIST_LIMIT))
            elif field['optype'] == "text":
                terms = field_summary_value.get("tag_cloud")
                field_summary.append( \
                    attribute_summary(terms, u"terms",
                                      limit=LIST_LIMIT))
            elif field['optype'] == "items":
                items = field_summary_value.get("items")
                field_summary.append( \
                    attribute_summary(items, u"items",
                                      limit=LIST_LIMIT))
            else:
                field_summary.append("")
            if self.field_errors and field_id in self.field_errors.keys():
                field_summary.append( \
                    attribute_summary(errors.get("sample"), u"errors",
                                      limit=None))
            else:
                field_summary.append("")
        if writer:
            writer.writerow(field_summary)
        else:
            summary.append(field_summary)

    if writer is None:
        return summary
    else:
        writer.close_writer()

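# Usage sketch (assumption: this is a method of bigml.fields.Fields; the
# dataset id is hypothetical). Writes one row per field with its optype,
# missing counts, errors and a short content summary:
from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
dataset = api.get_dataset("dataset/50c0de043b563519830001c1")
fields = Fields(dataset)
fields.summary_csv("fields_summary.csv")  # or fields.summary_csv() for rows
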
def best_candidates_number(datasets_file, args, common_options,
                           penalty=None, resume=False):
    """Selecting the best number of random candidates to be used in the
       ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(os.path.join(args.output_dir,
                                                    CANDIDATES_LOG))
    candidates_writer = UnicodeWriter(candidates_file).open_writer()
    candidates_writer.writerow(CANDIDATES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "random"))
    max_candidates = args.max_candidates + 1

    if args.nodes_step is None:
        args.nodes_step = DEFAULT_CANDIDATES_STEP
    random_candidates = args.min_candidates

    if penalty is None:
        penalty = DEFAULT_CANDIDATES_PENALTY
    best_score = - float('inf')
    metric = args.optimize
    score = best_score
    best_counter = 0
    while random_candidates < max_candidates:
        loop_counter += 1
        (score,
         metric_value,
         metric,
         resume) = candidates_evaluate(datasets_file, args,
                                       random_candidates, common_options,
                                       penalty=penalty, resume=resume,
                                       metric=metric)
        candidates_writer.writerow([
            loop_counter, random_candidates, score, metric_value, best_score])
        if (score - EPSILON) > best_score:
            best_candidates = random_candidates
            best_score = score
            best_counter = loop_counter
            message = 'New best random candidates number is: %s\n' % \
                best_candidates
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        random_candidates += DEFAULT_CANDIDATES_STEP
    if args.predictions_csv:
        resume = create_prediction_dataset(output_dir,
                                           "random%s" % best_counter,
                                           args, resume)
    message = ('The best random candidates number is: %s \n'
               % best_candidates)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    candidates_writer.close_writer()
    return best_candidates

def best_node_threshold(datasets_file, args, common_options,
                        staleness=None, penalty=None, resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir,
                                               NODES_LOG))
    nodes_writer = UnicodeWriter(nodes_file).open_writer()
    nodes_writer.writerow(NODES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "node_th"))
    max_nodes = args.max_nodes + 1

    if args.min_nodes is None:
        args.min_nodes = DEFAULT_MIN_NODES
    if args.nodes_step is None:
        args.nodes_step = DEFAULT_NODES_STEP
    node_threshold = args.min_nodes
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_NODES_PENALTY
    best_score = - float('inf')
    best_unchanged_count = 0
    metric = args.optimize
    score = best_score
    best_counter = 0
    while best_unchanged_count < staleness and node_threshold < max_nodes:
        loop_counter += 1
        (score,
         metric_value,
         metric,
         resume) = node_threshold_evaluate(datasets_file, args,
                                           node_threshold, common_options,
                                           penalty=penalty, resume=resume,
                                           metric=metric)
        nodes_writer.writerow([
            loop_counter - 1, node_threshold, score, metric_value,
            best_score])
        if (score - EPSILON) > best_score:
            best_threshold = node_threshold
            best_score = score
            best_unchanged_count = 0
            best_counter = loop_counter
            message = 'New best node threshold: %s\n' % (best_threshold)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        else:
            best_unchanged_count += 1
        node_threshold += args.nodes_step
    if args.predictions_csv:
        resume = create_prediction_dataset(output_dir,
                                           "node_th%s" % best_counter,
                                           args, resume)
    message = ('The best node threshold is: %s \n' % best_threshold)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    nodes_writer.close_writer()
    return best_threshold

def batch_predict(self, input_data_list, output_file_path=None,
                  by_name=True, reuse=False,
                  missing_strategy=LAST_PREDICTION, headers=None,
                  to_file=True, use_median=False):
    """Makes predictions for a list of input data.

       When the to_file argument is set to True, the predictions
       generated for each model are stored in an output file. The name of
       the file will use the following syntax:

            model_[id of the model]__predictions.csv

       For instance, when using model/50c0de043b563519830001c2 to predict,
       the output file name will be

            model_50c0de043b563519830001c2__predictions.csv

       On the contrary, if it is False, the function returns a list of
       MultiVote objects with the model's predictions.
    """
    add_headers = (isinstance(input_data_list[0], list) and
                   headers is not None and
                   len(headers) == len(input_data_list[0]))
    if not add_headers and not isinstance(input_data_list[0], dict):
        raise ValueError("Input data list is not a dictionary or the"
                         " headers and input data information are not"
                         " consistent.")
    order = 0
    if not to_file:
        votes = []

    for model in self.models:
        order += 1
        out = None
        if to_file:
            output_file = get_predictions_file_name(model.resource_id,
                                                    output_file_path)
            if reuse:
                try:
                    predictions_file = open(output_file)
                    predictions_file.close()
                    continue
                except IOError:
                    pass
            try:
                out = UnicodeWriter(output_file)
            except IOError:
                raise Exception("Cannot find %s directory." %
                                output_file_path)

        if out:
            out.open_writer()
        for index, input_data in enumerate(input_data_list):
            if add_headers:
                input_data = dict(zip(headers, input_data))
            prediction = model.predict(input_data, by_name=by_name,
                                       with_confidence=True,
                                       missing_strategy=missing_strategy)
            if use_median and model.tree.regression:
                # if median is to be used, we just place it as prediction
                # starting the list
                prediction[0] = prediction[-1]
                prediction = prediction[:-1]
            if to_file:
                out.writerow(prediction)
            else:
                # prediction is a row that contains prediction, confidence,
                # distribution, instances
                prediction_row = prediction[0: 2]
                prediction_row.append(order)
                prediction_row.extend(prediction[2:])
                if len(votes) <= index:
                    votes.append(MultiVote([]))
                votes[index].append_row(prediction_row)
        if out:
            out.close_writer()
    if not to_file:
        return votes

def best_candidates_number(datasets_file, args, command_obj,
                           penalty=None, resume=False):
    """Selecting the best number of random candidates to be used in the
       ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(
        os.path.join(args.output_dir, CANDIDATES_LOG))
    candidates_writer = UnicodeWriter(candidates_file).open_writer()
    candidates_writer.writerow(CANDIDATES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "random"))
    max_candidates = args.max_candidates + 1

    if args.nodes_step is None:
        args.nodes_step = DEFAULT_CANDIDATES_STEP
    random_candidates = args.min_candidates

    if penalty is None:
        penalty = DEFAULT_CANDIDATES_PENALTY
    best_score = -float('inf')
    metric = args.optimize
    score = best_score
    best_counter = 0
    while random_candidates < max_candidates:
        loop_counter += 1
        (score,
         metric_value,
         metric,
         resume) = candidates_evaluate(datasets_file, args,
                                       random_candidates, command_obj,
                                       penalty=penalty, resume=resume,
                                       metric=metric)
        candidates_writer.writerow(
            [loop_counter, random_candidates, score, metric_value,
             best_score])
        if (score - EPSILON) > best_score:
            best_candidates = random_candidates
            best_score = score
            best_counter = loop_counter
            message = 'New best random candidates number is: %s\n' % \
                best_candidates
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        random_candidates += DEFAULT_CANDIDATES_STEP
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "random%s" % best_counter,
                                           args, resume)
    message = ('The best random candidates number is: %s \n'
               % best_candidates)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    candidates_writer.close_writer()
    return best_candidates

def best_node_threshold(datasets_file, args, command_obj,
                        staleness=None, penalty=None, resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir,
                                               NODES_LOG))
    nodes_writer = UnicodeWriter(nodes_file).open_writer()
    nodes_writer.writerow(NODES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "node_th"))
    max_nodes = args.max_nodes + 1

    if args.min_nodes is None:
        args.min_nodes = DEFAULT_MIN_NODES
    if args.nodes_step is None:
        args.nodes_step = DEFAULT_NODES_STEP
    node_threshold = args.min_nodes
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_NODES_PENALTY
    best_score = -float('inf')
    best_unchanged_count = 0
    metric = args.optimize
    score = best_score
    best_counter = 0
    while best_unchanged_count < staleness and node_threshold < max_nodes:
        loop_counter += 1
        (score,
         metric_value,
         metric,
         resume) = node_threshold_evaluate(datasets_file, args,
                                           node_threshold, command_obj,
                                           penalty=penalty, resume=resume,
                                           metric=metric)
        nodes_writer.writerow([
            loop_counter - 1, node_threshold, score, metric_value,
            best_score])
        if (score - EPSILON) > best_score:
            best_threshold = node_threshold
            best_score = score
            best_unchanged_count = 0
            best_counter = loop_counter
            message = 'New best node threshold: %s\n' % (best_threshold)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        else:
            best_unchanged_count += 1
        node_threshold += args.nodes_step
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "node_th%s" % best_counter,
                                           args, resume)
    message = ('The best node threshold is: %s \n' % best_threshold)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    nodes_writer.close_writer()
    return best_threshold

def batch_predict(self, input_data_list, output_file_path=None,
                  by_name=True, reuse=False,
                  missing_strategy=LAST_PREDICTION, headers=None,
                  to_file=True, use_median=False):
    """Makes predictions for a list of input data.

       When the to_file argument is set to True, the predictions
       generated for each model are stored in an output file. The name of
       the file will use the following syntax:

            model_[id of the model]__predictions.csv

       For instance, when using model/50c0de043b563519830001c2 to predict,
       the output file name will be

            model_50c0de043b563519830001c2__predictions.csv

       On the contrary, if it is False, the function returns a list of
       MultiVote objects with the model's predictions.
    """
    add_headers = (isinstance(input_data_list[0], list) and
                   headers is not None and
                   len(headers) == len(input_data_list[0]))
    if not add_headers and not isinstance(input_data_list[0], dict):
        raise ValueError("Input data list is not a dictionary or the"
                         " headers and input data information are not"
                         " consistent.")
    order = 0
    if not to_file:
        votes = []

    for model in self.models:
        order += 1
        out = None
        if to_file:
            output_file = get_predictions_file_name(
                model.resource_id, output_file_path)
            if reuse:
                try:
                    predictions_file = open(output_file)
                    predictions_file.close()
                    continue
                except IOError:
                    pass
            try:
                out = UnicodeWriter(output_file)
            except IOError:
                raise Exception("Cannot find %s directory." %
                                output_file_path)

        if out:
            out.open_writer()
        for index, input_data in enumerate(input_data_list):
            if add_headers:
                input_data = dict(zip(headers, input_data))
            prediction = model.predict(input_data, by_name=by_name,
                                       with_confidence=True,
                                       missing_strategy=missing_strategy)
            if use_median and model.tree.regression:
                # if median is to be used, we just place it as prediction
                # starting the list
                prediction[0] = prediction[-1]
                prediction = prediction[:-1]
            if to_file:
                out.writerow(prediction)
            else:
                # prediction is a row that contains prediction, confidence,
                # distribution, instances
                prediction_row = prediction[0:2]
                prediction_row.append(order)
                prediction_row.extend(prediction[2:])
                if len(votes) <= index:
                    votes.append(MultiVote([]))
                votes[index].append_row(prediction_row)
        if out:
            out.close_writer()
    if not to_file:
        return votes

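# Usage sketch (assumption: this is the MultiModel method from the bigml
# bindings; the model ids and input fields are hypothetical). With
# to_file=False the per-model predictions come back as MultiVote objects
# instead of being written to model_[id]__predictions.csv files:
from bigml.multimodel import MultiModel

multi_model = MultiModel(["model/50c0de043b563519830001c2",
                          "model/50c0de043b563519830001c3"])
input_data_list = [{"petal length": 4.2}, {"petal length": 1.3}]
votes = multi_model.batch_predict(input_data_list, to_file=False)
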
def predict(models, fields, args, api=None, log=None,
            resume=False, session_file=None, labels=None, models_per_label=1,
            other_label=OTHER, multi_label_data=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions are computed locally using MultiModels on subgroups of
       models. Choosing a max_batch_models value not bigger than the
       number_of_models flag will lead to the last case, where memory usage
       is bounded and each model's predictions are saved for further use.
    """
    test_set = args.test_set
    test_set_header = args.test_header
    objective_field = args.objective_field
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             objective_field,
                             test_separator=args.test_separator)

    prediction_file = output
    output_path = u.check_dir(output)
    with UnicodeWriter(output) as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args,
            objective_field)

        # Remote predictions: predictions are computed in bigml.com and stored
        # in a file named after the model in the following syntax:
        #     model_[id of the model]__predictions.csv
        # For instance,
        #     model_50c0de043b563519830001c2_predictions.csv
        # Predictions are computed individually only if no_batch flag is set
        if args.remote and args.no_batch and not args.multi_label:
            if args.ensemble is not None:
                remote_predict_ensemble(args.ensemble, test_reader,
                                        prediction_file, api, args, resume,
                                        output_path, session_file, log,
                                        exclude)
            else:
                remote_predict_models(models, test_reader, prediction_file,
                                      api, args, resume, output_path,
                                      session_file, log, exclude)
            return
        # Local predictions: Predictions are computed locally using models'
        # rules with MultiModel's predict method
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        options = {}
        if args.method == THRESHOLD_CODE:
            options.update(threshold=args.threshold)
            if args.threshold_class is None:
                local_model = Model(models[0])
                # default class is the first class that appears in the dataset
                # objective field summary, which might be different from the
                # objective summary of each model because models are built
                # with sampling
                objective_field = local_model.objective_id
                distribution = local_model.tree.fields[objective_field][ \
                    "summary"]["categories"]
                args.threshold_class = distribution[0][0]
            options.update(category=args.threshold_class)
        # For a model we build a Model and for a small number of models,
        # we build a MultiModel using all of
        # the given models and issue a combined prediction
        if (len(models) <= args.max_batch_models \
                and args.fast and \
                not args.multi_label and args.max_categories == 0 \
                and args.method != COMBINATION):
            local_predict(models, test_reader, output, args, options,
                          exclude)
        elif args.boosting:
            local_predict(args.ensemble, test_reader, output, args,
                          options, exclude)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            # Local predictions: predictions are computed locally using
            # models' rules with MultiModel's predict method and combined
            # using aggregation if the objective field is a multi-labelled
            # field or one of the available combination methods: plurality,
            # confidence weighted and probability weighted
            if args.multi_label:
                method = AGGREGATION
            elif args.max_categories > 0:
                method = COMBINATION
            else:
                method = args.method

            # For multi-labelled models, the --models flag keeps the order
            # of the labels and the models but the --model-tag flag
            # retrieves the models with no order, so the correspondence with
            # each label must be restored.
            ordered = True
            if args.multi_label and (args.model_tag is not None or
                                     models_per_label > 1):
                ordered = False

            local_batch_predict(models, test_reader, prediction_file, api,
                                args, resume=resume,
                                output_path=output_path,
                                output=output, method=method,
                                options=options,
                                session_file=session_file, labels=labels,
                                ordered=ordered, exclude=exclude,
                                models_per_label=models_per_label,
                                other_label=other_label,
                                multi_label_data=multi_label_data)
    test_reader.close()

def remote_predict_models(models, test_reader, prediction_file, api, args,
                          resume=False, output_path=None, session_file=None,
                          log=None, exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to file

    """
    predictions_files = []
    prediction_args = {"tags": args.tag}
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(),
                                 debug=args.debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            message_logged = True
            with UnicodeWriter(predictions_file) as predictions_file:
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(model, input_data_dict,
                                                       wait_time=0,
                                                       args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    prediction_row = prediction_to_row(prediction)
                    predictions_file.writerow(prediction_row)
                    if single_model:
                        write_prediction(prediction_row[0:2], prediction_file,
                                         args.prediction_info, input_data,
                                         exclude)
    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files,
                      Model(models[0]).to_prediction,
                      prediction_file, args.method,
                      args.prediction_info, raw_input_data_list, exclude)
