def best_candidates_number(datasets_file, args, common_options,
                           penalty=None, resume=False):
    """Selecting the best number of random candidates
       to be used in the ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(os.path.join(args.output_dir,
                                                    CANDIDATES_LOG))
    candidates_writer = UnicodeWriter(candidates_file).open_writer()
    candidates_writer.writerow(CANDIDATES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "random"))
    max_candidates = args.max_candidates + 1

    if args.nodes_step is None:
        args.nodes_step = DEFAULT_CANDIDATES_STEP
    random_candidates = args.min_candidates

    if penalty is None:
        penalty = DEFAULT_CANDIDATES_PENALTY
    best_score = - float('inf')
    metric = args.optimize
    score = best_score
    best_counter = 0
    while random_candidates < max_candidates:
        loop_counter += 1
        (score, metric_value, metric,
         resume) = candidates_evaluate(datasets_file, args,
                                       random_candidates, common_options,
                                       penalty=penalty, resume=resume,
                                       metric=metric)
        candidates_writer.writerow([
            loop_counter, random_candidates, score, metric_value,
            best_score])
        if (score - EPSILON) > best_score:
            best_candidates = random_candidates
            best_score = score
            best_counter = loop_counter
            message = 'New best random candidates number is: %s\n' % \
                best_candidates
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        random_candidates += DEFAULT_CANDIDATES_STEP
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "random%s" % best_counter,
                                           args, resume)
    message = ('The best random candidates number is: %s \n'
               % best_candidates)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    candidates_writer.close_writer()
    return best_candidates
def best_candidates_number(datasets_file, args, command_obj,
                           penalty=None, resume=False):
    """Selecting the best number of random candidates
       to be used in the ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(os.path.join(args.output_dir,
                                                    CANDIDATES_LOG))
    candidates_writer = UnicodeWriter(candidates_file).open_writer()
    candidates_writer.writerow(CANDIDATES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "random"))
    max_candidates = args.max_candidates + 1

    if args.nodes_step is None:
        args.nodes_step = DEFAULT_CANDIDATES_STEP
    random_candidates = args.min_candidates

    if penalty is None:
        penalty = DEFAULT_CANDIDATES_PENALTY
    best_score = - float('inf')
    metric = args.optimize
    score = best_score
    best_counter = 0
    while random_candidates < max_candidates:
        loop_counter += 1
        (score, metric_value, metric,
         resume) = candidates_evaluate(datasets_file, args,
                                       random_candidates, command_obj,
                                       penalty=penalty, resume=resume,
                                       metric=metric)
        candidates_writer.writerow([
            loop_counter, random_candidates, score, metric_value,
            best_score])
        if (score - EPSILON) > best_score:
            best_candidates = random_candidates
            best_score = score
            best_counter = loop_counter
            message = 'New best random candidates number is: %s\n' % \
                best_candidates
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        random_candidates += DEFAULT_CANDIDATES_STEP
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "random%s" % best_counter,
                                           args, resume)
    message = ('The best random candidates number is: %s \n'
               % best_candidates)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    candidates_writer.close_writer()
    return best_candidates
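# Both best_candidates_number variants above run the same linear sweep and
# only differ in the object handed to candidates_evaluate (common_options
# vs. command_obj). A minimal, self-contained sketch of that sweep pattern;
# the evaluate callable and the step and epsilon defaults below are
# illustrative assumptions, not part of the bigmler API.
def sweep_best_candidates(evaluate, min_candidates, max_candidates,
                          step=1, epsilon=0.001):
    """Returns the candidates number whose score improves on the best one
       found so far by more than epsilon.
    """
    best_score = - float('inf')
    best_candidates = min_candidates
    candidates = min_candidates
    while candidates < max_candidates + 1:
        score = evaluate(candidates)
        if (score - epsilon) > best_score:
            best_score = score
            best_candidates = candidates
        candidates += step
    return best_candidates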
def best_node_threshold(datasets_file, args, common_options,
                        staleness=None, penalty=None, resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir,
                                               NODES_LOG))
    nodes_writer = UnicodeWriter(nodes_file).open_writer()
    nodes_writer.writerow(NODES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "node_th"))
    max_nodes = args.max_nodes + 1

    if args.min_nodes is None:
        args.min_nodes = DEFAULT_MIN_NODES
    if args.nodes_step is None:
        args.nodes_step = DEFAULT_NODES_STEP
    node_threshold = args.min_nodes

    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_NODES_PENALTY
    best_score = - float('inf')
    best_unchanged_count = 0
    metric = args.optimize
    score = best_score
    best_counter = 0
    while best_unchanged_count < staleness and node_threshold < max_nodes:
        loop_counter += 1
        (score, metric_value, metric,
         resume) = node_threshold_evaluate(datasets_file, args,
                                           node_threshold, common_options,
                                           penalty=penalty, resume=resume,
                                           metric=metric)
        nodes_writer.writerow([
            loop_counter - 1, node_threshold, score, metric_value,
            best_score])
        if (score - EPSILON) > best_score:
            best_threshold = node_threshold
            best_score = score
            best_unchanged_count = 0
            best_counter = loop_counter
            message = 'New best node threshold: %s\n' % (best_threshold)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        else:
            best_unchanged_count += 1
        node_threshold += args.nodes_step
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "node_th%s" % best_counter,
                                           args, resume)
    message = ('The best node threshold is: %s \n' % best_threshold)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    nodes_writer.close_writer()
    return best_threshold
def best_node_threshold(datasets_file, args, command_obj,
                        staleness=None, penalty=None, resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir,
                                               NODES_LOG))
    nodes_writer = UnicodeWriter(nodes_file).open_writer()
    nodes_writer.writerow(NODES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "node_th"))
    max_nodes = args.max_nodes + 1

    if args.min_nodes is None:
        args.min_nodes = DEFAULT_MIN_NODES
    if args.nodes_step is None:
        args.nodes_step = DEFAULT_NODES_STEP
    node_threshold = args.min_nodes

    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_NODES_PENALTY
    best_score = - float('inf')
    best_unchanged_count = 0
    metric = args.optimize
    score = best_score
    best_counter = 0
    while best_unchanged_count < staleness and node_threshold < max_nodes:
        loop_counter += 1
        (score, metric_value, metric,
         resume) = node_threshold_evaluate(datasets_file, args,
                                           node_threshold, command_obj,
                                           penalty=penalty, resume=resume,
                                           metric=metric)
        nodes_writer.writerow([
            loop_counter - 1, node_threshold, score, metric_value,
            best_score])
        if (score - EPSILON) > best_score:
            best_threshold = node_threshold
            best_score = score
            best_unchanged_count = 0
            best_counter = loop_counter
            message = 'New best node threshold: %s\n' % (best_threshold)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        else:
            best_unchanged_count += 1
        node_threshold += args.nodes_step
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "node_th%s" % best_counter,
                                           args, resume)
    message = ('The best node threshold is: %s \n' % best_threshold)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    nodes_writer.close_writer()
    return best_threshold
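# best_node_threshold layers an early-stopping rule on top of the same
# sweep: the search ends once `staleness` consecutive thresholds fail to
# improve the best score by more than EPSILON. A hedged, self-contained
# sketch of that stopping rule; the defaults below are illustrative
# assumptions, not the bigmler constants.
def sweep_best_threshold(evaluate, min_nodes, max_nodes, step=100,
                         staleness=5, epsilon=0.001):
    """Returns the node threshold with the best score, stopping early
       after `staleness` non-improving steps.
    """
    best_score = - float('inf')
    best_threshold = min_nodes
    unchanged_count = 0
    threshold = min_nodes
    while unchanged_count < staleness and threshold < max_nodes + 1:
        score = evaluate(threshold)
        if (score - epsilon) > best_score:
            best_score = score
            best_threshold = threshold
            unchanged_count = 0
        else:
            unchanged_count += 1
        threshold += step
    return best_threshold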
def summary_csv(self, filename=None):
    """Summary of the contents of the fields

    """
    summary = []
    writer = None
    if filename is not None:
        writer = UnicodeWriter(filename).open_writer()
        writer.writerow(SUMMARY_HEADERS)
    else:
        summary.append(SUMMARY_HEADERS)

    for field_column in self.fields_columns:
        field_id = self.field_id(field_column)
        field = self.fields.get(field_id)
        field_summary = []
        field_summary.append(field.get('column_number'))
        field_summary.append(field_id)
        field_summary.append(field.get('name'))
        field_summary.append(field.get('label'))
        field_summary.append(field.get('description'))
        field_summary.append(field.get('optype'))
        field_summary_value = field.get('summary', {})

        if not field_summary_value:
            field_summary.append("")  # no preferred info
            field_summary.append("")  # no missing info
            field_summary.append("")  # no error info
            field_summary.append("")  # no content summary
            field_summary.append("")  # no error summary
        else:
            field_summary.append(json.dumps(field.get('preferred')))
            field_summary.append(field_summary_value.get("missing_count"))
            if self.field_errors and field_id in self.field_errors.keys():
                errors = self.field_errors.get(field_id)
                field_summary.append(errors.get("total"))
            else:
                field_summary.append("0")
            if field['optype'] == 'numeric':
                field_summary.append("[%s, %s], mean: %s" % \
                    (field_summary_value.get("minimum"),
                     field_summary_value.get("maximum"),
                     field_summary_value.get("mean")))
            elif field['optype'] == 'categorical':
                categories = field_summary_value.get("categories")
                field_summary.append( \
                    attribute_summary(categories, u"categories",
                                      limit=LIST_LIMIT))
            elif field['optype'] == "text":
                terms = field_summary_value.get("tag_cloud")
                field_summary.append( \
                    attribute_summary(terms, u"terms",
                                      limit=LIST_LIMIT))
            elif field['optype'] == "items":
                items = field_summary_value.get("items")
                field_summary.append( \
                    attribute_summary(items, u"items",
                                      limit=LIST_LIMIT))
            else:
                field_summary.append("")
            if self.field_errors and field_id in self.field_errors.keys():
                field_summary.append( \
                    attribute_summary(errors.get("sample"),
                                      u"errors", limit=None))
            else:
                field_summary.append("")
        if writer:
            writer.writerow(field_summary)
        else:
            summary.append(field_summary)
    if writer is None:
        return summary
    else:
        writer.close_writer()
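# Hypothetical usage sketch for summary_csv; the Fields construction below
# is an assumption for illustration, and only the filename/return behaviour
# comes from the method itself: with a filename the per-field rows are
# written to that CSV and nothing is returned, without one the rows are
# returned as a list of lists with SUMMARY_HEADERS first.
#
#     fields = Fields(dataset)                   # assumed constructor call
#     fields.summary_csv("fields_summary.csv")   # writes the summary CSV
#     rows = fields.summary_csv()                # rows[0] == SUMMARY_HEADERS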
def batch_predict(self, input_data_list, output_file_path=None,
                  by_name=True, reuse=False,
                  missing_strategy=LAST_PREDICTION, headers=None,
                  to_file=True, use_median=False):
    """Makes predictions for a list of input data.

       When the to_file argument is set to True, the predictions
       generated for each model are stored in an output file. The name
       of the file will use the following syntax:

            model_[id of the model]__predictions.csv

       For instance, when using model/50c0de043b563519830001c2 to
       predict, the output file name will be
            model_50c0de043b563519830001c2__predictions.csv

       On the contrary, if it is False, the function returns a list
       of MultiVote objects with the model's predictions.
    """
    add_headers = (isinstance(input_data_list[0], list) and
                   headers is not None and
                   len(headers) == len(input_data_list[0]))
    if not add_headers and not isinstance(input_data_list[0], dict):
        raise ValueError("Input data list is not a dictionary or the"
                         " headers and input data information are not"
                         " consistent.")
    order = 0
    if not to_file:
        votes = []

    for model in self.models:
        order += 1
        out = None
        if to_file:
            output_file = get_predictions_file_name(model.resource_id,
                                                    output_file_path)
            if reuse:
                try:
                    predictions_file = open(output_file)
                    predictions_file.close()
                    continue
                except IOError:
                    pass
            try:
                out = UnicodeWriter(output_file)
            except IOError:
                raise Exception("Cannot find %s directory." %
                                output_file_path)

        if out:
            out.open_writer()
        for index, input_data in enumerate(input_data_list):
            if add_headers:
                input_data = dict(zip(headers, input_data))
            prediction = model.predict(input_data, by_name=by_name,
                                       with_confidence=True,
                                       missing_strategy=missing_strategy)
            if use_median and model.tree.regression:
                # if median is to be used, we just place it as prediction
                # starting the list
                prediction[0] = prediction[-1]
                prediction = prediction[:-1]
            if to_file:
                out.writerow(prediction)
            else:
                # prediction is a row that contains prediction, confidence,
                # distribution, instances
                prediction_row = prediction[0:2]
                prediction_row.append(order)
                prediction_row.extend(prediction[2:])
                if len(votes) <= index:
                    votes.append(MultiVote([]))
                votes[index].append_row(prediction_row)
        if out:
            out.close_writer()
    if not to_file:
        return votes
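# The per-model output file name described in the batch_predict docstring
# follows the pattern model_[id]__predictions.csv. A minimal illustrative
# helper that reproduces that naming; it is an assumption sketched for
# illustration, not the actual get_predictions_file_name implementation.
import os


def predictions_file_name(resource_id, path=None):
    """Builds model_<id>__predictions.csv from a resource id such as
       'model/50c0de043b563519830001c2'.
    """
    name = "%s__predictions.csv" % resource_id.replace("/", "_")
    return name if path is None else os.path.join(path, name)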