Example #1
def best_candidates_number(datasets_file, args, common_options,
                           penalty=None,
                           resume=False):
    """Selecting the best number of random candidates
       to be used in the ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(os.path.join(args.output_dir,
                                                    CANDIDATES_LOG))
    candidates_writer = UnicodeWriter(candidates_file).open_writer()
    candidates_writer.writerow(CANDIDATES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "random"))
    max_candidates = args.max_candidates + 1

    if args.nodes_step is None:
        args.nodes_step = DEFAULT_CANDIDATES_STEP
    random_candidates = args.min_candidates

    if penalty is None:
        penalty = DEFAULT_CANDIDATES_PENALTY
    best_score = -float('inf')
    metric = args.optimize
    score = best_score
    best_counter = 0
    while random_candidates < max_candidates:
        loop_counter += 1
        (score,
         metric_value,
         metric,
         resume) = candidates_evaluate(datasets_file, args,
                                       random_candidates, common_options,
                                       penalty=penalty, resume=resume,
                                       metric=metric)
        candidates_writer.writerow([
            loop_counter, random_candidates, score, metric_value,
            best_score])
        if (score - EPSILON) > best_score:
            best_candidates = random_candidates
            best_score = score
            best_counter = loop_counter
            message = 'New best random candidates number is: %s\n' % \
                best_candidates
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value,
                                                      score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        random_candidates += DEFAULT_CANDIDATES_STEP
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "random%s" % best_counter,
                                           args, resume)
    message = ('The best random candidates number is: %s \n'
               % best_candidates)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    candidates_writer.close_writer()
    return best_candidates
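
The search above is a straight linear sweep: each candidate count is evaluated in turn and the argmax is kept, with the `(score - EPSILON) > best_score` test ensuring that ties and floating-point noise do not displace the current best. A minimal, dependency-free sketch of that pattern (the function name, the evaluator, and the EPSILON value are illustrative, not BigMLer's):

EPSILON = 0.001  # illustrative tolerance; BigMLer defines its own constant

def best_by_sweep(values, evaluate):
    """Return the value whose evaluation scores highest, preferring earlier ties."""
    best_value, best_score = None, -float('inf')
    for value in values:
        score = evaluate(value)
        # only a strictly better score (beyond the tolerance) becomes the new best
        if (score - EPSILON) > best_score:
            best_value, best_score = value, score
    return best_value

# e.g. sweeping candidate counts from 1 to 32 in steps of 2:
# best = best_by_sweep(range(1, 33, 2), my_evaluation_function)
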
Example #2
def best_candidates_number(datasets_file,
                           args,
                           command_obj,
                           penalty=None,
                           resume=False):
    """Selecting the best number of random candidates
       to be used in the ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(
        os.path.join(args.output_dir, CANDIDATES_LOG))
    candidates_writer = UnicodeWriter(candidates_file).open_writer()
    candidates_writer.writerow(CANDIDATES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir, "random"))
    max_candidates = args.max_candidates + 1

    if args.nodes_step is None:
        args.nodes_step = DEFAULT_CANDIDATES_STEP
    random_candidates = args.min_candidates

    if penalty is None:
        penalty = DEFAULT_CANDIDATES_PENALTY
    best_score = -float('inf')
    metric = args.optimize
    score = best_score
    best_counter = 0
    while random_candidates < max_candidates:
        loop_counter += 1
        (score, metric_value, metric,
         resume) = candidates_evaluate(datasets_file,
                                       args,
                                       random_candidates,
                                       command_obj,
                                       penalty=penalty,
                                       resume=resume,
                                       metric=metric)
        candidates_writer.writerow(
            [loop_counter, random_candidates, score, metric_value, best_score])
        if (score - EPSILON) > best_score:
            best_candidates = random_candidates
            best_score = score
            best_counter = loop_counter
            message = 'New best random candidates number is: %s\n' % \
                best_candidates
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
        random_candidates += DEFAULT_CANDIDATES_STEP
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "random%s" % best_counter, args,
                                           resume)
    message = ('The best random candidates number is: %s \n' % best_candidates)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    candidates_writer.close_writer()
    return best_candidates
Example #3
def best_node_threshold(datasets_file, args, common_options,
                        staleness=None, penalty=None,
                        resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir,
                                               NODES_LOG))
    nodes_writer = UnicodeWriter(nodes_file).open_writer()
    nodes_writer.writerow(NODES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "node_th"))
    max_nodes = args.max_nodes + 1

    if args.min_nodes is None:
        args.min_nodes = DEFAULT_MIN_NODES
    if args.nodes_step is None:
        args.nodes_step = DEFAULT_NODES_STEP
    node_threshold = args.min_nodes
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_NODES_PENALTY
    best_score = -float('inf')
    best_unchanged_count = 0
    metric = args.optimize
    score = best_score
    best_counter = 0
    while best_unchanged_count < staleness and node_threshold < max_nodes:
        loop_counter += 1
        (score,
         metric_value,
         metric,
         resume) = node_threshold_evaluate(datasets_file, args,
                                           node_threshold, common_options,
                                           penalty=penalty, resume=resume,
                                           metric=metric)
        nodes_writer.writerow([
            loop_counter - 1, node_threshold, score, metric_value, best_score])
        if (score - EPSILON) > best_score:
            best_threshold = node_threshold
            best_score = score
            best_unchanged_count = 0
            best_counter = loop_counter
            message = 'New best node threshold: %s\n' % (best_threshold)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value,
                                                      score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        else:
            best_unchanged_count += 1
        node_threshold += args.nodes_step
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "node_th%s" % best_counter,
                                           args, resume)
    message = ('The best node threshold is: %s \n'
               % best_threshold)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    nodes_writer.close_writer()
    return best_threshold
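
Compared with the candidates sweep, this search adds early stopping: once `staleness` consecutive evaluations fail to improve on the best score, the loop exits before reaching `max_nodes`. A minimal sketch of that loop in isolation (all names and defaults here are illustrative):

def best_with_staleness(start, step, limit, staleness, evaluate, epsilon=0.001):
    """Sweep upward from `start`, stopping after `staleness` non-improving steps."""
    best_value, best_score = None, -float('inf')
    unchanged = 0
    value = start
    while unchanged < staleness and value < limit:
        score = evaluate(value)
        if (score - epsilon) > best_score:
            best_value, best_score = value, score
            unchanged = 0  # an improvement resets the staleness counter
        else:
            unchanged += 1
        value += step
    return best_value
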
Example #4
def best_node_threshold(datasets_file,
                        args,
                        command_obj,
                        staleness=None,
                        penalty=None,
                        resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir, NODES_LOG))
    nodes_writer = UnicodeWriter(nodes_file).open_writer()
    nodes_writer.writerow(NODES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "node_th"))
    max_nodes = args.max_nodes + 1

    if args.min_nodes is None:
        args.min_nodes = DEFAULT_MIN_NODES
    if args.nodes_step is None:
        args.nodes_step = DEFAULT_NODES_STEP
    node_threshold = args.min_nodes
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_NODES_PENALTY
    best_score = -float('inf')
    best_unchanged_count = 0
    metric = args.optimize
    score = best_score
    best_counter = 0
    while best_unchanged_count < staleness and node_threshold < max_nodes:
        loop_counter += 1
        (score, metric_value, metric,
         resume) = node_threshold_evaluate(datasets_file,
                                           args,
                                           node_threshold,
                                           command_obj,
                                           penalty=penalty,
                                           resume=resume,
                                           metric=metric)
        nodes_writer.writerow([
            loop_counter - 1, node_threshold, score, metric_value, best_score
        ])
        if (score - EPSILON) > best_score:
            best_threshold = node_threshold
            best_score = score
            best_unchanged_count = 0
            best_counter = loop_counter
            message = 'New best node threshold: %s\n' % (best_threshold)
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
        else:
            best_unchanged_count += 1
        node_threshold += args.nodes_step
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "node_th%s" % best_counter, args,
                                           resume)
    message = ('The best node threshold is: %s \n' % best_threshold)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    nodes_writer.close_writer()
    return best_threshold
Example #5
    def summary_csv(self, filename=None):
        """Summary of the contents of the fields

        """

        summary = []
        writer = None
        if filename is not None:
            writer = UnicodeWriter(filename).open_writer()
            writer.writerow(SUMMARY_HEADERS)
        else:
            summary.append(SUMMARY_HEADERS)

        for field_column in self.fields_columns:
            field_id = self.field_id(field_column)
            field = self.fields.get(field_id)
            field_summary = []
            field_summary.append(field.get('column_number'))
            field_summary.append(field_id)
            field_summary.append(field.get('name'))
            field_summary.append(field.get('label'))
            field_summary.append(field.get('description'))
            field_summary.append(field.get('optype'))
            field_summary_value = field.get('summary', {})

            if not field_summary_value:
                field_summary.append("")  # no preferred info
                field_summary.append("")  # no missing info
                field_summary.append("")  # no error info
                field_summary.append("")  # no content summary
                field_summary.append("")  # no error summary
            else:
                field_summary.append(json.dumps(field.get('preferred')))
                field_summary.append(field_summary_value.get("missing_count"))
                if self.field_errors and field_id in self.field_errors.keys():
                    errors = self.field_errors.get(field_id)
                    field_summary.append(errors.get("total"))
                else:
                    field_summary.append("0")
                if field['optype'] == 'numeric':
                    field_summary.append("[%s, %s], mean: %s" % \
                        (field_summary_value.get("minimum"),
                         field_summary_value.get("maximum"),
                         field_summary_value.get("mean")))
                elif field['optype'] == 'categorical':
                    categories = field_summary_value.get("categories")
                    field_summary.append( \
                        attribute_summary(categories, u"categories",
                                          limit=LIST_LIMIT))
                elif field['optype'] == "text":
                    terms = field_summary_value.get("tag_cloud")
                    field_summary.append( \
                        attribute_summary(terms, u"terms",
                                          limit=LIST_LIMIT))
                elif field['optype'] == "items":
                    items = field_summary_value.get("items")
                    field_summary.append( \
                        attribute_summary(items, u"items", limit=LIST_LIMIT))
                else:
                    field_summary.append("")
                if self.field_errors and field_id in self.field_errors.keys():
                    field_summary.append( \
                        attribute_summary(errors.get("sample"), u"errors",
                                          limit=None))
                else:
                    field_summary.append("")
            if writer:
                writer.writerow(field_summary)
            else:
                summary.append(field_summary)
        if writer is None:
            return summary
        else:
            writer.close_writer()
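
This method lives on the Fields class of the BigML Python bindings: given a filename it streams the summary to a CSV file through UnicodeWriter, and without one it returns the rows in memory. A usage sketch, assuming credentials are configured and using an illustrative source id:

from bigml.api import BigML
from bigml.fields import Fields

api = BigML()  # reads BIGML_USERNAME / BIGML_API_KEY from the environment
source = api.get_source("source/50c0de043b563519830001c0")  # illustrative id
fields = Fields(source)

fields.summary_csv("fields_summary.csv")  # writes the CSV, returns None
rows = fields.summary_csv()               # no filename: returns the rows as a list
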
Example #6
    def batch_predict(self,
                      input_data_list,
                      output_file_path=None,
                      by_name=True,
                      reuse=False,
                      missing_strategy=LAST_PREDICTION,
                      headers=None,
                      to_file=True,
                      use_median=False):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
           If it is set to False, the function instead returns a list
           of MultiVote objects with the models' predictions.
        """
        add_headers = (isinstance(input_data_list[0], list)
                       and headers is not None
                       and len(headers) == len(input_data_list[0]))
        if not add_headers and not isinstance(input_data_list[0], dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        order = 0
        if not to_file:
            votes = []

        for model in self.models:
            order += 1
            out = None
            if to_file:
                output_file = get_predictions_file_name(
                    model.resource_id, output_file_path)
                if reuse:
                    try:
                        predictions_file = open(output_file)
                        predictions_file.close()
                        continue
                    except IOError:
                        pass
                try:
                    out = UnicodeWriter(output_file)
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)

            if out:
                out.open_writer()
            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(input_data,
                                           by_name=by_name,
                                           with_confidence=True,
                                           missing_strategy=missing_strategy)
                if use_median and model.tree.regression:
                    # if median is to be used, we just place it as prediction
                    # starting the list
                    prediction[0] = prediction[-1]
                prediction = prediction[:-1]
                if to_file:
                    out.writerow(prediction)
                else:
                    # prediction is a row that contains prediction, confidence,
                    # distribution, instances
                    prediction_row = prediction[0:2]
                    prediction_row.append(order)
                    prediction_row.extend(prediction[2:])

                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)
            if out:
                out.close_writer()
        if not to_file:
            return votes
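
batch_predict belongs to the MultiModel class of the BigML Python bindings. A usage sketch, assuming two already-created models (the ids echo the illustrative one in the docstring, and the input fields are made up):

from bigml.api import BigML
from bigml.multimodel import MultiModel

api = BigML()
model_ids = ["model/50c0de043b563519830001c2",  # illustrative ids
             "model/50c0de043b563519830001c3"]
multi_model = MultiModel(model_ids, api=api)

input_data_list = [{"petal length": 4.2, "sepal width": 3.1}]
# to_file=False collects one MultiVote per input row instead of writing CSV files
votes = multi_model.batch_predict(input_data_list, to_file=False)
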