def topic_distribution(topic_models, fields, args, session_file=None):
    """Computes a topic distribution for each entry in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # headers to write and columns to exclude when input_data is added
        # to the prediction field
        exclude, headers = use_prediction_headers(test_reader, fields, args)

        # Topic distributions are computed locally using the topic models'
        # method
        u.log_message(u.dated("Creating local topic distributions.\n"),
                      log_file=session_file, console=args.verbosity)
        local_topic_distribution(topic_models, test_reader, writer, args,
                                 exclude=exclude, headers=headers)
    test_reader.close()
def prediction(models, fields, args, session_file=None):
    """Computes a supervised model prediction for each entry in the
    `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns to exclude when input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, writer, test_reader, fields, args,
            args.objective_field, quality="probability")

        # Predictions are computed locally
        u.log_message(u.dated("Creating local predictions.\n"),
                      log_file=session_file, console=args.verbosity)
        local_prediction(models, test_reader, writer, args, exclude=exclude)
    test_reader.close()
def anomaly_score(anomalies, fields, args, session_file=None):
    """Computes an anomaly score for each entry in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns to exclude when input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, writer, test_reader, fields, args)

        # Anomaly scores are computed locally using the local anomaly
        # detector's method
        u.log_message(u.dated("Creating local anomaly scores.\n"),
                      log_file=session_file, console=args.verbosity)
        local_anomaly_score(anomalies, test_reader, writer, args,
                            exclude=exclude)
    test_reader.close()
def centroid(clusters, fields, args, session_file=None):
    """Computes a centroid for each entry in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns to exclude when input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, writer, test_reader, fields, args)

        # Centroids are computed locally using the clusters'
        # centroid distances
        u.log_message(u.dated("Creating local centroids.\n"),
                      log_file=session_file, console=args.verbosity)
        local_centroid(clusters, test_reader, writer, args, exclude=exclude)
    test_reader.close()
def centroid(clusters, fields, args, session_file=None):
    """Computes a centroid for each entry in the `test_set`.

    """
    reader = TestReader(args.test_set, args.test_header, fields, None,
                        test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(args.prediction_header, output,
                                         reader, fields, args)

        # Local centroids: computed from the clusters' centroid distances
        message = u.dated("Creating local centroids.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_centroid(clusters, reader, output, args, exclude=exclude)
    reader.close()
def topic_distribution(topic_models, fields, args, session_file=None):
    """Computes a topic distribution for each entry in the `test_set`.

    """
    reader = TestReader(args.test_set, args.test_header, fields, None,
                        test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude, headers = use_prediction_headers(reader, fields, args)

        # Local topic distributions: computed locally using the topic
        # models' method
        message = u.dated("Creating local topic distributions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_topic_distribution(topic_models, reader, output, args,
                                 exclude=exclude, headers=headers)
    reader.close()
def lr_prediction(linear_regressions, fields, args, session_file=None):
    """Computes a linear regression prediction for each entry in the
    `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns to exclude when input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, writer, test_reader, fields, args,
            args.objective_field, quality="probability")

        # Predictions are computed locally
        u.log_message(u.dated("Creating local predictions.\n"),
                      log_file=session_file, console=args.verbosity)
        local_prediction(linear_regressions, test_reader, writer, args,
                         exclude=exclude)
    test_reader.close()
def projection(pca, fields, args, session_file=None):
    """Computes the projection for each entry in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.projections, lineterminator="\n") as writer:
        local_pca, kwargs = _local_pca(pca, args)
        # one output column per principal component of the local PCA
        pca_headers = ["PC%s" % (i + 1)
                       for i in range(len(local_pca.projection({})))]
        # columns to exclude when input_data is added to the projections field
        exclude = use_projection_headers(
            args.projection_header, writer, test_reader, fields, args,
            pca_headers)

        # Projections are computed locally
        u.log_message(u.dated("Creating local projections.\n"),
                      log_file=session_file, console=args.verbosity)
        local_projection(local_pca, kwargs, test_reader, writer, args,
                         exclude=exclude)
    test_reader.close()
def predict(models, fields, args, api=None, log=None,
            resume=False, session_file=None, labels=None, models_per_label=1,
            other_label=OTHER, multi_label_data=None):
    """Computes a prediction for each entry in the `test_set`.

    Predictions are computed locally using MultiModels on subgroups of
    models. Choosing a max_batch_models value not bigger than the
    number_of_models flag will lead to the last case, where memory usage is
    bounded and each model's predictions are saved for further use.
    """
    test_set = args.test_set
    test_set_header = args.test_header
    objective_field = args.objective_field
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             objective_field,
                             test_separator=args.test_separator)
    # the predictions file doubles as the output path reference
    prediction_file = output
    output_path = u.check_dir(output)
    with UnicodeWriter(output) as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(args.prediction_header, output,
                                         test_reader, fields, args,
                                         objective_field)

        # Remote predictions: predictions are computed in bigml.com and stored
        # in a file named after the model in the following syntax:
        #     model_[id of the model]__predictions.csv
        # For instance,
        #     model_50c0de043b563519830001c2_predictions.csv
        # Predictions are computed individually only if no_batch flag is set
        if args.remote and args.no_batch and not args.multi_label:
            if args.ensemble is not None:
                remote_predict_ensemble(args.ensemble, test_reader,
                                        prediction_file, api, args, resume,
                                        output_path, session_file, log,
                                        exclude)
            else:
                remote_predict_models(models, test_reader, prediction_file,
                                      api, args, resume, output_path,
                                      session_file, log, exclude)
            return
        # Local predictions: Predictions are computed locally using models'
        # rules with MultiModel's predict method
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        options = {}
        if args.method == THRESHOLD_CODE:
            options.update(threshold=args.threshold)
            if args.threshold_class is None:
                local_model = Model(models[0])
                # default class is the first class that appears in the dataset
                # objective field summary, which might be different from the
                # objective summary of each model because models are built
                # with sampling
                objective_field = local_model.objective_id
                distribution = local_model.tree.fields[objective_field][ \
                    "summary"]["categories"]
                args.threshold_class = distribution[0][0]
            options.update(category=args.threshold_class)
        # For a model we build a Model and for a small number of models,
        # we build a MultiModel using all of
        # the given models and issue a combined prediction
        if (len(models) <= args.max_batch_models \
                and args.fast and \
                not args.multi_label and args.max_categories == 0 \
                and args.method != COMBINATION):
            local_predict(models, test_reader, output, args, options, exclude)
        elif args.boosting:
            # boosted ensembles are handled by the same local path, but the
            # ensemble reference is passed instead of the models list
            local_predict(args.ensemble, test_reader, output, args, options,
                          exclude)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            # Local predictions: predictions are computed locally using
            # models' rules with MultiModel's predict method and combined
            # using aggregation if the objective field is a multi-labelled
            # field or one of the available combination methods: plurality,
            # confidence weighted and probability weighted
            if args.multi_label:
                method = AGGREGATION
            elif args.max_categories > 0:
                method = COMBINATION
            else:
                method = args.method

            # For multi-labelled models, the --models flag keeps the order
            # of the labels and the models but the --model-tag flag
            # retrieves the models with no order, so the correspondence with
            # each label must be restored.
            ordered = True
            if args.multi_label and (args.model_tag is not None
                                     or models_per_label > 1):
                ordered = False
            local_batch_predict(models, test_reader, prediction_file, api,
                                args, resume=resume,
                                output_path=output_path,
                                output=output, method=method,
                                options=options,
                                session_file=session_file, labels=labels,
                                ordered=ordered, exclude=exclude,
                                models_per_label=models_per_label,
                                other_label=other_label,
                                multi_label_data=multi_label_data)
    test_reader.close()
def predict(models, fields, args, api=None, log=None,
            resume=False, session_file=None, labels=None, models_per_label=1,
            other_label=OTHER, multi_label_data=None):
    """Computes a prediction for each entry in the `test_set`.

    Predictions are computed locally using MultiModels on subgroups of
    models. Choosing a max_batch_models value not bigger than the
    number_of_models flag will lead to the last case, where memory usage is
    bounded and each model's predictions are saved for further use.
    """
    test_set = args.test_set
    test_set_header = args.test_header
    objective_field = args.objective_field
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             objective_field,
                             test_separator=args.test_separator)
    # the predictions file doubles as the output path reference
    prediction_file = output
    output_path = u.check_dir(output)
    with UnicodeWriter(output) as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args,
            objective_field)

        # Remote predictions: predictions are computed in bigml.com and stored
        # in a file named after the model in the following syntax:
        #     model_[id of the model]__predictions.csv
        # For instance,
        #     model_50c0de043b563519830001c2_predictions.csv
        # Predictions are computed individually only if no_batch flag is set
        if (args.remote and args.no_batch and not args.multi_label
                and args.method != THRESHOLD_CODE):
            if args.ensemble is not None:
                remote_predict_ensemble(args.ensemble, test_reader,
                                        prediction_file, api, args, resume,
                                        output_path, session_file, log,
                                        exclude)
            else:
                remote_predict_models(models, test_reader, prediction_file,
                                      api, args, resume, output_path,
                                      session_file, log, exclude)
            return
        # Local predictions: Predictions are computed locally using models'
        # rules with MultiModel's predict method
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        options = {}
        if args.method == THRESHOLD_CODE:
            options.update(threshold=args.threshold)
            if args.threshold_class is None:
                # default class is the first category in the model's
                # objective field distribution
                local_model = Model(models[0])
                args.threshold_class = local_model.tree.distribution[0][0]
            options.update(category=args.threshold_class)
        # For a model we build a Model and for a small number of models,
        # we build a MultiModel using all of
        # the given models and issue a combined prediction
        if (len(models) <= args.max_batch_models and args.fast and
                not args.multi_label and args.max_categories == 0
                and args.method != COMBINATION):
            local_predict(models, test_reader, output, args, options, exclude)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            # Local predictions: predictions are computed locally using
            # models' rules with MultiModel's predict method and combined
            # using aggregation if the objective field is a multi-labelled
            # field or one of the available combination methods: plurality,
            # confidence weighted and probability weighted
            if args.multi_label:
                method = AGGREGATION
            elif args.max_categories > 0:
                method = COMBINATION
            else:
                method = args.method

            # For multi-labelled models, the --models flag keeps the order
            # of the labels and the models but the --model-tag flag
            # retrieves the models with no order, so the correspondence with
            # each label must be restored.
            ordered = True
            if args.multi_label and (args.model_tag is not None
                                     or models_per_label > 1):
                ordered = False
            local_batch_predict(models, test_reader, prediction_file, api,
                                args, resume=resume,
                                output_path=output_path,
                                output=output, method=method,
                                options=options,
                                session_file=session_file, labels=labels,
                                ordered=ordered, exclude=exclude,
                                models_per_label=models_per_label,
                                other_label=other_label,
                                multi_label_data=multi_label_data)
    test_reader.close()