def __init__(self, filename):
    """
    Reads metrics csv into an array which can be indexed using the
    header_to_col and disease_to_row dictionaries.
    """
    # Read metrics into dictionaries
    with open(filename, 'r') as metrics_file:
        metrics_reader = csv.DictReader(metrics_file)
        disease_to_metrics = {
            row["Disease ID"]: row for row in metrics_reader
        }
        self.header_to_col = {
            name: i for i, name in enumerate(metrics_reader.fieldnames[2:])
        }
    self.disease_to_row = {
        id: i for i, id in enumerate(disease_to_metrics.keys())
    }

    # Build metrics array
    self.metrics = np.zeros(
        (len(disease_to_metrics), len(self.header_to_col)))
    for disease, metrics in disease_to_metrics.items():
        for header, metric in metrics.items():
            if header in self.header_to_col:
                self.metrics[self.disease_to_row[disease],
                             self.header_to_col[header]] = metric
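# Usage sketch (the class name, CSV filename, and column labels below are
# hypothetical, not from the original source): after __init__ runs, a single
# metric value is looked up through the two index dictionaries.
# table = DiseaseMetrics("metrics.csv")
# value = table.metrics[table.disease_to_row["D001"],
#                       table.header_to_col["Recall"]]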
def log_run(split: str, epoch: int, writer: tensorboard.SummaryWriter,
            label_names: Sequence[str], metrics: MutableMapping[str, float],
            heaps: Optional[Mapping[str, Mapping[int, list[HeapItem]]]],
            cm: np.ndarray) -> None:
    """Logs the outputs (metrics, confusion matrix, tp/fp/fn images) from a
    single epoch run to Tensorboard.

    Args:
        metrics: dict, keys already prefixed with {split}/
    """
    per_label_recall = recall_from_confusion_matrix(cm, label_names)
    metrics.update(prefix_all_keys(per_label_recall, f'{split}/label_recall/'))

    # log metrics
    for metric, value in metrics.items():
        writer.add_scalar(metric, value, epoch)

    # log confusion matrix
    cm_fig = plot_utils.plot_confusion_matrix(cm, classes=label_names,
                                              normalize=True)
    cm_fig_img = fig_to_img(cm_fig)
    writer.add_image(tag=f'confusion_matrix/{split}', img_tensor=cm_fig_img,
                     global_step=epoch, dataformats='HWC')

    # log tp/fp/fn images
    if heaps is not None:
        for heap_type, heap_dict in heaps.items():
            log_images_with_confidence(writer, heap_dict, label_names,
                                       epoch=epoch, tag=f'{split}/{heap_type}')
    writer.flush()
def log_run(split: str, epoch: int, writer: tf.summary.SummaryWriter,
            label_names: Sequence[str], metrics: MutableMapping[str, float],
            heaps: Mapping[str, Mapping[int, List[HeapItem]]],
            cm: np.ndarray) -> None:
    """Logs the outputs (metrics, confusion matrix, tp/fp/fn images) from a
    single epoch run to Tensorboard.

    Args:
        metrics: dict, keys already prefixed with {split}/
    """
    per_class_recall = recall_from_confusion_matrix(cm, label_names)
    metrics.update(prefix_all_keys(per_class_recall, f'{split}/label_recall/'))

    # log metrics
    for metric, value in metrics.items():
        tf.summary.scalar(metric, value, epoch)

    # log confusion matrix
    cm_fig = plot_utils.plot_confusion_matrix(cm, classes=label_names,
                                              normalize=True)
    cm_fig_img = tf.convert_to_tensor(fig_to_img(cm_fig)[np.newaxis, ...])
    tf.summary.image(f'confusion_matrix/{split}', cm_fig_img, step=epoch)

    # log tp/fp/fn images
    for heap_type, heap_dict in heaps.items():
        log_images_with_confidence(heap_dict, label_names, epoch=epoch,
                                   tag=f'{split}/{heap_type}')
    writer.flush()
def test(args, model, loader, prefix='', verbose=True):
    print("train: Beginning test")
    loss_fn = loss_fns.DetectorLossFn().to(args.device)
    metrics = defaultdict(list)
    with torch.no_grad():
        for (img, det) in tqdm(loader):
            batch_size = img.shape[0]
            img = img.to(args.device).expand(batch_size, 3, *(img.shape[2:]))
            det = det.to(args.device)
            det_hat = model(img)
            loss, loss_dict = loss_fn(det_hat[0], det_hat[1], det)
            metrics['loss'].append(loss.item())
            for k, v in loss_dict.items():
                metrics[k].append(np.mean([v]))
            for k, v in get_metrics(det, det_hat).items():
                metrics[k].append(v)

    for k in replace_metric_by_mean:
        metrics[k] = np.mean(metrics[k])

    # Print!
    if verbose:
        start_string = '#### {} evaluation ####'.format(prefix)
        print(start_string)
        for k, v in metrics.items():
            # v is already a scalar for the keys averaged above, so print it
            # directly rather than indexing into it
            print('#### {} = {}'.format(k, v))
        print(''.join(['#' for _ in range(len(start_string))]))
    return metrics
def calc_ref_metrics(truer, quants, est_col, tru_col='cnt'):
    truth = pd.read_table(truer)
    truth.columns = ['id', 'cnt']
    estimated = pd.read_table(quants)
    estimated['id'] = estimated['Name'].str.split('|').str[0]
    est = estimated.groupby('id')[est_col].sum().reset_index()
    metrics = {
        "R^2": sklearn.metrics.r2_score,
        "Explained Var.": sklearn.metrics.explained_variance_score,
        "Mean Abs Error": sklearn.metrics.mean_absolute_error,
        "Mean Sq. Error": sklearn.metrics.mean_squared_error,
        "Mean Sq. Log. Error": sklearn.metrics.mean_squared_log_error,
        "Med Abs Error": sklearn.metrics.median_absolute_error,
        "Bray Curtis": scipy.spatial.distance.braycurtis,
        "Kendall Tau": scipy.stats.kendalltau,
        "Cosine Similarity": scipy.spatial.distance.cosine,
        # "Minkowski Distance": scipy.spatial.distance.minkowski,
        "Canberra Distance": scipy.spatial.distance.canberra,
    }
    metric_res = []
    merged = pd.merge(truth, est, on='id', how='outer').fillna(0)
    MARD = mard(merged, tru_col, est_col)
    pcc = merged[[tru_col, est_col]].corr(method='pearson')[tru_col][est_col]
    sp = merged[[tru_col, est_col]].corr(method='spearman')[tru_col][est_col]
    metric_dict = {'mard': MARD, 'pcc': pcc, 'sp': sp}
    for k, v in metrics.items():
        val = v(merged[tru_col], merged[est_col])
        if k == "Kendall Tau":
            metric_dict[k] = val.correlation
        else:
            metric_dict[k] = val
    df = pd.DataFrame(metric_dict, index=[0])
    merged.columns = ['refid', 'truth', 'pred', 'ard']
    merged['abs_err'] = abs(merged['truth'] - merged['pred'])
    return merged, df.transpose()
def main():
    """This method invokes the training functions for development purposes"""

    # Read data from a file
    print("Running train.py")

    # Hard code the parameters for training the model
    parameters = {
        'learning_rate': 0.02,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'sub_feature': 0.7,
        'num_leaves': 60,
        'min_data': 100,
        'min_hessian': 1,
        'verbose': 2
    }

    # Load the training data as dataframe
    data_dir = "data"
    data_file = os.path.join(
        data_dir, 'porto_seguro_safe_driver_prediction_input.csv')
    train_df = pd.read_csv(data_file)
    data = split_data(train_df)

    # Train the model
    model = train_model(data, parameters)

    # Log the metrics for the model
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        print(f"{k}: {v}")
def get_outcome(
    self, options: Dict[str, Union[None, List[str]]] = {}
) -> Dict[str, Union[float, type(np.nan)]]:
    opt = deepcopy(self.default_options)
    opt.update(options)

    y_true = self.input_images[0]  # gt
    y_pred = self.input_images[1]  # seg
    if str(y_true.dtype) != 'bool':
        y_true = 0 < y_true
    if str(y_pred.dtype) != 'bool':
        y_pred = 0 < y_pred

    metrics_ = self.new_ordered_dict()
    own_metrics = self._compute_metrics_own_implementation(
        y_true, y_pred, opt)
    metrics_.update({str(k): met for k, met in own_metrics.items()})

    sklearn_metrics = self._compute_metrics_sklearn(y_true, y_pred, opt)
    metrics_.update({
        # check for None before np.isnan: np.isnan(None) raises a TypeError
        str(k): -1 if (met is None or np.isnan(met)) else met
        for k, met in sklearn_metrics.items()
    })
    medpy_metrics = self._compute_metrics_medpy(y_true, y_pred, opt)
    metrics_.update({
        str(k): -1 if (met is None or np.isnan(met)) else met
        for k, met in medpy_metrics.items()
    })
    return metrics_
def mean(self, phase, epoch, item=None):
    mean_metrics = {}
    metrics = self.get_metrics(phase=phase, epoch=epoch, item=item)
    metrics = metrics[phase][epoch]
    for key, value in metrics.items():
        mean_metrics[key] = np.mean(np.array(value))
    return mean_metrics
def add_metrics_to_spreadsheet(spreadsheet, model_metrics):
    df_metrics = None
    if not isinstance(model_metrics, dict):
        model_metrics = {'': model_metrics}
    for model, metrics in model_metrics.items():
        model = '_%s' % model
        standard_metrics = {'metric': [], model: []}
        for metric, value in metrics.items():
            if 'curve' in metric:
                df = pd.DataFrame.from_dict(value, orient='columns')
                df.to_excel(spreadsheet,
                            '%s_curve%s' % (metric.split('_')[0], model),
                            index=False)
            elif 'score' in metric:
                standard_metrics['metric'].append(metric)
                standard_metrics[model].append(value)
            else:
                if not isinstance(value, pd.DataFrame):
                    value = pd.DataFrame(value)
                value.to_excel(
                    spreadsheet,
                    '%s%s' % (metric.replace('classification', 'class')
                              .replace('confusion', 'conf'), model),
                    index=False)
        if df_metrics is None:
            df_metrics = pd.DataFrame.from_dict(standard_metrics)
        else:
            df_metrics = pd.merge(df_metrics,
                                  pd.DataFrame.from_dict(standard_metrics),
                                  on='metric')
    df_metrics.to_excel(spreadsheet, 'metric_comparison', index=False)
def find_best_model_for_time(results):
    best_overall = 1000
    best_models = {}
    print("This is the result for time")
    for model, iters in results.items():
        best_rate_so_far = 1000
        best_models[model] = {}
        print(model)
        for parameters, metrics in iters.items():
            for metric, rate in metrics.items():
                if metric == "time":
                    # print(metric)
                    # print(rate)
                    if rate < best_rate_so_far:
                        best_rate_so_far = rate
                        best_model = model
                        best_parameter = parameters
                        best_models[model]['parameters'] = parameters
                        best_models[model]['metrics'] = metrics
                    # print("{} is the best rate so far for model {}, parameter {}".format(best_rate_so_far, best_model, best_parameter))
            to_append = [value for value in best_models[model]['metrics'].values()]
            to_append.append(best_models[model]['parameters'])
            if best_rate_so_far < best_overall:
                best_overall = best_rate_so_far
                overall_model = model
                overall_parameter = parameters
        print(best_parameter)
        print(best_rate_so_far)
    print(
        "And the award for overall winner goes to ... {} under {} at {}".format(
            overall_model, overall_parameter, best_overall))
    print("\n")
def new_run(self, description: str = None, copy_folder: bool = True,
            metrics: dict = None) -> Run:
    '''
    This will begin a new interactive run on the existing AzureML Experiment.
    When a previous run was still active, it will be completed.

    Args:
        description (str): An optional description that will be added to the run metadata
        copy_folder (bool): Indicates if the output folder should be snapshotted and persisted
        metrics (dict): The metrics that should be logged in the run already

    Returns:
        Run: the AzureML Run object that can be used for further access and custom logic
    '''
    if self.__current_run is not None:
        self.__current_run.complete()

    if copy_folder:
        self.__current_run = self.__experiment.start_logging()
    else:
        self.__current_run = self.__experiment.start_logging(snapshot_directory=None)

    if metrics is not None:
        for k, v in metrics.items():
            self.__current_run.log(k, v)

    if description is not None:
        self.__current_run.log('Description', description)

    return self.__current_run
def _my_create_evaluator(model, metrics={}, add_index=False, device=None,
                         non_blocking=False,
                         prepare_batch=ignite.engine._prepare_batch):
    if device:
        model.to(device)

    def _inference(engine, batch):
        model.eval()
        with torch.no_grad():
            x, y = prepare_batch(batch[:2], device=device,
                                 non_blocking=non_blocking)
            y_pred = model(x)
            # if add_index:
            index = batch[2]
            return {'prediction': y_pred, 'target': y, 'idx': index}
            # else:
            #     return {'prediction': y_pred, 'target': y}

    engine = Engine(_inference)
    for name, metric in metrics.items():
        metric.attach(engine, name)
    return engine
def compute_rejection_curves(probas, labels, groups, metrics):
    order = np.argsort(probas)
    labels = labels[order]
    groups = {g_name: group[order] for g_name, group in groups.items()}
    pred_labels = np.ones(len(labels))
    metrics_res = {metric: [metric_computation(labels, pred_labels, groups)]
                   for metric, metric_computation in metrics.items()}
    for i in range(len(probas)):
        pred_labels[i] = 0
        for metric, metric_computation in metrics.items():
            metrics_res[metric].append(
                metric_computation(labels, pred_labels, groups))
    # assert math.isclose(fairnesses[0], fairnesses[-1])
    return metrics_res
def write_to_tensorboard(
    metrics,
    global_step,
    logdir,
):
    """Writes metrics to tensorboard."""
    with tf.summary.FileWriter(logdir) as writer:
        for label, value in metrics.items():
            summary = tf.Summary(
                value=[tf.Summary.Value(tag=label, simple_value=value)])
            writer.add_summary(summary, global_step)
def log_evaluation_results(engine):
    model.eval()
    for name, imgs in example_images.items():
        imgs = make_example_images(autoencoder, imgs, device=device)
        summary_writer.add_image(name, imgs, engine.state.iteration)

    evaluator.run(loaders.valid)
    metrics = evaluator.state.metrics
    prefix = nvly.engine.get_log_prefix(engine)
    msgs = ', '.join(
        [f'{name}: {value:.3f}' for name, value in metrics.items()])
    print(f'{prefix} {msgs}')

    for name, value in metrics.items():
        summary_writer.add_scalar(f'metrics/{name}', value,
                                  engine.state.iteration)

    nonlocal best_score
    best_score = max(best_score, metrics['roc_auc'])
def evaluation_dict(y_test, y_test_pred, run_time):
    '''Creates a dictionary with 5 prediction evaluation metrics and a
    measure of how long it took the model to run.'''
    results = {}
    metrics = {'Accuracy': accuracy_score,
               'F1_Score': f1_score,
               'Precision': precision_score,
               'Recall': recall_score,
               'AUC': roc_auc_score}
    for label, fn in metrics.items():
        results[label] = round(fn(y_test, y_test_pred), 4)
    results['Train Time (s)'] = run_time
    return results
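# Usage sketch for evaluation_dict (labels below are illustrative placeholders,
# not from the original source; assumes the sklearn metric functions above are
# imported at module level).
# y_true = [0, 1, 1, 0, 1]
# y_pred = [0, 1, 0, 0, 1]
# results = evaluation_dict(y_true, y_pred, run_time=1.23)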
def get_p_values(metrics):
    """Get p_values."""
    p_values = {}
    result_list = map(dict, itertools.combinations(metrics.items(), 2))
    for ch in result_list:
        values = list(ch.values())
        ttest = stats.ttest_rel(a=values[0], b=values[1])
        p_values[str(tuple(ch.keys()))] = ttest.pvalue
    return p_values
def print_best_MRR_and_hits(self):
    """
    Print best results on validation set, and corresponding scores
    (with same hyper params) on test set
    """
    # tools.logger.info("Validation metrics:")
    metrics = self.valid_results.print_MRR_and_hits()
    tools.logger.info("Corresponding Test metrics:")
    for model_s, (best_rank, best_lambda, _, _, _, _, _) in metrics.items():
        self.results.print_MRR_and_hits_given_params(
            model_s, best_rank, best_lambda)
def test(args, model, loader, prefix='', verbose=True):
    """
    This function does a single pass through the testing set and evaluates
    the average dice_score and average loss.

    params: args are the run parameters, model is the model being tested,
        loader is the pytorch test loader, prefix is a name string,
        verbose flag is for printing metrics
    return: dictionary of metric values
    """
    print("train: Beginning test")
    metrics = defaultdict(list)
    t = tqdm(loader)
    with torch.no_grad():
        for (img, seg) in t:
            img = img.to(args.device)
            seg = seg_utils.map_segmentation(seg, args.num_classes).long()
            seg_hot = seg_utils.one_hot_encode(seg, args.num_classes).to(args.device)
            seg = seg.to(args.device)
            seg_hat = model(img)
            if args.loss_func == 'dice':
                dice_loss = loss_fns.DiceLoss()
                loss = dice_loss(seg_hat, seg_hot)
            elif args.loss_func == 'crossentropy':
                log_loss = nn.CrossEntropyLoss()
                loss = log_loss(seg_hat, seg)
            t.set_postfix_str(s='loss: %f' % loss.item())
            try:
                metrics['loss'].append(loss.item())
            except ValueError:
                print(metrics)
            for k, v in get_metrics(seg_hot, seg_hat).items():
                metrics[k].append(v)

    for k in replace_metric_by_mean:
        metrics[k] = np.mean(metrics[k])

    # Print!
    if verbose:
        start_string = '#### {} evaluation ####'.format(prefix)
        print(start_string)
        for k, v in metrics.items():
            print('#### {} = {}'.format(k, v))
        print(''.join(['#' for _ in range(len(start_string))]))
    return metrics
def get_metrics(target, pred, errors, name, settings):
    import sklearn.metrics
    import scipy.stats

    metrics = {}
    if settings.get("classification"):
        metrics["roc_auc"] = score = sklearn.metrics.roc_auc_score(
            target.reshape(-1, 1), pred.reshape(-1, 1)
        )
        metrics["ap_score"] = ap_score = sklearn.metrics.average_precision_score(
            target.reshape(-1, 1), pred.reshape(-1, 1), average="micro"
        )
        print(f"ROC-AUC: {score:3.3f}, AP: {ap_score:3.3f}")
    else:
        mae = metrics["mae"] = sklearn.metrics.mean_absolute_error(target, pred)
        try:
            mape = metrics["mape"] = sklearn.metrics.mean_absolute_percentage_error(
                target, pred
            )
        except AttributeError:
            mape = metrics["mape"] = 1e20
        mse = metrics["mse"] = sklearn.metrics.mean_squared_error(target, pred)
        med_ae = metrics["med_ae"] = sklearn.metrics.median_absolute_error(target, pred)
        max_ae = metrics["max_ae"] = sklearn.metrics.max_error(target, pred)
        fit_results = scipy.stats.linregress(
            x=target.reshape(-1),
            y=pred.reshape(-1),
        )
        slope = metrics["slope"] = fit_results.slope
        rvalue = metrics["rvalue"] = fit_results.rvalue
        print(
            f"MAE = {mae:3.3f} {settings.get('units', '')}, MedianAE = {med_ae:3.3f} {settings.get('units', '')}, MAPE = {mape:3.3f}, √MSE = {np.sqrt(mse):3.3f} {settings.get('units', '')}"  # noqa
        )
        print(
            f"MaxAE = {max_ae:3.3f} {settings.get('units', '')}, slope = {slope:3.2f}, R = {rvalue:3.2f}"
        )

    for k, v in metrics.items():
        metrics[k] = float(v)
    with open(f"results/{name}_metrics.json", "w") as f:
        json.dump(metrics, f)
    return metrics
def get_cross_validation_metrics(estimator, X, y, metrics):
    metric_scorers = {}
    for metric, func in metrics.items():
        metric_scorers[metric] = make_scorer(func)

    results = {}
    cv = StratifiedKFold(n_splits=10)
    metric_values = cross_validate(estimator, X, y, scoring=metric_scorers, cv=cv)
    for metric in metrics:
        results[metric] = metric_values["test_" + metric].mean()
    return results
def calc_taxlevel_metrics(truer, est):
    true = pd.read_table(truer)
    estimated = pd.read_table(est)
    metrics = {
        "R^2": sklearn.metrics.r2_score,
        "Explained Var.": sklearn.metrics.explained_variance_score,
        "Mean Abs Error": sklearn.metrics.mean_absolute_error,
        "Mean Sq. Error": sklearn.metrics.mean_squared_error,
        "Mean Sq. Log. Error": sklearn.metrics.mean_squared_log_error,
        "Med Abs Error": sklearn.metrics.median_absolute_error,
        "Bray Curtis": scipy.spatial.distance.braycurtis,
        "Kendall Tau": scipy.stats.kendalltau,
        "Cosine Similarity": scipy.spatial.distance.cosine,
        # "Minkowski Distance": scipy.spatial.distance.minkowski,
        "Canberra Distance": scipy.spatial.distance.canberra,
    }
    metric_res = {}
    for r in ["Phylum", "Genus", "Species", "Scietific_Name", "TaxID"]:
        metric_res[r] = {}
        tr_taxid_counts = true.groupby(r)['NumCounts'].sum().reset_index()
        es_taxid_counts = estimated.groupby(r)['NumCounts'].sum().reset_index()
        merged = pd.merge(tr_taxid_counts, es_taxid_counts, on=r,
                          how='outer').fillna(0)
        MARD = mard(merged, 'NumCounts_x', 'NumCounts_y')
        pcc = merged[['NumCounts_x', 'NumCounts_y'
                      ]].corr(method='pearson')['NumCounts_x']['NumCounts_y']
        sp = merged[['NumCounts_x', 'NumCounts_y'
                     ]].corr(method='spearman')['NumCounts_x']['NumCounts_y']
        metric_res[r]['mard'] = MARD
        metric_res[r]['pcc'] = pcc
        metric_res[r]['sp'] = sp
        # metric_res[r] += [MARD, pcc, sp]
        for k, v in metrics.items():
            val = v(merged['NumCounts_x'], merged['NumCounts_y'])
            if k == "Kendall Tau":
                metric_res[r][k] = val.correlation
            else:
                metric_res[r][k] = val
    df = pd.DataFrame(metric_res)
    # df.columns = ['mard', 'pcc', 'sp', 'r2', 'ex_var', 'mae', 'mse', 'msle', 'medae']
    merged.columns = ['taxid', 'truth', 'pred', 'ard']
    merged['abs_err'] = abs(merged['truth'] - merged['pred'])
    return merged, df[[
        'Phylum', 'Genus', 'Species', 'Scietific_Name', 'TaxID'
    ]]
def test(args, model, loader, prefix='', verbose=True):
    print("train: Beginning test")
    metrics = defaultdict(list)
    t = tqdm(loader)
    with torch.no_grad():
        for (img, seg) in t:
            img = img.to(args.device)
            seg = map_segmentation(seg, args.num_classes).long()
            seg_hot = seg_utils.one_hot_encode(seg, args.num_classes).to(args.device)
            seg = seg.to(args.device)
            seg_hat = model(img)
            if args.loss_func == 'dice':
                dice_loss = loss_fns.DiceLoss()
                loss = dice_loss(seg_hat, seg_hot)
            elif args.loss_func == 'crossentropy':
                log_loss = nn.CrossEntropyLoss()
                loss = log_loss(seg_hat, seg)
            t.set_postfix_str(s='loss: %f' % loss.item())
            try:
                metrics['loss'].append(loss.item())
            except ValueError:
                print(metrics)
            for k, v in get_metrics(seg_hot, seg_hat).items():
                metrics[k].append(v)

    for k in replace_metric_by_mean:
        metrics[k] = np.mean(metrics[k])

    # Print!
    if verbose:
        start_string = '#### {} evaluation ####'.format(prefix)
        print(start_string)
        for k, v in metrics.items():
            print('#### {} = {}'.format(k, v))
        print(''.join(['#' for _ in range(len(start_string))]))
    return metrics
def run_epoch(self, epoch, sents, labels, dev_sents, dev_labels):
    """
    Performs one complete pass over the train set and evaluates on dev

    Args:
        epoch:
        sents: dataset sentences that yields tuple of sentences, tags
        labels: dataset label by sentences that yields tuple of sentences, tags
        dev_sents: data for evaluation
        dev_labels:

    Return:
        f1: (python float), score to select model on, higher is better
    """
    # progbar stuff for logging
    batch_size = self.config.batch_size
    nbatches = (len(sents) + batch_size - 1) // batch_size
    prog = Progbar(target=nbatches)

    # iterate over dataset
    train_loss = 0
    curren_state = np.zeros(
        (2, self.config.batch_size, self.config.hidden_size_lstm))  # initial state cell, hidden layer.
    for i, (words, labels_) in enumerate(minibatches(sents, labels, batch_size)):
        fd, _ = self.get_feed_dict(words, labels_, self.config.lr,
                                   self.config.dropout)
        _, train_loss = self.sess.run(
            [self.train_op, self.loss], feed_dict=fd)
        # get final state of previous batch.
        if (i % 20 == 0) or i == (len(sents) // batch_size - 1):
            prog.update(i + 1, [("train loss", train_loss)])

        # tensorboard
        # if i % 10 == 0:
        #     self.file_writer.add_summary(summary, epoch*nbatches + i)

    print('\tlearning rate: {:.5f}'.format(self.config.lr))

    metrics = self.run_evaluate(dev_sents, dev_labels)
    msg = " - ".join(["{} {:04.2f}".format(k, v)
                      for k, v in metrics.items()])
    self.logger.info(msg)

    return train_loss, metrics["f1"]
def calculateRank(metrics, result):
    """
    Calculate and get the optimal library for each dataset based on user-set weights.

    Input:
        metrics: output from callLibrary (performance metrics for each library)
        result: user-defined weights
    Output:
        ranks: sorted order of libraries based on weighted sum
    """
    weighted_score = {}
    for key, value in metrics.items():
        weighted_score[key] = result["precision"] * value["precision"] + \
            result["accuracy"] * value["accuracy"] + \
            result["specificity"] * value["specificity"] + \
            result["f1score"] * value["f1score"] + \
            result["sensitivity"] * value["sensitivity"]
    ranks = dict(sorted(weighted_score.items(), key=lambda x: x[1]))
    return ranks
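# Usage sketch for calculateRank (library names, scores, and weights below are
# hypothetical placeholders, not from the original source):
# metrics = {'libA': {'precision': 0.91, 'accuracy': 0.88, 'specificity': 0.86,
#                     'f1score': 0.90, 'sensitivity': 0.89},
#            'libB': {'precision': 0.84, 'accuracy': 0.90, 'specificity': 0.88,
#                     'f1score': 0.86, 'sensitivity': 0.83}}
# weights = {'precision': 0.3, 'accuracy': 0.2, 'specificity': 0.1,
#            'f1score': 0.2, 'sensitivity': 0.2}
# ranks = calculateRank(metrics, weights)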
def buildModel(df, keyFeatures, target):
    fittingFrame = df.copy()
    keyFeatures.append(target)
    fittingFrame["Name"] = pd.Categorical(fittingFrame["Name"])
    fittingFrame = fittingFrame[keyFeatures]
    categoryColumns = fittingFrame.select_dtypes(include=['category']).columns
    for col in categoryColumns:
        currentCategorical = pd.get_dummies(fittingFrame[col])
        fittingFrame = pd.concat([fittingFrame, currentCategorical], axis=1)
        fittingFrame = fittingFrame.drop([col], axis=1)
    # print(df.info())

    rf = RandomForestRegressor(n_estimators=1800, max_features='auto',
                               min_samples_split=4, min_samples_leaf=1,
                               max_depth=60)

    metrics = cross_validation_metrics(fittingFrame, target, 10)
    metric_frame = pd.DataFrame(list(metrics.items()),
                                columns=['Metric Name', 'Metric Value'])

    X = fittingFrame.drop([target], axis=1)
    connection = {
        'host': config.host,
        'dbname': config.dbname,
        'username': config.username,
        'password': config.password,
        'port': config.port
    }
    saveMetrics(metric_frame, connection)
    rf.fit(X, fittingFrame[target])

    # returns = {
    #     'model': rf,
    #     'metrics': metrics
    # }
    return rf
def eva_metrics(self):
    '''
    Given a classifier, evaluate by various metrics

    Input:
        y_true: a Pandas dataframe of actual label value
        y_pred: a Pandas dataframe of predicted label value
        y_pred_probs: a Pandas dataframe of probability estimates

    Output:
        rv: a dictionary where key is the metric and value is the score
    '''
    rv = {}
    metrics = {
        'accuracy': accuracy_score,
        'f1_score': f1_score,
        'precision': precision_score,
        'recall': recall_score,
        'auc': roc_auc_score
    }
    for metric, fn in metrics.items():
        rv[metric] = fn(self.y_true, self.y_pred)

    y_pred_probs_sorted, y_true_sorted = zip(
        *sorted(zip(self.y_pred_probs, self.y_true), reverse=True))
    levels = [1, 3, 5]
    for k in levels:
        rv['p_at_' + str(k) + '%'] = precision_at_k(
            y_true_sorted, y_pred_probs_sorted, k)
        rv['r_at_' + str(k) + '%'] = recall_at_k(y_true_sorted,
                                                 y_pred_probs_sorted, k)
    rv['p_at_200'] = precision_at_k(y_true_sorted, y_pred_probs_sorted,
                                    200, False)
    rv['r_at_200'] = recall_at_k(y_true_sorted, y_pred_probs_sorted,
                                 200, False)
    return rv
def create_supervised_evaluator(model, metrics={}, device=None, forward_fn=None):
    def _inference(engine, batch):
        # now compute error
        model.eval()
        with torch.no_grad():
            x, y = utils_data.nestedDictToDevice(
                batch, device=device)  # make it work for dict input too
            if forward_fn is None:
                y_pred = model(x)
            else:
                y_pred = forward_fn(x)
            return y_pred, y

    engine = Engine(_inference)
    for name, metric in metrics.items():
        metric.attach(engine, name)
    return engine
def print_best_MRR_and_hits_per_rel(self):
    """
    Print best results on validation set, and corresponding scores
    (with same hyper params) on test set
    """
    # tools.logger.info("Validation metrics:")
    metrics = self.valid_results.print_MRR_and_hits()
    tools.logger.info("Corresponding per relation Test metrics:")
    with open("/home/ksrao/Saikat/complex/relation_test.txt", 'w') as f:
        for rel_name, rel_idx in self.relations_dict.items():
            tools.logger.info(rel_name)
            this_rel_row_idxs = self.test.indexes[:, 1] == rel_idx
            this_rel_test_indexes = self.test.indexes[this_rel_row_idxs, :]
            this_rel_test_values = self.test.values[this_rel_row_idxs]
            this_rel_set = tools.Triplets_set(this_rel_test_indexes,
                                              this_rel_test_values)
            f.write("%s\n" % rel_name)
            for model_s, (best_rank, best_lambda, _, _, _, _, _) in metrics.items():
                rel_cv_results = self.results.extract_sub_scores(
                    this_rel_row_idxs)
                rel_cv_results.print_MRR_and_hits_given_params(
                    model_s, best_rank, best_lambda)
def inference(
    args,
    dlrm,
    best_acc_test,
    best_auc_test,
    test_ld,
):
    test_accu = 0
    test_samp = 0

    if args.print_auc:
        scores = []
        targets = []

    total_time = 0
    total_iter = 0
    if args.inference_only:
        dlrm = trace_model(args, dlrm, test_ld)
        if args.share_weight_instance != 0:
            run_throughput_benchmark(args, dlrm, test_ld)
    with torch.cpu.amp.autocast(enabled=args.bf16):
        for i, testBatch in enumerate(test_ld):
            should_print = ((i + 1) % args.print_freq == 0
                            or i + 1 == len(test_ld)) and args.inference_only
            if should_print:
                gT = 1000.0 * total_time / total_iter
                print(
                    "Finished {} it {}/{}, {:.2f} ms/it,".format(
                        "inference", i + 1, len(test_ld), gT),
                    flush=True,
                )
                total_time = 0
                total_iter = 0

            # early exit if nbatches was set by the user and was exceeded
            if args.inference_only and nbatches > 0 and i >= nbatches:
                break

            X_test, lS_o_test, lS_i_test, T_test, W_test, CBPP_test = unpack_batch(
                testBatch)

            # forward pass
            if not args.inference_only and isinstance(
                    dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD):
                n_tables = lS_i_test.shape[0]
                idx = [lS_i_test[i] for i in range(n_tables)]
                offset = [lS_o_test[i] for i in range(n_tables)]
                include_last = [False for i in range(n_tables)]
                indices, offsets, indices_with_row_offsets = dlrm.emb_l.linearize_indices_and_offsets(
                    idx, offset, include_last)
            start = time_wrap()
            if not args.inference_only and isinstance(
                    dlrm.emb_l, ipex.nn.modules.MergedEmbeddingBagWithSGD):
                Z_test = dlrm(X_test, indices, offsets, indices_with_row_offsets)
            else:
                Z_test = dlrm(X_test, lS_o_test, lS_i_test)
            total_time += (time_wrap() - start)
            total_iter += 1

            if args.print_auc:
                S_test = Z_test.detach().cpu().float().numpy()  # numpy array
                T_test = T_test.detach().cpu().float().numpy()  # numpy array
                scores.append(S_test)
                targets.append(T_test)
            elif not args.inference_only:
                with record_function("DLRM accuracy compute"):
                    # compute loss and accuracy
                    S_test = Z_test.detach().cpu().float().numpy()  # numpy array
                    T_test = T_test.detach().cpu().float().numpy()  # numpy array
                    mbs_test = T_test.shape[0]  # = mini_batch_size except last
                    A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8))
                    test_accu += A_test
                    test_samp += mbs_test
            else:
                # do nothing to save time
                pass

    if args.print_auc:
        with record_function("DLRM mlperf sklearn metrics compute"):
            scores = np.concatenate(scores, axis=0)
            targets = np.concatenate(targets, axis=0)
            metrics = {
                "recall": lambda y_true, y_score: sklearn.metrics.recall_score(
                    y_true=y_true, y_pred=np.round(y_score)),
                "precision": lambda y_true, y_score: sklearn.metrics.precision_score(
                    y_true=y_true, y_pred=np.round(y_score)),
                "f1": lambda y_true, y_score: sklearn.metrics.f1_score(
                    y_true=y_true, y_pred=np.round(y_score)),
                "ap": sklearn.metrics.average_precision_score,
                "roc_auc": sklearn.metrics.roc_auc_score,
                "accuracy": lambda y_true, y_score: sklearn.metrics.accuracy_score(
                    y_true=y_true, y_pred=np.round(y_score)),
            }

            validation_results = {}
            for metric_name, metric_function in metrics.items():
                validation_results[metric_name] = metric_function(targets, scores)
            acc_test = validation_results["accuracy"]
    elif not args.inference_only:
        acc_test = test_accu / test_samp
    else:
        pass

    model_metrics_dict = {
        "nepochs": args.nepochs,
        "nbatches": nbatches,
        "nbatches_test": nbatches_test,
    }
    if not args.inference_only:
        model_metrics_dict["test_acc"] = acc_test

    if args.print_auc:
        is_best = validation_results["roc_auc"] > best_auc_test
        if is_best:
            best_auc_test = validation_results["roc_auc"]
            model_metrics_dict["test_auc"] = best_auc_test
        print(
            "recall {:.4f}, precision {:.4f},".format(
                validation_results["recall"],
                validation_results["precision"],
            )
            + " f1 {:.4f}, ap {:.4f},".format(validation_results["f1"],
                                              validation_results["ap"])
            + " auc {:.4f}, best auc {:.4f},".format(
                validation_results["roc_auc"], best_auc_test)
            + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format(
                validation_results["accuracy"] * 100, best_acc_test * 100),
            flush=True,
        )
        print("Accuracy: {:.34} ".format(validation_results["roc_auc"]))
    elif not args.inference_only:
        is_best = acc_test > best_acc_test
        if is_best:
            best_acc_test = acc_test
        print(
            " accuracy {:3.3f} %, best {:3.3f} %".format(
                acc_test * 100, best_acc_test * 100),
            flush=True,
        )
    else:
        pass

    if not args.inference_only:
        return model_metrics_dict, is_best
    else:
        return
    'ap': sklearn.metrics.average_precision_score,
    'roc_auc': sklearn.metrics.roc_auc_score,
    'accuracy': lambda y_true, y_score: sklearn.metrics.accuracy_score(
        y_true=y_true, y_pred=np.round(y_score)),
    # 'pre_curve': sklearn.metrics.precision_recall_curve,
    # 'roc_curve': sklearn.metrics.roc_curve,
}

# print("Compute time for validation metric : ", end="")
# first_it = True
validation_results = {}
for metric_name, metric_function in metrics.items():
    # if first_it:
    #     first_it = False
    # else:
    #     print(", ", end="")
    # metric_compute_start = time_wrap(False)
    validation_results[metric_name] = metric_function(
        targets, scores)
    # metric_compute_end = time_wrap(False)
    # met_time = metric_compute_end - metric_compute_start
    # print("{} {:.4f}".format(metric_name, 1000 * (met_time)),
    #       end="")
# print(" ms")

gA_test = validation_results['accuracy']
gL_test = validation_results['loss']
else: