def _log_tensors(self, tensor_values): """Update timer, log tensors, send to MLFlow and Graphite""" self._timer.update_last_triggered_step(self._iter_count) global_step = tensor_values["global_step"] if self.skip_after_step is not None and global_step >= self.skip_after_step: return # Log tensor and function values ord_tensor_values = [(tag, tensor_values[tag]) for tag in self._tag_order] ord_function_values = [ (tag, self.functions[tag]()) for tag in self._fn_order ] if self.functions else [] LOGGER.info(", ".join( self.formatter(tag, value) for tag, value in ord_tensor_values + ord_function_values)) # Send to MLFlow and Graphite for tag, value in ord_tensor_values + ord_function_values: if self.use_graphite: graphite.log_metric(tag, value, postfix=self.name) if self.use_mlflow: tag = tag if self.name is None else f"{self.name}_{tag}" mlflow.log_metric(sanitize_metric_name(tag), value, step=global_step)
def run(self):
    """Evaluate retrieval metrics with an exact inner-product search.

    Reads user vectors, item embeddings and item biases from parquet, builds a
    faiss ``IndexFlatIP`` over ``[embedding, bias]`` rows, retrieves the top
    ``self.num_queries`` items per user, then logs precision / recall / f1 / NDCG
    at each cutoff in ``self.k`` (optionally recording them to MLFlow).
    """

    def _read_parquet(path):
        # Shared read pattern for all three inputs (was copy-pasted three times).
        with dpr.io.ParquetDataset(path).open() as ds:
            return ds.read_pandas().to_pandas()

    predictions = _read_parquet(self.path_predictions)
    users = np.stack(predictions["user"])
    # Append a constant 1 to each user so the bias column of the item matrix acts
    # as an additive bias under inner product: [u, 1] . [e, b] = u.e + b
    ones = np.ones([users.shape[0], 1], np.float32)
    users_with_ones = np.concatenate([users, ones], axis=-1)

    embeddings = _read_parquet(self.path_embeddings).to_numpy()
    biases = _read_parquet(self.path_biases).to_numpy()
    embeddings_with_biases = np.concatenate([embeddings, biases], axis=-1)

    # Exact (non-approximate) inner-product search over all items.
    index = faiss.IndexFlatIP(embeddings_with_biases.shape[-1])
    index.add(np.ascontiguousarray(embeddings_with_biases))
    _, indices = index.search(users_with_ones, k=self.num_queries)

    # self.k may be a single cutoff or an iterable of cutoffs.
    k_values = [self.k] if isinstance(self.k, int) else self.k
    for k in k_values:
        precision, recall, f1, ndcg = compute_metrics(predictions["input"], predictions["target"], indices, k=k)
        LOGGER.info(
            f"precision@{k} = {precision}\n"
            f"recall@{k} = {recall}\n"
            f"f1@{k} = {f1}\n"
            f"NDCG@{k} = {ndcg}"
        )
        if self.use_mlflow:
            mlflow.log_metric(key=f"precision_at_{k}", value=precision)
            mlflow.log_metric(key=f"recall_at_{k}", value=recall)
            mlflow.log_metric(key=f"f1_at_{k}", value=f1)
            mlflow.log_metric(key=f"ndcg_at_{k}", value=ndcg)
def export(self, estimator: tf.estimator.Estimator):
    """Point the estimator at the checkpoint closest to the best evaluation step.

    Selects the best step according to ``self.metric`` / ``self.mode``, finds the
    on-disk checkpoint nearest that step, rewrites the ``checkpoint`` state file
    to use it, and verifies the estimator now restores from the selected step.

    Raises:
        ValueError: if ``self.mode`` is unknown, or if rewriting the checkpoint
            file did not take effect.
    """
    # Reload summaries and select best step
    LOGGER.info(f"Reloading summaries from {estimator.model_dir}")
    summaries = read_eval_metrics(estimator.eval_dir()).items()
    for step, metrics in sorted(summaries):
        LOGGER.info(f"- {step}: {metrics}")
    sorted_summaries = sorted(summaries, key=lambda t: t[1][self.metric])
    if self.mode == BestMode.INCREASE:
        best_step, best_metrics = sorted_summaries[-1]
    elif self.mode == BestMode.DECREASE:
        best_step, best_metrics = sorted_summaries[0]
    else:
        raise ValueError(f"Mode {self.mode} not recognized.")
    LOGGER.info(f"Best summary at step {best_step}: {best_metrics}")

    # List available checkpoints and select the one closest to best_step
    # (evaluation steps need not coincide exactly with saved checkpoints).
    checkpoints = Path(estimator.model_dir).glob(_CHEKPOINT_PATTERN)
    # Escape the dot so e.g. "model.ckpt-123.index" is matched strictly.
    checkpoint_steps = [int(re.findall(r"-(\d+)\.index", str(path))[0]) for path in checkpoints]
    selected_step = min(checkpoint_steps, key=lambda step: abs(step - best_step))
    LOGGER.info(f"Selected checkpoint {selected_step}")

    # Rewrite the first line of the checkpoint state file to point at the
    # selected checkpoint; remaining lines are preserved.
    with Path(estimator.model_dir, "checkpoint").open("r") as file:
        lines = file.read().split("\n")
    lines[0] = f'model_checkpoint_path: "model.ckpt-{selected_step}"'
    with Path(estimator.model_dir, "checkpoint").open("w") as file:
        file.write("\n".join(lines))

    # Check that the change is effective
    global_step = estimator.get_variable_value("global_step")
    if global_step != selected_step:
        msg = f"Changed checkpoint file to use step {selected_step}, but estimator uses {global_step}"
        raise ValueError(msg)

    # Log to MLFlow
    if self.use_mlflow:
        mlflow.log_metric(key=self.tag, value=global_step)
def _log_and_record(self, elapsed_steps, elapsed_time, global_step): """Log Steps per second and write summary""" if self.skip_after_step is not None and global_step >= self.skip_after_step: return # Compute steps and number of examples per second metrics = { "steps_per_sec": elapsed_steps / elapsed_time, "examples_per_sec": self.batch_size * elapsed_steps / elapsed_time, } # Log tensor values LOGGER.info(", ".join(f"{tag} = {value:.2f}" for tag, value in metrics.items())) # Send to MLFlow and Graphite for tag, value in metrics.items(): if self.use_graphite: graphite.log_metric(tag, value, postfix=self.name) if self.use_mlflow: tag = tag if self.name is None else f"{self.name}_{tag}" mlflow.log_metric(tag, value, step=global_step)
def run(self):
    """Log the configured key/value pair, optionally recording it as an MLFlow metric."""
    key, value = self.key, self.value
    LOGGER.info(f"{key}: {value}")
    if self.use_mlflow:
        mlflow.log_metric(key=key, value=value)