def prepare_environment(params: Params):
    """
    Sets random seeds for reproducible experiments.

    Seeds Python's `random`, numpy, and PyTorch (including all visible GPUs)
    from the `random_seed`, `numpy_seed`, and `pytorch_seed` keys of `params`,
    falling back to fixed defaults. Passing an explicit `None` for a key skips
    seeding that library. Note that if PyTorch was already imported before this
    call, reproducibility is best-effort only; full determinism is generally
    unattainable with massively parallel (especially GPU) linear algebra.

    Parameters
    ----------
    params: Params object or dict, required.
        A ``Params`` object or dict holding the json parameters.
    """
    py_seed = params.pop_int("random_seed", 13370)
    np_seed = params.pop_int("numpy_seed", 1337)
    pt_seed = params.pop_int("pytorch_seed", 133)

    if py_seed is not None:
        random.seed(py_seed)
    if np_seed is not None:
        numpy.random.seed(np_seed)
    if pt_seed is not None:
        torch.manual_seed(pt_seed)
        # Give every available GPU the same seed as the CPU generator.
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(pt_seed)

    log_pytorch_version_info()
def setUp(self):
    """Configure logging, create the scratch test directory, and patch archive cleanup."""
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.DEBUG
    )
    # Disabling some of the more verbose logging statements that typically aren't very helpful
    # in tests.
    for noisy in (
        "allennlp.common.params",
        "allennlp.nn.initializers",
        "urllib3.connectionpool",
    ):
        logging.getLogger(noisy).disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(logging.INFO)
    log_pytorch_version_info()

    self.TEST_DIR = pathlib.Path(TEST_DIR)
    os.makedirs(self.TEST_DIR, exist_ok=True)

    # Due to a bug in pytest we'll end up with a bunch of logging errors if we try to
    # log anything within an 'atexit' hook.
    # When https://github.com/pytest-dev/pytest/issues/5502 is fixed we should
    # be able to remove this work-around.
    def _cleanup_archive_dir_without_logging(path: str):
        # Same as the real cleanup, minus any logging calls.
        if os.path.exists(path):
            shutil.rmtree(path)

    self.patcher = mock.patch(
        "allennlp.models.archival._cleanup_archive_dir", _cleanup_archive_dir_without_logging
    )
    self.mock_cleanup_archive_dir = self.patcher.start()
def setUp(self):
    """Set up DEBUG logging, quiet noisy loggers, and create the test directory."""
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        level=logging.DEBUG)
    # Disabling some of the more verbose logging statements that typically aren't very helpful
    # in tests.
    for noisy in ('allennlp.common.params', 'allennlp.nn.initializers'):
        logging.getLogger(noisy).disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)
    log_pytorch_version_info()
    # NOTE(review): self.TEST_DIR is not assigned here — presumably a class
    # attribute defined elsewhere; verify against the enclosing class.
    os.makedirs(self.TEST_DIR, exist_ok=True)
def setUp(self):
    """Set up DEBUG logging, quiet noisy loggers, and create the scratch directory."""
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        level=logging.DEBUG)
    # Disabling some of the more verbose logging statements that typically aren't very helpful
    # in tests.
    for noisy in ('allennlp.common.params', 'allennlp.nn.initializers'):
        logging.getLogger(noisy).disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)
    log_pytorch_version_info()
    # NOTE(review): fixed world-readable /tmp path — fine for CI, but collides
    # across users/parallel runs; consider tempfile.mkdtemp() if that matters.
    self.TEST_DIR = "/tmp/allennlp_tests/"
    os.makedirs(self.TEST_DIR, exist_ok=True)
def setUp(self):
    """Configure logging and create the test + CoNLL fixture directory trees."""
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        level=logging.DEBUG)
    log_pytorch_version_info()
    # NOTE(review): self.TEST_DIR / self.CONLL_*_DIR come from outside this
    # method (class attributes, presumably) — confirm in the enclosing class.
    os.makedirs(self.TEST_DIR, exist_ok=True)
    conll_subpath = "english/annotations/test_topic/test_source/01/"
    for conll_root in (self.CONLL_TRAIN_DIR, self.CONLL_VAL_DIR):
        os.makedirs(conll_root + conll_subpath, exist_ok=True)
def setup_method(self):
    """Pytest-style setup: configure logging, quiet noisy loggers, make TEST_DIR."""
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.DEBUG
    )
    # Disabling some of the more verbose logging statements that typically aren't very helpful
    # in tests.
    for noisy in (
        "allennlp.common.params",
        "allennlp.nn.initializers",
        "urllib3.connectionpool",
    ):
        logging.getLogger(noisy).disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(logging.INFO)
    log_pytorch_version_info()

    self.TEST_DIR = pathlib.Path(TEST_DIR)
    os.makedirs(self.TEST_DIR, exist_ok=True)
# NOTE(review): The three physical lines below are corrupted extraction residue,
# not runnable code. Multiple definitions (prepare_environment, get_spacy_model,
# pushd, push_python_path, import_module_and_submodules, peak_memory_mb,
# dump_metrics, flatten_filename, is_distributed, sanitize_wordpiece,
# sanitize_ptb_tokenized_string) are missing statements: e.g. `try:` is followed
# directly by `except OSError:` with no body, `seed` is read but never assigned,
# a list comprehension is split mid-expression across the first/second lines,
# several functions have empty bodies, and the text ends in a stray
# `reveal_type(...)` call. The missing code is not recoverable from this view,
# so the lines are preserved byte-for-byte for reconciliation against the
# original source rather than "repaired" by guesswork. Do not import this span.
def prepare_environment(params: Params): numpy_seed = params.pop_int("numpy_seed", 1337) torch_seed = params.pop_int("pytorch_seed", 133) if seed is not None: if numpy_seed is not None: if torch_seed is not None: torch.cuda.manual_seed_all(torch_seed) log_pytorch_version_info() LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {} def get_spacy_model( spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool ) -> SpacyModelType: options = (spacy_model_name, pos_tags, parse, ner) if options not in LOADED_SPACY_MODELS: disable = ["vectors", "textcat"] if not pos_tags: if not parse: disable.append("parser") if not ner: disable.append("ner") try: except OSError: logger.warning( f"Spacy models '{spacy_model_name}' not found. Downloading and installing." ) spacy_download(spacy_model_name) spacy_model_module = __import__(spacy_model_name) LOADED_SPACY_MODELS[options] = spacy_model return LOADED_SPACY_MODELS[options] @contextmanager def pushd(new_dir: PathType, verbose: bool = False) -> ContextManagerFunctionReturnType[None]: if verbose: try: yield finally: if verbose: logger.info(f"Changing directory back to {previous_dir}") os.chdir(previous_dir) @contextmanager def push_python_path(path: PathType) -> ContextManagerFunctionReturnType[None]: path = Path(path).resolve() path = str(path) try: yield finally: def import_module_and_submodules(package_name: str) -> None: with push_python_path("."): path = getattr(module, "__path__", []) path_string = "" if not path else path[0] for module_finder, name, _ in pkgutil.walk_packages(path): if path_string and module_finder.path != path_string: continue subpackage = f"{package_name}.{name}" import_module_and_submodules(subpackage) def peak_memory_mb() -> Dict[int, float]: if resource is None or sys.platform not in ("linux", "darwin"): peak_mb = 0.0 else: if sys.platform == "darwin": peak_mb = peak / 1_000_000 else: peak_mb = peak / 1_000 if is_distributed(): gather_results = [torch.tensor([0.0, 0.0]) for _ 
in range(world_size)] if dist.get_backend() == "nccl": gather_results = [x.cuda() for x in gather_results] results_dict: Dict[int, float] = {} for peak_mb_tensor in gather_results: worker = int(peak_mb_tensor[0]) peak_mb = round(float(peak_mb_tensor[1]), 3) results_dict[worker] = peak_mb return results_dict else: return {0: peak_mb} def gpu_memory_mb() -> Dict[int, int]: try: result = subprocess.check_output( ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"], encoding="utf-8", ) gpu_memory = [int(x) for x in result.strip().split("\n")] return {gpu: memory for gpu, memory in enumerate(gpu_memory)} except FileNotFoundError: return {} except: # noqa logger.warning( "unable to check gpu_memory_mb() due to occasional failure, continuing", exc_info=True ) return {} def ensure_list(iterable: Iterable[A]) -> List[A]: if isinstance(iterable, list): return iterable else: return list(iterable) def is_lazy(iterable: Iterable[A]) -> bool: return not isinstance(iterable, list) def int_to_device(device: Union[int, torch.device]) -> torch.device: if isinstance(device, torch.device): return device if device < 0: return torch.device(device) def log_frozen_and_tunable_parameter_names(model: torch.nn.Module) -> None: frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(model) logger.info("The following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("The following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) def get_frozen_and_tunable_parameter_names( model: torch.nn.Module, ) -> Tuple[Iterable[str], Iterable[str]]: frozen_parameter_names = ( ) tunable_parameter_names = ( name for name, parameter in model.named_parameters() if parameter.requires_grad ) return frozen_parameter_names, tunable_parameter_names def dump_metrics(file_path: Optional[str], metrics: Dict[str, Any], log: bool = False) -> None: if 
file_path: with open(file_path, "w") as metrics_file: if log: logger.info("Metrics: %s", metrics_json) def flatten_filename(file_path: str) -> str: def is_master( global_rank: int = None, world_size: int = None, num_procs_per_node: int = None ) -> bool: if not is_distributed(): return True if global_rank is None: global_rank = dist.get_rank() if world_size is None: world_size = dist.get_world_size() if num_procs_per_node is None and os.environ: num_procs_per_node = int(os.environ.get("ALLENNLP_PROCS_PER_NODE", world_size)) return global_rank % (world_size / num_procs_per_node) == 0 def is_distributed() -> bool: def sanitize_wordpiece(wordpiece: str) -> str: return wordpiece[2:] elif wordpiece.startswith("Ġ"): return wordpiece[1:] elif wordpiece.startswith("▁"): return wordpiece[1:] else: return wordpiece def sanitize_ptb_tokenized_string(text: str) -> str: if len(tokens) == 0: return text token_map = { "``": '"', "''": '"', "-lrb-": "(", "-rrb-": ")", "-lsb-": "[", "-rsb-": "]", "-lcb-": "{", "-rcb-": "}", "<s>": "", "</s>": "", } punct_forward = {"`", "$", "#"} punct_backward = {".", ",", "!", "?", ":", ";", "%", "'"} em_forward = {"(", "[", "{"} em_backward = {"n't", "na", ")", "]", "}"} new_tokens: List[str] = [] merge_fwd = False for i, orig_token in enumerate(tokens): tokens[i] = token_map[orig_token.lower()] if orig_token.lower() in token_map else orig_token if merge_fwd: tokens[i] = tokens[i - 1] + tokens[i] if len(tokens[i]) == 0: continue merge_bckwd = not merge_fwd and ( orig_token == "''" or new_token in em_backward or all(c in punct_backward for c in new_token) ) merge_fwd = ( orig_token == "``" or new_token in em_forward or all(c in punct_forward for c in new_token) ) if merge_bckwd and new_tokens: new_tokens[-1] += tokens[i] elif not new_tokens or not merge_fwd or i == len(tokens) - 1: reveal_type(new_tokens)