def __init__(
    self,
    config: Config,
    *,
    rank: int = 0,
    num_workers: int = 1,
    use_gpu: int = 0,
    ray=None,
):
    if ray is None:
        # Avoid importing ray in the module. This allows a test-ray to
        # be passed in, and speeds up the CLI.
        import ray  # type: ignore
    self.ray = ray
    self.rank = rank
    self.num_workers = num_workers
    self.gpu_id = self._resolve_gpu(use_gpu)
    self.nlp = init_nlp(Config(config), use_gpu=self.gpu_id)
    config = self.nlp.config.interpolate()
    self.T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [self.T["train_corpus"], self.T["dev_corpus"]]
    self.train_corpus, self.dev_corpus = resolve_dot_names(config, dot_names)
    self.before_to_disk = create_before_to_disk_callback(self.T["before_to_disk"])
    allocator = self.T["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    self._evaluation_callback = lambda: {}
    self._results = []
    self._has_evaluation_callback = False
    self.thread = None
    self.proxy = None
    self.n_grads_used = 0
    self.n_grads_discarded = 0
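
# Usage sketch (added for illustration, not from the source): how the worker
# __init__ above might be driven through Ray. The class name "Worker", the
# config path, and the worker count are assumptions.
import ray
from thinc.api import Config

def spawn_workers(config_path: str, num_workers: int = 2):
    config = Config().from_disk(config_path)  # hypothetical config file
    ray.init(ignore_reinit_error=True)
    RemoteWorker = ray.remote(Worker)  # "Worker" assumed to be the class above
    return [
        RemoteWorker.remote(config, rank=rank, num_workers=num_workers, use_gpu=-1)
        for rank in range(num_workers)
    ]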
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    raw_config = config
    config = raw_config.interpolate()
    if "seed" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] seed"))
    if "gpu_allocator" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    if not isinstance(T["train_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.train_corpus", type=type(T["train_corpus"])
            )
        )
    if not isinstance(T["dev_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.dev_corpus", type=type(T["dev_corpus"])
            )
        )
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
            for listener in proc.listening_components:
                if listener in frozen_components and name not in frozen_components:
                    logger.warning(Warnings.W087.format(name=name, listener=listener))
                # We always check this regardless, in case user freezes tok2vec
                if listener not in frozen_components and name in frozen_components:
                    logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
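
# Usage sketch (added for illustration): loading a raw, uninterpolated config
# from disk and handing it to init_nlp() above. The config path is an
# assumption; spacy.util.load_config is the documented loader.
from spacy import util

def initialize_pipeline(config_path: str, use_gpu: int = -1) -> "Language":
    raw_config = util.load_config(config_path, interpolate=False)
    return init_nlp(raw_config, use_gpu=use_gpu)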
def train(
    nlp: "Language",
    output_path: Optional[Path] = None,
    *,
    use_gpu: int = -1,
    stdout: IO = sys.stdout,
    stderr: IO = sys.stderr,
) -> Tuple["Language", Optional[Path]]:
    """Train a pipeline.

    nlp (Language): The initialized nlp object with the full config.
    output_path (Path): Optional output path to save the trained model to.
    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
        before calling this function.
    stdout (file): A file-like object to write output messages. To disable
        printing, set to io.StringIO.
    stderr (file): A second file-like object to write output messages. To
        disable printing, set to io.StringIO.
    RETURNS (tuple): The final nlp object and the path to the exported model.
    """
    # We use no_print here so we can respect the stdout/stderr options.
    msg = Printer(no_print=True)
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    score_weights = T["score_weights"]
    batcher = T["batcher"]
    train_logger = T["logger"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])

    # Helper function to save checkpoints. This is a closure for convenience,
    # to avoid passing in all the args all the time.
    def save_checkpoint(is_best):
        with nlp.use_params(optimizer.averages):
            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
        if is_best:
            # Avoid saving twice (saving will be more expensive than
            # the dir copy)
            if (output_path / DIR_MODEL_BEST).exists():
                shutil.rmtree(output_path / DIR_MODEL_BEST)
            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)

    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Components that should set annotations on update
    annotating_components = T["annotating_components"]
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        create_train_batches(nlp, train_corpus, batcher, T["max_epochs"]),
        create_evaluation_callback(nlp, dev_corpus, score_weights),
        dropout=T["dropout"],
        accumulate_gradient=T["accumulate_gradient"],
        patience=T["patience"],
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
        annotating_components=annotating_components,
    )
    clean_output_dir(output_path)
    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
    if frozen_components:
        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
    if annotating_components:
        stdout.write(
            msg.info(f"Set annotations on update for: {annotating_components}")
            + "\n"
        )
    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
    with nlp.select_pipes(disable=frozen_components):
        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
    try:
        for batch, info, is_best_checkpoint in training_step_iterator:
            if is_best_checkpoint is not None:
                with nlp.select_pipes(disable=frozen_components):
                    update_meta(T, nlp, info)
                if output_path is not None:
                    save_checkpoint(is_best_checkpoint)
                    info["output_path"] = str(output_path / DIR_MODEL_LAST)
            log_step(info if is_best_checkpoint is not None else None)
    except Exception as e:
        if output_path is not None:
            stdout.write(
                msg.warn(
                    f"Aborting and saving the final best model. "
                    f"Encountered exception: {repr(e)}"
                )
                + "\n"
            )
        raise e
    finally:
        finalize_logger()
        if output_path is not None:
            save_checkpoint(False)
    # This will only run if we didn't hit an error
    if optimizer.averages:
        nlp.use_params(optimizer.averages)
    if output_path is not None:
        stdout.write(
            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
            + "\n"
        )
        return (nlp, output_path / DIR_MODEL_LAST)
    else:
        return (nlp, None)
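
# Usage sketch (added for illustration): wiring init_nlp() and train() together,
# roughly what the `spacy train` CLI does internally. Paths are assumptions.
from pathlib import Path
from spacy import util

def run_training(config_path: str, output_dir: str, use_gpu: int = -1):
    raw_config = util.load_config(config_path, interpolate=False)
    nlp = init_nlp(raw_config, use_gpu=use_gpu)
    # train() returns the final nlp object and the path to the last model
    return train(nlp, Path(output_dir), use_gpu=use_gpu)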
def pretrain(
    config: Config,
    output_dir: Path,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    silent: bool = True,
):
    msg = Printer(no_print=silent)
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    nlp = load_model_from_config(config)
    _config = nlp.config.interpolate()
    P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
    corpus = dot_to_object(_config, P["corpus"])
    corpus = registry.resolve({"corpus": corpus})["corpus"]
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]
    # Load in pretrained weights to resume from
    if resume_path is not None:
        _resume_model(model, resume_path, epoch_resume, silent=silent)
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0
    objective = model.attrs["loss"]
    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
        for batch_id, batch in enumerate(batcher(corpus(nlp))):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
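
# Usage sketch (added for illustration): invoking pretrain() above. Paths are
# assumptions, and the config is assumed to be fully interpolated and to
# contain a [pretraining] block.
from pathlib import Path
from spacy import util

def run_pretraining(config_path: str, output_dir: str, use_gpu: int = -1):
    config = util.load_config(config_path)  # interpolates by default
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    pretrain(config, out, use_gpu=use_gpu, silent=False)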
import spacy
from copy import deepcopy
from spacy_transformers.pipeline_component import DEFAULT_CONFIG
from thinc.api import Config, set_gpu_allocator, require_gpu

if spacy.prefer_gpu():
    print("\n\033[92m" + "✔ Using GPU" + "\033[0m\n")
    set_gpu_allocator("pytorch")
    require_gpu(0)
else:
    print("\n\033[91m" + "✘ NOT Using GPU!" + "\033[0m\n")

# Deep-copy the default config: a shallow copy would share the nested "model"
# dict, so editing it below would mutate DEFAULT_CONFIG itself.
config = deepcopy(DEFAULT_CONFIG["transformer"])
config["model"]["name"] = "model/distilbert-base-nli-stsb-mean-tokens"

nlp = spacy.blank("en")
transformer = nlp.add_pipe("transformer", config=config)
transformer.model.initialize()

doc = nlp("hello world")
tokvecs = doc._.trf_data.tensors[-1]
print(tokvecs)
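
# Follow-up sketch (added for illustration): processing several texts with
# nlp.pipe() so the transformer runs over batches instead of single docs;
# the batch size here is an arbitrary assumption.
texts = ["hello world", "another short document"]
for doc in nlp.pipe(texts, batch_size=32):
    print(doc._.trf_data.tensors[-1].shape)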
def debug_model_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
    attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
    P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
    P3: bool = Opt(False, "--print-step3", "-P3", help="Print final predictions"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Analyze a Thinc model implementation. Includes checks for internal structure
    and activations during training.

    DOCS: https://spacy.io/api/cli#debug-model
    """
    setup_gpu(use_gpu)
    layers = string_to_list(layers, intify=True)
    print_settings = {
        "dimensions": dimensions,
        "parameters": parameters,
        "gradients": gradients,
        "attributes": attributes,
        "layers": layers,
        "print_before_training": P0,
        "print_after_init": P1,
        "print_after_training": P2,
        "print_prediction": P3,
    }
    config_overrides = parse_config_overrides(ctx.args)
    with show_validation_error(config_path):
        raw_config = util.load_config(
            config_path, overrides=config_overrides, interpolate=False
        )
    config = raw_config.interpolate()
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    with show_validation_error(config_path):
        nlp = util.load_model_from_config(raw_config)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    seed = T["seed"]
    if seed is not None:
        msg.info(f"Fixing random seed: {seed}")
        fix_random_seed(seed)
    pipe = nlp.get_pipe(component)
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{component}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    debug_model(config, T, nlp, model, print_settings=print_settings)
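
# Usage note (added): this command is exposed as `spacy debug model`, per the
# DOCS link in the docstring. Example invocation, with the config path and
# component name as assumptions; the flags map to the options defined above:
#
#   python -m spacy debug model ./config.cfg tagger --dimensions -P0 -P1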