def cached_path(path: Union[pathlib.Path, str], url: str, unzip=True) -> pathlib.Path:
    """Return the local location of a resource, downloading it when absent.

    Parameters
    ----------
    path : Union[pathlib.Path, str]
        Local destination. When a file or directory already exists there,
        nothing is downloaded.
    url : str
        Location handed to ``download_file``.
    unzip : bool
        When true, a downloaded zip or tar archive is extracted into the
        parent directory of ``path``.

    Returns
    -------
    pathlib.Path
        ``path`` converted to a ``pathlib.Path``.
    """
    local_path = pathlib.Path(path) if isinstance(path, str) else path
    msg_printer = Printer()

    if local_path.is_file() or local_path.is_dir():
        msg_printer.info(f"{local_path} exists.")
        return local_path

    archive = str(local_path)
    target_dir = str(local_path.parent)
    download_file(url=url, dest_filename=archive)

    if not unzip:
        return local_path

    if zipfile.is_zipfile(archive):
        extract_zip(filename=archive, destination_dir=target_dir)

    if tarfile.is_tarfile(archive):
        suffix = local_path.suffix
        # Only a suffix that mentions "gz" but not "tar" selects gzip mode.
        gzip_only = "tar" not in suffix and "gz" in suffix
        extract_tar(
            filename=archive,
            destination_dir=target_dir,
            mode="r:gz" if gzip_only else "r",
        )

    return local_path
def main(uri, table_path, schema, write_mode):
    """Load newline-delimited JSON from ``uri`` into a BigQuery table.

    ``table_path`` must have exactly three dot-separated parts; the first two
    are used as project id and dataset id, the last as the table id.
    ``write_mode`` must be ``"CREATE_NEW"`` or ``"WRITE_APPEND"``. With
    ``"CREATE_NEW"`` an already existing table is deleted and re-created with
    ``schema`` before the load job is started.
    """
    msg = Printer()
    project_id, dataset_id, _ = table_path.split(".")
    client = Config(project_id=project_id, dataset_id=dataset_id).client()
    table_ref = str_to_bq_ref(table_path)

    job_config = bq.LoadJobConfig()
    job_config.schema = client.schema_from_json(schema)
    job_config.source_format = bq.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.ignore_unknown_values = True
    job_config.write_disposition = "WRITE_APPEND"
    job_config.max_bad_records = 100

    assert write_mode in ["CREATE_NEW", "WRITE_APPEND"]

    table_id = table_path.split(".")[-1]
    known_table_ids = {
        existing.table_id
        for existing in client.list_tables(client.dataset(dataset_id))
    }
    if table_id in known_table_ids and write_mode == "CREATE_NEW":
        msg.info(f"{table_path} already exists. Write_mode: {write_mode}")
        client.delete_table(table_ref)
        fresh_table = bq.Table(table_ref, schema=client.schema_from_json(schema))
        client.create_table(fresh_table)

    load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
    with msg.loading("Loading data..."):
        # Block until the load job has finished
        load_job.result()
    msg.good("Data succesfully loaded!")
def run_on_all_states(f, index_slice=None):
    """Run ``f`` once per state and print a summary table of the outcomes.

    ``f`` is wrapped with ``catch_errors`` before being called. When
    ``index_slice`` is given, it is applied to ``list(us.STATES)`` to select
    the states to process.
    """
    states = list(us.STATES)
    if index_slice is not None:
        states = states[index_slice]

    run_task = catch_errors(f)
    results = [run_task(state) for state in states]

    successes = sum(1 for outcome in results if outcome is Result.Success)
    errors = sum(1 for outcome in results if outcome is Result.Error)

    printer = Printer()
    printer.info("Final result:")
    printer.info(f"{successes} were created successfully. {errors} errored.")

    # A missing result (None) is reported as an error in the table
    rows = [
        (state, "Error" if outcome is None else str(outcome))
        for state, outcome in zip(states, results)
    ]
    printer.table(rows, header=("State", "Created"), divider=True)
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline to find out which functions take the most time.

    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input is available, the IMDB dataset is loaded via Thinc. At most
    `n_texts` texts are profiled; the stats are written to "Profile.prof" and
    printed sorted by time.
    """
    msg = Printer()
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        # Fall back to the IMDB training texts
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]

    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))

    # `nlp` and `texts` must keep these names: the profiled statement looks
    # them up through locals().
    texts = list(itertools.islice(inputs, n_texts))
    stats_file = "Profile.prof"
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), stats_file)

    stats = pstats.Stats(stats_file)
    msg.divider("Profile stats")
    stats.strip_dirs().sort_stats("time").print_stats()
def cached_path(path: pathlib.Path, url: str, unzip=True) -> pathlib.Path:
    """Return ``path``, downloading (and optionally unzipping) it when absent.

    Parameters
    ----------
    path : pathlib.Path
        Local location of the resource. When a file or directory already
        exists there, nothing is downloaded.
    url : str
        Location handed to ``download_file``. The download is stored next to
        ``path`` under the name ``"<path>.zip"``.
    unzip : bool
        When true, the downloaded archive is extracted into ``path.parent``.

    Returns
    -------
    pathlib.Path
        ``path``. It is returned on every code path, so callers get a usable
        value after a fresh download as well as on a cache hit.
    """
    msg_printer = Printer()
    if path.is_file() or path.is_dir():
        msg_printer.info(f"{path} exists.")
        return path

    archive = f"{path}.zip"
    download_file(url=url, dest_filename=archive)
    if unzip:
        extract_zip(filename=archive, destination_dir=str(path.parent))
    # Previously the function fell off the end here and returned None,
    # contradicting the declared return type.
    return path
def main(path, name="bert-base-uncased", lang="en"):
    """Assemble a PyTT pipeline for the pretrained weights `name` and save it.

    The pipeline consists of a sentencizer, a word piecer and a token vector
    encoder (added in that order) and is written to `path`.
    """
    msg = Printer()
    msg.info(f"Creating model for '{name}' ({lang})")
    with msg.loading("Setting up the pipeline..."):
        nlp = PyTT_Language(pytt_name=name, meta={"lang": lang})
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
        wordpiecer = PyTT_WordPiecer.from_pretrained(nlp.vocab, name)
        nlp.add_pipe(wordpiecer)
        encoder = PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, name)
        nlp.add_pipe(encoder)
    msg.good("Initialized the model pipeline")

    nlp.to_disk(path)
    msg.good(f"Saved '{name}' ({lang})")
    for line in (f"Pipeline: {nlp.pipe_names}", f"Location: {path}"):
        msg.text(line)
def main(path, name="bert-base-uncased", lang="en"):
    """Assemble a transformers pipeline for `name`, save it and reload it.

    The pipeline consists of a sentencizer, a word piecer and a tok2vec
    component (added in that order). After writing it to `path`, the pipeline
    is loaded back from disk as a sanity check.
    """
    msg = Printer()
    msg.info(f"Creating model for '{name}' ({lang})")
    with msg.loading("Setting up the pipeline..."):
        nlp = TransformersLanguage(trf_name=name, meta={"lang": lang})
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
        wordpiecer = TransformersWordPiecer.from_pretrained(nlp.vocab, name)
        nlp.add_pipe(wordpiecer)
        tok2vec = TransformersTok2Vec.from_pretrained(nlp.vocab, name)
        nlp.add_pipe(tok2vec)
    msg.good("Initialized the model pipeline")

    nlp.to_disk(path)
    msg.good(f"Saved '{name}' ({lang})")
    for line in (f"Pipeline: {nlp.pipe_names}", f"Location: {path}"):
        msg.text(line)

    with msg.loading("Verifying model loads..."):
        nlp.from_disk(path)
    msg.good("Model loads!")
def debug_diff(
    config_path: Path,
    compare_to: Optional[Path],
    gpu: bool,
    optimize: Optimizations,
    pretraining: bool,
    markdown: bool,
):
    """Print the difference between a user config and a reference config.

    The reference is the config loaded from `compare_to` when given;
    otherwise it is generated with `init_config` from the language and
    pipeline found in the user config plus the `gpu`, `optimize` and
    `pretraining` settings. With `markdown` set, the diff is printed as a
    Markdown "diff" code block.
    """
    msg = Printer()
    with show_validation_error(hint_fill=False):
        user_config = load_config(config_path)
        if not compare_to:
            # No reference given: derive a default config from the user's one
            nlp_section = user_config["nlp"]
            lang = nlp_section["lang"]
            pipeline = list(nlp_section["pipeline"])
            msg.info(f"Found user-defined language: '{lang}'")
            msg.info(f"Found user-defined pipelines: {pipeline}")
            other_config = init_config(
                lang=lang,
                pipeline=pipeline,
                optimize=optimize.value,
                gpu=gpu,
                pretraining=pretraining,
                silent=True,
            )
        else:
            other_config = load_config(compare_to)

    user = user_config.to_str()
    other = other_config.to_str()
    if user == other:
        msg.warn("No diff to show: configs are identical")
        return

    diff_text = diff_strings(other, user, add_symbols=markdown)
    if not markdown:
        print(diff_text)
        return

    md = MarkdownRenderer()
    md.add(md.code_block(diff_text, "diff"))
    print(md.text)
def _resume_model(
    model: Model, resume_path: Path, epoch_resume: Optional[int], silent: bool = True
) -> int:
    """Load tok2vec weights from `resume_path` and return the epoch to resume at.

    When `epoch_resume` is None, the epoch is derived from a file name of the
    form "model<N>.bin" as N + 1.

    RAISES (ValueError): if `epoch_resume` is None and the file name does not
        contain a "model<N>.bin" part.
    """
    msg = Printer(no_print=silent)
    msg.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        model.get_ref("tok2vec").from_bytes(file_.read())

    if epoch_resume is None:
        # Try to infer the epoch from the default weight file name
        match = re.search(r"model\d+\.bin", str(resume_path))
        if not match:
            # No epoch given and couldn't infer it
            raise ValueError(Errors.E1020)
        file_name = match.group(0)
        # Strip the leading 'model' and the trailing '.bin'
        epoch_resume = int(file_name[5:-4]) + 1

    msg.info(f"Resuming from epoch: {epoch_resume}")
    return epoch_resume
def setup_gpu(use_gpu: int, silent=None) -> None:
    """Select the device to run on and report the choice.

    use_gpu (int): GPU id to require; a negative value means CPU.
    silent: When None, a default printer is used. Otherwise the printer is
        created with `no_print=silent` and `pretty=not silent`.
    """
    printer_kwargs = {} if silent is None else {"no_print": silent, "pretty": not silent}
    local_msg = Printer(**printer_kwargs)

    if use_gpu >= 0:
        local_msg.info(f"Using GPU: {use_gpu}")
        require_gpu(use_gpu)
        return

    local_msg.info("Using CPU")
    if gpu_is_available():
        local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
def _resume_model(
    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
) -> int:
    """Load tok2vec weights from `resume_path` and return the epoch to resume at.

    When the file name contains a part of the form "model<N>.bin", the epoch
    is taken from it as N + 1 and overrides `epoch_resume`; otherwise
    `epoch_resume` is used unchanged.

    RETURNS (int): The epoch that is reported as the resume point. Earlier
        the function returned nothing, so an epoch inferred from the file
        name was logged but never reached the caller.
    """
    msg = Printer(no_print=silent)
    msg.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
        model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model\d+\.bin", str(resume_path))
    if model_name:
        # Default weight file name: cut off 'model' and '.bin' to get the
        # last finished epoch, then continue with the next one
        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
    msg.info(f"Resuming from epoch: {epoch_resume}")
    return epoch_resume
def package(
    input_dir: Path,
    output_dir: Path,
    meta_path: Optional[Path] = None,
    code_paths: List[Path] = [],
    name: Optional[str] = None,
    version: Optional[str] = None,
    create_meta: bool = False,
    create_sdist: bool = True,
    create_wheel: bool = False,
    force: bool = False,
    silent: bool = True,
) -> None:
    """Generate an installable Python package from a pipeline data directory.

    input_dir (Path): Directory with the pipeline data.
    output_dir (Path): Existing directory in which the package directory
        "<lang>_<name>-<version>" is created.
    meta_path (Optional[Path]): Location of the meta.json. Defaults to
        "meta.json" inside the input directory.
    code_paths (List[Path]): Python files that are imported up front and
        copied into the package. The list is only read, never modified.
    name (Optional[str]): Overrides the "name" entry of the meta.
    version (Optional[str]): Overrides the "version" entry of the meta.
    create_meta (bool): Build the meta via `generate_meta` instead of just
        using the loaded one.
    create_sdist (bool): Run "setup.py sdist" in the package directory.
    create_wheel (bool): Run "setup.py bdist_wheel" in the package directory.
    force (bool): Remove an already existing package directory instead of
        failing.
    silent (bool): Passed to the printer as `no_print` (and inverted as
        `pretty`).

    All path arithmetic uses the values returned by `util.ensure_path`.
    Earlier the raw `input_dir` / `output_dir` arguments were still used after
    normalization, so the normalized values were only used for the existence
    checks.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if create_wheel and not has_wheel():
        err = "Generating a binary .whl file requires wheel to be installed"
        msg.fail(err, "pip install wheel", exits=1)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate pipeline data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if create_sdist or create_wheel:
        opts = ["sdist" if create_sdist else "", "wheel" if create_wheel else ""]
        msg.info(f"Building package artifacts: {', '.join(opt for opt in opts if opt)}")
    for code_path in code_paths:
        if not code_path.exists():
            msg.fail("Can't find code file", code_path, exits=1)
        # Import the code here so it's available when model is loaded (via
        # get_meta helper). Also verifies that everything works
        util.import_file(code_path.stem, code_path)
    if code_paths:
        msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
    if meta_path and not meta_path.exists():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    meta_path = meta_path or input_path / "meta.json"
    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    meta = get_meta(input_path, meta)
    if name is not None:
        meta["name"] = name
    if version is not None:
        meta["version"] = version
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(meta, msg)
    errors = validate(ModelMetaSchema, meta)
    if errors:
        msg.fail("Invalid pipeline meta.json")
        print("\n".join(errors))
        sys.exit(1)

    # Layout: <output>/<lang>_<name>-<version>/<lang>_<name>/<lang>_<name>-<version>
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name
    if package_path.exists():
        if force:
            shutil.rmtree(str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    package_path.mkdir(parents=True)
    shutil.copytree(str(input_path), str(package_path / model_name_v))
    # A LICENSE shipped with the pipeline data is moved to the package root
    license_path = package_path / model_name_v / "LICENSE"
    if license_path.exists():
        shutil.move(str(license_path), str(main_path))
    imports = []
    for code_path in code_paths:
        imports.append(code_path.stem)
        shutil.copy(str(code_path), str(package_path))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    # The package __init__ imports every copied code module
    init_py = TEMPLATE_INIT.format(
        imports="\n".join(f"from . import {m}" for m in imports)
    )
    create_file(package_path / "__init__.py", init_py)
    msg.good(f"Successfully created package '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
        msg.good("Successfully created zipped Python package", zip_file)
    if create_wheel:
        with util.working_dir(main_path):
            util.run_command(
                [sys.executable, "setup.py", "bdist_wheel"], capture=False
            )
        wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
        msg.good("Successfully created binary wheel", wheel)
def validate():
    """
    Check the installed models against the compatibility table for the
    installed version of spaCy and print a report. Should be run after
    `pip install -U spacy`.

    Exits with status 1 if the table can't be fetched, if the installed
    version is not listed in it, or if incompatible models or links are found.
    """
    msg = Printer()
    table_url = about.__compatibility__
    with msg.loading("Loading compatibility table..."):
        response = requests.get(table_url)
        if response.status_code != 200:
            msg.fail(
                "Server error ({})".format(response.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")

    compat = response.json()["spacy"]
    # Development versions are looked up under their base version
    version = about.__version__.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            table_url,
            exits=1,
        )

    # Collect every known model name and normalize all version strings in place
    all_models = set()
    for spacy_v, models in list(compat.items()):
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]

    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)

    incompat_links = {
        link for link, data in model_links.items() if not data["compat"]
    }
    incompat_models = {
        data["name"] for data in model_pkgs.values() if not data["compat"]
    }
    incompat_models.update(
        [data["name"] for data in model_links.values() if not data["compat"]]
    )
    # Split into models without any release for this version and updatable ones
    na_models = []
    update_models = []
    for model in incompat_models:
        if model in current_compat:
            update_models.append(model)
        else:
            na_models.append(model)

    spacy_dir = Path(__file__).parent.parent
    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if not (model_links or model_pkgs):
        msg.text("No models found in your current environment.", exits=0)
    else:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = [
            get_model_row(current_compat, name, data, msg)
            for name, data in model_pkgs.items()
        ]
        rows.extend(
            get_model_row(current_compat, name, data, msg, "link")
            for name, data in model_links.items()
        )
        msg.table(rows, header=header)

    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        commands = [cmd.format(pkg) for pkg in update_models]
        print("\n".join(commands) + "\n")
    if na_models:
        msg.text(
            "The following models are not available for spaCy "
            "v{}: {}".format(about.__version__, ", ".join(na_models))
        )
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path()))
        )
    if incompat_models or incompat_links:
        sys.exit(1)
def init_config(
    *,
    lang: str,
    pipeline: List[str],
    optimize: str,
    gpu: bool,
    pretraining: bool = False,
    silent: bool = True,
) -> Config:
    """Render the config template for the given settings and auto-fill it.

    lang (str): Language code; also selects the entry in RECOMMENDATIONS
        (falling back to "__default__").
    pipeline (List[str]): Component names. "tok2vec" and "transformer" are
        dropped because the template adds them itself.
    optimize (str): Value passed to the template as "optimize".
    gpu (bool): Selects "gpu" or "cpu" as the template's "hardware".
    pretraining (bool): Also validate for pretraining and merge the config
        into the default pretraining config.
    silent (bool): Suppress printed output.
    RETURNS (Config): The filled config.
    """
    msg = Printer(no_print=silent)
    with TEMPLATE_PATH.open("r") as template_file:
        template = Template(template_file.read())

    # Filter out duplicates since tok2vec and transformer are added by template
    template_managed = ("tok2vec", "transformer")
    pipeline = [pipe for pipe in pipeline if pipe not in template_managed]

    defaults = RECOMMENDATIONS["__default__"]
    reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict()

    transformer_data = reco["transformer"]
    if transformer_data and not has_spacy_transformers():
        msg.warn(
            "To generate a more effective transformer-based config (GPU-only), "
            "install the spacy-transformers package and re-run this command. "
            "The config generated now does not use transformers."
        )
        transformer_data = None
    hardware = "gpu" if gpu else "cpu"
    variables = {
        "lang": lang,
        "components": pipeline,
        "optimize": optimize,
        "hardware": hardware,
        "transformer_data": transformer_data,
        "word_vectors": reco["word_vectors"],
        "has_letters": reco["has_letters"],
    }

    rendered = template.render(variables).strip()
    # Giving up on getting the newlines right in jinja for now
    base_template = re.sub(r"\n\n\n+", "\n\n", rendered)

    # Access variables declared in templates
    template_vars = template.make_module(variables)
    if template_vars.use_transformer:  # type: ignore[attr-defined]
        transformer_name = template_vars.transformer.get("name")  # type: ignore[attr-defined]
    else:
        transformer_name = None
    use_case = {
        "Language": lang,
        "Pipeline": ", ".join(pipeline),
        "Optimize for": optimize,
        "Hardware": hardware.upper(),
        "Transformer": transformer_name,
    }
    msg.info("Generated config template specific for your use case")
    for label, value in use_case.items():
        msg.text(f"- {label}: {value}")

    with show_validation_error(hint_fill=False):
        parsed = util.load_config_from_str(base_template)
        nlp = util.load_model_from_config(parsed, auto_fill=True)
        config = nlp.config
        if pretraining:
            validate_config_for_pretrain(config, msg)
            pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
            config = pretrain_config.merge(config)
    msg.good("Auto-filled config with all values")
    return config
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to
    predict vectors which match the pre-trained ones. The weights are saved to
    a directory after each epoch. You can then pass a path to one of these
    pre-trained weights files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure all
    settings are the same between pretraining and training. The API and errors
    around this need some improvement.

    texts_loc: Path of a JSONL file with the input texts, or "-" to read
        them from stdin.
    vectors_model: Name or path passed to `util.load_model`.
    output_dir: Directory for "config.json", "log.jsonl" and the
        "model<epoch>.bin" weight files. Created if it doesn't exist.
    width, depth, embed_rows: Passed to `Tok2Vec` (depth as `conv_depth`).
    loss_func: Passed to `make_update` as `objective`.
    use_vectors: If set, the name of the loaded model's vectors is passed
        to `Tok2Vec` as `pretrained_vectors`.
    dropout: Passed to `make_update` as `drop`.
    n_iter: Number of epochs.
    batch_size: Passed to `util.minibatch_by_words` as `size`.
    max_length, min_length: Passed to `make_docs`.
    seed: Passed to `util.fix_random_seed`.
    n_save_every: If set, a temporary model file is written whenever the
        batch index is a multiple of this value.
    """
    # Must stay the first statement: at this point locals() holds exactly the
    # call arguments, which are saved as the run's settings below.
    config = dict(locals())
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        # Not materialized into a list, unlike the file case above
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=0,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer")
    # Column layout shared by the header row and every progress row
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        # Write the tok2vec weights to "model<epoch>.bin" (or
        # "model<epoch>.temp.bin" for temporary saves) using the optimizer's
        # averaged parameters, and append one entry to "log.jsonl".
        # NOTE(review): nesting of the log write inside `use_params` was
        # reconstructed from a flattened source - confirm.
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    for epoch in range(n_iter):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                # When reading from stdin, end the epoch once 10**7 words
                # have been counted for it.
                # NOTE(review): this check is assumed to sit inside
                # `if progress:` - confirm against the original layout.
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            # batch_id 0 also satisfies this check, so a temporary save
            # happens right after the first batch of each epoch
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
def validate():
    """
    Check the installed models against the compatibility table for the
    installed version of spaCy and print a report. Should be run after
    `pip install -U spacy`.

    Exits with status 1 if the table can't be fetched, if the installed
    version is not listed in it, or if incompatible models or links are found.
    """
    msg = Printer()
    table_url = about.__compatibility__
    with msg.loading("Loading compatibility table..."):
        response = requests.get(table_url)
        if response.status_code != 200:
            msg.fail(
                "Server error ({})".format(response.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")

    compat = response.json()["spacy"]
    # Development versions are looked up under their base version
    version = about.__version__.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            table_url,
            exits=1,
        )

    # Collect every known model name and normalize all version strings in place
    all_models = set()
    for spacy_v, models in list(compat.items()):
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]

    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)

    incompat_links = {
        link for link, data in model_links.items() if not data["compat"]
    }
    incompat_models = {
        data["name"] for data in model_pkgs.values() if not data["compat"]
    }
    incompat_models.update(
        [data["name"] for data in model_links.values() if not data["compat"]]
    )
    # Split into models without any release for this version and updatable ones
    na_models = []
    update_models = []
    for model in incompat_models:
        if model in current_compat:
            update_models.append(model)
        else:
            na_models.append(model)

    spacy_dir = Path(__file__).parent.parent
    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if not (model_links or model_pkgs):
        msg.text("No models found in your current environment.", exits=0)
    else:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = [
            get_model_row(current_compat, name, data, msg)
            for name, data in model_pkgs.items()
        ]
        rows.extend(
            get_model_row(current_compat, name, data, msg, "link")
            for name, data in model_links.items()
        )
        msg.table(rows, header=header)

    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        commands = [cmd.format(pkg) for pkg in update_models]
        print("\n".join(commands) + "\n")
    if na_models:
        msg.text(
            "The following models are not available for spaCy "
            "v{}: {}".format(about.__version__, ", ".join(na_models))
        )
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path()))
        )
    if incompat_models or incompat_links:
        sys.exit(1)
def debug_data(
    lang,
    train_path,
    dev_path,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    ignore_validation=False,
    verbose=False,
    no_format=False,
):
    """
    Analyze training and development data and print statistics, warnings and
    errors for the selected pipeline components.

    lang: Language code used to create a blank model when no base model is
        given.
    train_path: Location of the training data (must exist).
    dev_path: Location of the development data (must exist).
    base_model: Optional model to load instead of a blank one.
    pipeline: Comma-separated component names. The "ner", "textcat",
        "tagger" and "parser" entries each enable a report section.
    ignore_warnings: Passed on to the printer.
    ignore_validation: Don't exit when data format errors were found.
    verbose: Show the additional detail messages.
    no_format: Disable pretty printing.

    Exits with status 1 if a required file is missing, if format validation
    fails (unless `ignore_validation` is set), or if any failure message was
    counted by the end of the run.
    """
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()

    msg.divider("Data format validation")
    # Load the data in one - might take a while but okay in this case
    train_data = _load_file(train_path, msg)
    dev_data = _load_file(dev_path, msg)

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # Both error lists are currently always empty, so the two "valid"
    # messages below are always shown and the failure branches never run.
    train_data_errors = []  # TODO: validate_json
    dev_data_errors = []  # TODO: validate_json
    if not train_data_errors:
        msg.good("Training data JSON format is valid")
    if not dev_data_errors:
        msg.good("Development data JSON format is valid")
    for error in train_data_errors:
        msg.fail("Training data: {}".format(error))
    for error in dev_data_errors:
        msg.fail("Develoment data: {}".format(error))
    if (train_data_errors or dev_data_errors) and not ignore_validation:
        sys.exit(1)

    # Create the gold corpus to be able to better analyze data
    with msg.loading("Analyzing corpus..."):
        train_data = read_json_object(train_data)
        dev_data = read_json_object(dev_data)
        corpus = GoldCorpus(train_data, dev_data)
        train_docs = list(corpus.train_docs(nlp))
        dev_docs = list(corpus.dev_docs(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_data = _compile_gold(train_docs, pipeline)
    train_texts = gold_data["texts"]
    dev_texts = set([doc.text for doc, gold in dev_docs])

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    # Texts that occur in both the training and the evaluation data
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn("{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")

    # Only blank models are checked for a low number of examples: below the
    # minimum threshold it is an error, below the regular one a warning
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs)
        )
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
            ),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_data["n_words"]
    msg.info(
        "{} total {} in the data ({} unique)".format(
            n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
        )
    )
    most_common_words = gold_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)
        ),
        show=verbose,
    )

    if len(nlp.vocab.vectors):
        msg.info(
            "{} vectors ({} unique keys, {} dimensions)".format(
                len(nlp.vocab.vectors),
                nlp.vocab.vectors.n_keys,
                nlp.vocab.vectors_length,
            )
        )
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
        label_counts = gold_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        # Track which kinds of problems were reported so that the matching
        # "all good" message or explanation can be printed afterwards
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False

        msg.divider("Named Entity Recognition")
        msg.info(
            "{} new {}, {} existing {}".format(
                len(new_labels),
                "label" if len(new_labels) == 1 else "labels",
                len(existing_labels),
                "label" if len(existing_labels) == 1 else "labels",
            )
        )
        missing_values = label_counts["-"]
        msg.text(
            "{} missing {} (tokens with '-' label)".format(
                missing_values, "value" if missing_values == 1 else "values"
            )
        )
        if new_labels:
            # Lists every counted label except "-", not only the new ones
            labels_with_counts = [
                (label, count)
                for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

        if gold_data["ws_ents"]:
            msg.fail("{} invalid whitespace entity spans".format(gold_data["ws_ents"]))
            has_ws_ents_error = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]
                    )
                )
                has_low_data_warning = True

                # NOTE(review): the negative-example check is assumed to run
                # only for labels with few examples (nesting reconstructed
                # from a flattened source) - confirm.
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(label)
                    )
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_data["textcat"]]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info(
            "Text Classification: {} new label(s), {} existing label(s)".format(
                len(new_labels), len(existing_labels)
            )
        )
        if new_labels:
            labels_with_counts = _format_labels(
                gold_data["textcat"].most_common(), counts=True
            )
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_data["tags"]]
        tag_map = nlp.Defaults.tag_map
        msg.info(
            "{} {} in data ({} {} in tag map)".format(
                len(labels),
                "label" if len(labels) == 1 else "labels",
                len(tag_map),
                "label" if len(tag_map) == 1 else "labels",
            )
        )
        labels_with_counts = _format_labels(
            gold_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        # Every tag in the data that is missing from the tag map is an error
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang
                )
            )

    if "parser" in pipeline:
        msg.divider("Dependency Parsing")
        labels = [label for label in gold_data["deps"]]
        msg.info(
            "{} {} in data".format(
                len(labels), "label" if len(labels) == 1 else "labels"
            )
        )
        labels_with_counts = _format_labels(
            gold_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    msg.divider("Summary")
    # Read all counters before printing the summary lines, so the summary
    # messages themselves are not included in the numbers
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            "{} {} passed".format(
                good_counts, "check" if good_counts == 1 else "checks"
            )
        )
    if warn_counts:
        msg.warn(
            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
        )
    if fail_counts:
        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))

    # Any reported failure makes the command exit with a non-zero status
    if fail_counts:
        sys.exit(1)
class SectLabelDataset(BaseTextClassification, ClassNursery):
    """Logical section classification dataset (SectLabel, from WING-NUS).

    Every item is one line of text together with its section label. Items are
    converted to tensors lazily, in ``__getitem__``.
    """

    def __init__(
        self,
        filename: str,
        dataset_type: str,
        max_num_words: int,
        max_instance_length: int,
        word_vocab_store_location: str,
        debug: bool = False,
        debug_dataset_proportion: float = 0.1,
        word_embedding_type: Union[str, None] = None,
        word_embedding_dimension: Union[int, None] = None,
        word_start_token: str = "<SOS>",
        word_end_token: str = "<EOS>",
        word_pad_token: str = "<PAD>",
        word_unk_token: str = "<UNK>",
        train_size: float = 0.8,
        test_size: float = 0.2,
        validation_size: float = 0.5,
        word_tokenization_type="vanilla",
    ):
        """ SectLabel Dataset - A logical section classification dataset from WING-NUS

        Parameters
        ----------
        filename : str
            Name of the file where the SectLabel dataset is stored
        dataset_type : str
            Either of `[train, valid, test]` that this dataset represents
        max_num_words : int
            Maximum number of words to be included in the vocab. The top most
            frequent ``max_num_words`` will be included in the vocab.
            Everything else will be mapped to ``word_unk`` tag.
        max_instance_length : int
            The maximum length for every instance
        word_vocab_store_location : str
            The path where the word vocabulary will be stored
        debug : bool
            Is this dataset a debug dataset where a small portion will be
            used for testing purposes.
        debug_dataset_proportion : float
            The proportion of the dataset that would be used as debug dataset
        word_embedding_type : str
            The embedding type is any of those that are accepted in
            ``vocab.embedding_loader``
        word_embedding_dimension : int
            Word embedding size. This might depend on the ``embedding_type``
            that is used.
        word_start_token : str
            Every instance will be prepended with a ``word_start_token``
        word_end_token : str
            Every instance will be appended with a ``word_end_token``
        word_pad_token : str
            Token used for padding instances
        word_unk_token : str
            If word is not found in the training vocab, then the word is
            replaced with ``word_unk_token``
        train_size : float
            The portion of the dataset that will be used for training
        test_size : float
            The portion of the dataset that will be used for testing
        validation_size : float
            The portion of the dataset that will be used for validation
        word_tokenization_type : str
            The kind of word tokenization. ``tokenizers.word_tokenizer`` has
            more information
        """
        # NOTE(review): the vocab/embedding/tokenization arguments are accepted
        # but not stored here, while ``get_iter_dict`` reads
        # ``self.word_tokenizer``, ``self.word_vocab`` and
        # ``self.word_numericalizer``. Presumably a base class provides them -
        # confirm against BaseTextClassification.
        self.classname2idx = self.get_classname2idx()
        self.idx2classname = {
            idx: classname for classname, idx in self.classname2idx.items()
        }
        self.filename = filename
        self.train_size = train_size
        self.test_size = test_size
        self.validation_size = validation_size
        self.dataset_type = dataset_type
        self.debug = debug
        self.debug_dataset_proportion = debug_dataset_proportion
        self.max_instance_length = max_instance_length

        self.lines, self.labels = self.get_lines_labels(filename=self.filename)
        self.msg_printer = Printer()

    def __len__(self) -> int:
        # Must agree with ``__getitem__``, which indexes ``self.lines``.
        return len(self.lines)

    def __getitem__(self, idx) -> Dict[str, Any]:
        """Return the tensor dictionary for the ``idx``-th line and its label."""
        line = self.lines[idx]
        label = self.labels[idx]
        return self.get_iter_dict(lines=line, labels=label)

    def get_lines_labels(self, filename: str) -> (List[str], List[str]):
        """Read the dataset file and return the lines/labels of this split.

        Parameters
        ----------
        filename : str
            File handed to ``convert_sectlabel_to_json``

        Returns
        -------
        (List[str], List[str])
            Texts and labels for ``self.dataset_type``. If ``dataset_type`` is
            none of ``train``/``valid``/``test`` the unsplit data is returned.
            When ``self.debug`` is set, a random sample (drawn with
            replacement, fixed seed) of the split is returned instead.
        """
        parsect_json = convert_sectlabel_to_json(filename)["parse_sect"]

        texts = []
        labels = []
        for line_json in parsect_json:
            texts.append(line_json["text"])
            labels.append(line_json["label"])

        (
            (train_lines, train_labels),
            (validation_lines, validation_labels),
            (test_lines, test_labels),
        ) = self.get_train_valid_test_stratified_split(
            texts, labels, self.classname2idx
        )

        if self.dataset_type == "train":
            texts = train_lines
            labels = train_labels
        elif self.dataset_type == "valid":
            texts = validation_lines
            labels = validation_labels
        elif self.dataset_type == "test":
            texts = test_lines
            labels = test_labels

        if self.debug:
            # randomly sample `self.debug_dataset_proportion` samples and return
            num_text = len(texts)
            num_samples = int(self.debug_dataset_proportion * num_text)
            np.random.seed(1729)  # so we can debug deterministically
            if num_text > 0:
                # ``high`` is exclusive: passing ``num_text`` makes every
                # index, including the last one, eligible.
                random_ints = list(
                    np.random.randint(0, num_text, size=num_samples)
                )
            else:
                random_ints = []
            texts = [texts[random_int] for random_int in random_ints]
            labels = [labels[random_int] for random_int in random_ints]

        return texts, labels

    @classmethod
    def get_classname2idx(cls) -> Dict[str, int]:
        """Return the mapping from section class name to its integer index."""
        categories = [
            "address",
            "affiliation",
            "author",
            "bodyText",
            "category",
            "construct",
            "copyright",
            "email",
            "equation",
            "figure",
            "figureCaption",
            "footnote",
            "keyword",
            "listItem",
            "note",
            "page",
            "reference",
            "sectionHeader",
            "subsectionHeader",
            "subsubsectionHeader",
            "tableCaption",
            "table",
            "title",
        ]
        return {word: idx for idx, word in enumerate(categories)}

    def get_num_classes(self) -> int:
        """Return the number of section classes."""
        return len(self.classname2idx.keys())

    def get_class_names_from_indices(self, indices: List) -> List[str]:
        """ Given a list of indices maps back to classnames

        Mostly used for inference and other higher level applications

        Parameters
        ----------
        indices : List[int]
            A list of indices where every index is in ``[0, num_classes)``

        Returns
        -------
        List[str]
            The class name for every index, in the same order
        """
        return [self.idx2classname[idx] for idx in indices]

    def print_stats(self):
        """Print the label statistics table and the number of instances."""
        # NOTE(review): ``num_instances`` and ``label_stats_table`` are not
        # defined in this class; presumably inherited - confirm.
        num_instances = self.num_instances
        formatted = self.label_stats_table
        self.msg_printer.divider("Stats for {0} dataset".format(
            self.dataset_type))
        print(formatted)
        self.msg_printer.info(
            f"Number of instances in {self.dataset_type} dataset - {num_instances}"
        )

    @classmethod
    def emits_keys(cls):
        """Describe the keys of the dictionary built by ``get_iter_dict``."""
        return {
            "tokens": "A torch.LongTensor of size `max_length`. "
            "Example [0, 0, 1, 100] where every number represents an index in the vocab",
            "len_tokens": "A torch.LongTensor. "
            "Example [2] representing the number of tokens without padding",
            "label": "A torch.LongTensor representing the label corresponding to the "
            "instance. Example [2] representing class 2",
            "instance": "A string that is padded to ``max_length``.",
            "raw_instance": "A string that is not padded",
        }

    def get_iter_dict(
        self,
        lines: Union[List[str], str],
        labels: Optional[Union[str, List[str]]] = None,
    ):
        """Tokenize, pad and numericalize one line or a batch of lines.

        Parameters
        ----------
        lines : Union[List[str], str]
            A single line or a list of lines
        labels : Optional[Union[str, List[str]]]
            The class name(s) for ``lines``. When omitted, the returned
            dictionary has no ``label`` key.

        Returns
        -------
        Dict[str, Any]
            ``tokens``, ``len_tokens``, ``instance``, ``raw_instance`` and,
            when labels are given, ``label``
        """
        if isinstance(lines, str):
            lines = [lines]

        word_instances = self.word_tokenizer.tokenize_batch(lines)
        # Lengths are taken before padding and start/end tokens are added.
        len_instances = [len(instance) for instance in word_instances]
        classnames2idx = SectLabelDataset.get_classname2idx()

        padded_instances = [
            pack_to_length(
                tokenized_text=word_instance,
                max_length=self.max_instance_length,
                pad_token=self.word_vocab.pad_token,
                add_start_end_token=True,
                start_token=self.word_vocab.start_token,
                end_token=self.word_vocab.end_token,
            )
            for word_instance in word_instances
        ]

        tokens = self.word_numericalizer.numericalize_batch_instances(
            padded_instances)
        tokens = torch.LongTensor(tokens)
        # Drops the leading dimension only when it has size one.
        tokens = tokens.squeeze(0)

        instances = [" ".join(instance) for instance in padded_instances]
        raw_instances = [" ".join(instance) for instance in word_instances]
        len_tokens = torch.LongTensor(len_instances)

        # A single line is returned as a plain string, not a one-element list.
        if len(instances) == 1:
            instances = instances[0]
            raw_instances = raw_instances[0]

        instance_dict = {
            "tokens": tokens,
            "len_tokens": len_tokens,
            "instance": instances,
            "raw_instance": raw_instances,
        }

        if labels is not None:
            if isinstance(labels, str):
                labels = [labels]
            labels = [classnames2idx[label] for label in labels]
            instance_dict["label"] = torch.LongTensor(labels)

        return instance_dict
class Vocab:
    """Token vocabulary mapping every token to a ``(frequency, index)`` pair."""

    def __init__(
        self,
        instances: Optional[List[List[str]]] = None,
        max_num_tokens: int = None,
        min_count: int = 1,
        unk_token: str = "<UNK>",
        pad_token: str = "<PAD>",
        start_token: str = "<SOS>",
        end_token: str = "<EOS>",
        special_token_freq: float = 1e10,
        store_location: str = None,
        embedding_type: Union[str, None] = None,
        embedding_dimension: Union[int, None] = None,
    ):
        """

        :param instances: type: List[List[str]]
        Pass in the list of tokenized instances from which vocab is built
        :param max_num_tokens: type: int
        The top `max_num_tokens` frequent words will be considered for
        vocabulary and the rest of them will be mapped to `unk_token`.
        ``None`` disables this clipping
        :param min_count: type: int
        All words that do not have min count will be mapped to `unk_token`
        :param unk_token: str
        This token will be used for unknown words
        :param pad_token: type: str
        This token will be used for <PAD> words
        :param start_token: type: str
        This token will be used for start of sentence indicator
        :param end_token: type: str
        This token will be used for end of sentence indicator
        :param special_token_freq: type: float
        special tokens should have high frequency.
        The higher the frequency, the more common they are
        :param store_location: type: str
        The users can provide a store location optionally.
        The vocab will be stored in the location
        If the file exists then, the vocab will be restored from the file,
        rather than building it.
        :param embedding_type: type: str
        The embedding type is the type of pre-trained embedding that will be
        loaded for all the words in the vocab optionally. You can refer to
        `WordEmbLoder` for all the available embedding types
        :param embedding_dimension: type: int
        Embedding dimension of the embedding type
        """
        self.instances = instances
        self.max_num_tokens = max_num_tokens
        self.min_count = min_count
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.start_token = start_token
        self.end_token = end_token
        self.special_token_freq = special_token_freq
        self.vocab = None
        self.orig_vocab = None
        self.idx2token = None
        self.token2idx = None
        self.store_location = store_location
        self.embedding_type = embedding_type
        self.embedding_dimension = embedding_dimension

        self.msg_printer = Printer()

        # store the special tokens
        # The decreasing frequencies keep them at indices 0..3 once the vocab
        # is re-indexed by frequency in ``map_tokens_to_freq_idx``.
        self.special_vocab = {
            self.unk_token: (self.special_token_freq + 3, 0),
            self.pad_token: (self.special_token_freq + 2, 1),
            self.start_token: (self.special_token_freq + 1, 2),
            self.end_token: (self.special_token_freq, 3),
        }

    def map_tokens_to_freq_idx(self) -> Dict[str, Tuple[int, int]]:
        """
        Build vocab from instances
        return the word -> (freq, idx)
        :return:
        """
        counter = Counter()
        for instance in self.instances:
            counter.update(instance)

        # List[Tuple], in decreasing order of frequency
        ordered = sorted(counter.items(), key=itemgetter(1), reverse=True)

        vocab = {}
        for idx, (token, freq) in enumerate(ordered):
            vocab[token] = (freq, len(self.special_vocab) + idx)

        # Special tokens win when a token is present in both dictionaries.
        vocab = {**vocab, **self.special_vocab}

        # If vocab and special vocab share a token, the merge above overwrites
        # its index, so the indices are recalculated from the frequencies.
        # Equal frequencies are ordered by their previous index, descending.
        ranked = sorted(vocab.items(), key=itemgetter(1), reverse=True)
        new_vocab = {}
        for idx, (token, (freq, _)) in enumerate(ranked):
            new_vocab[token] = (freq, idx)

        return new_vocab

    def clip_on_mincount(
            self,
            vocab: Dict[str, Tuple[int, int]]) -> Dict[str, Tuple[int, int]]:
        """
        Clip the vocab based on min count
        We decide to keep the word and it count
        We just change the idx of the token to idx of the unknown token
        :param vocab: type: Dict[str, Tuple[int, int]]
        Modified in place
        :return: vocab: type: Dict[str, Tuple[int, int]]
        The new vocab
        """
        unk_idx = vocab[self.unk_token][1]
        # Only values are reassigned, so iterating while updating is safe.
        for key, (freq, idx) in vocab.items():
            if freq < self.min_count:
                vocab[key] = (freq, unk_idx)

        return vocab

    def clip_on_max_num(
            self,
            vocab: Dict[str, Tuple[int, int]]) -> Dict[str, Tuple[int, int]]:
        """
        Clip the vocab based on the maximum number of words
        We return `max_num_tokens + len(self.special_vocab)` words effectively
        The rest of them will be mapped to `self.unk_token`
        :param vocab: type: Dict[str, Tuple[int, int]]
        Modified in place
        :return: vocab: type: Dict[str, Tuple[int, int]]
        The new vocab
        """
        # ``max_num_tokens`` defaults to None, which means "no upper limit".
        if self.max_num_tokens is None:
            return vocab

        unk_idx = vocab[self.unk_token][1]
        limit = len(self.special_vocab) + self.max_num_tokens
        for key, (freq, idx) in vocab.items():
            if idx >= limit:
                vocab[key] = (freq, unk_idx)

        return vocab

    def _add_token(self, token: str, save_vocab: bool = False):
        """
        Add token to an already existing vocabulary
        :param token: type str
        :param save_vocab: type: bool
        Write the vocab to ``self.store_location`` after adding
        :return:
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        if token not in self.vocab:
            new_idx = max(idx for freq, idx in self.vocab.values()) + 1
            self.vocab[token] = (1, new_idx)
            self.idx2token[new_idx] = token
            self.token2idx[token] = new_idx

        if save_vocab:
            self.save_to_file(self.store_location)  # this can be expensive.

    def add_tokens(self, tokens: List[str]):
        """
        Add several tokens and save once at the end
        :param tokens: type: List[str]
        :return:
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        for token in tokens:
            self._add_token(token, save_vocab=False)

        if self.store_location:
            self.save_to_file(self.store_location)

    def build_vocab(self) -> Dict[str, Tuple[int, int]]:
        """
        Restore the vocab from ``store_location`` if that file exists,
        otherwise build it from ``instances`` and save it when a store
        location is set
        :return: vocab: type: Dict[str, Tuple[int, int]]
        """
        if self.store_location and os.path.isfile(self.store_location):
            vocab_object = self.load_from_file(self.store_location)
            self.msg_printer.good("Loaded vocab from file {0}".format(
                self.store_location))
            self.vocab = vocab_object.vocab
            self.orig_vocab = vocab_object.orig_vocab
            self.idx2token = vocab_object.idx2token
            self.token2idx = vocab_object.token2idx
            vocab = vocab_object.vocab
        else:
            self.msg_printer.info("BUILDING VOCAB")
            vocab = self.map_tokens_to_freq_idx()
            # The clipping below mutates ``vocab``; keep an untouched copy.
            self.orig_vocab = deepcopy(vocab)
            vocab = self.clip_on_mincount(vocab)
            vocab = self.clip_on_max_num(vocab)
            self.vocab = vocab
            self.idx2token = self.get_idx2token_mapping()
            self.token2idx = self.get_token2idx_mapping()

            if self.store_location:
                self.msg_printer.info("SAVING VOCAB TO FILE")
                self.save_to_file(self.store_location)

        return vocab

    def get_vocab_len(self) -> int:
        """Return the number of distinct indices in the clipped vocab."""
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        length = len(set(idx for freq, idx in self.vocab.values()))
        return length

    def get_orig_vocab_len(self) -> int:
        """Return the number of distinct indices in the unclipped vocab."""
        if not self.orig_vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        length = len(set(idx for freq, idx in self.orig_vocab.values()))
        return length

    def get_token2idx_mapping(self) -> Dict[str, int]:
        """Return token -> idx built from the clipped vocab."""
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        return {word: idx for word, (freq, idx) in self.vocab.items()}

    def get_idx2token_mapping(self) -> Dict[int, str]:
        """Return idx -> token built from the clipped vocab.

        Several tokens can share one idx after clipping; the last one in
        vocab iteration order is kept.
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        return {idx: word for word, (freq, idx) in self.vocab.items()}

    def save_to_file(self, filename: str):
        """
        :param filename: str
        The filename where the result to the file will be stored
        The vocab will be stored in the json file name
        Please make sure that this is a json filename
        :return: None
        The whole vocab object will be saved to the file
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        vocab_state = dict()
        vocab_state["options"] = {
            "max_num_words": self.max_num_tokens,
            "min_count": self.min_count,
            "unk_token": self.unk_token,
            "pad_token": self.pad_token,
            "start_token": self.start_token,
            "end_token": self.end_token,
            "special_token_freq": self.special_token_freq,
            "embedding_type": self.embedding_type,
            "embedding_dimension": self.embedding_dimension,
            "special_vocab": self.special_vocab,
        }
        vocab_state["vocab"] = self.vocab
        vocab_state["orig_vocab"] = self.orig_vocab
        try:
            with open(filename, "w") as fp:
                json.dump(vocab_state, fp)
        except FileNotFoundError:
            # Saving is best effort: a missing directory is reported only.
            print("You passed {0} for the filename. Please check whether "
                  "the path exists and try again".format(filename))

    @classmethod
    def load_from_file(cls, filename: str) -> "Vocab":
        """
        Restore a vocab written by ``save_to_file``
        :param filename: type: str
        The json file to read
        :return: type: Vocab
        The restored vocab. If the file does not exist a message is printed
        and ``None`` is returned
        """
        try:
            with open(filename, "r") as fp:
                vocab_state = json.load(fp)
        except FileNotFoundError:
            print("You passed {0} for the filename. Please check whether "
                  "the path exists and try again. Please pass "
                  "a json file".format(filename))
            return None

        vocab_options = vocab_state["options"]
        vocab_dict = vocab_state["vocab"]
        orig_vocab_dict = vocab_state["orig_vocab"]

        # restore the object from the stored property values
        vocab = cls(
            max_num_tokens=vocab_options["max_num_words"],
            min_count=vocab_options["min_count"],
            unk_token=vocab_options["unk_token"],
            pad_token=vocab_options["pad_token"],
            start_token=vocab_options["start_token"],
            end_token=vocab_options["end_token"],
            instances=None,
            special_token_freq=vocab_options["special_token_freq"],
            store_location=filename,
            embedding_type=vocab_options["embedding_type"],
            embedding_dimension=vocab_options["embedding_dimension"],
        )

        # instead of building the vocab, set the vocab from vocab_dict
        vocab.set_vocab(vocab=vocab_dict)
        vocab.set_orig_vocab(orig_vocab_dict)
        idx2token = vocab.get_idx2token_mapping()
        token2idx = vocab.get_token2idx_mapping()
        vocab.set_idx2token(idx2token)
        vocab.set_token2idx(token2idx)
        return vocab

    def get_token_from_idx(self, idx: int) -> str:
        """
        :param idx: type: int
        :return: type: str
        The token stored at ``idx``; ``unk_token`` for the unknown index
        :raises ValueError: if the vocab is not built or ``idx`` is not in it
        """
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        if not self.idx2token:
            self.idx2token = self.get_idx2token_mapping()

        try:
            # Clipped tokens share the unknown index, so answer it explicitly.
            if idx == self.special_vocab[self.unk_token][1]:
                return self.unk_token
            return self.idx2token[idx]
        except KeyError:
            vocab_len = self.get_vocab_len()
            message = ("You tried to access idx {0} of the vocab. "
                       "The length of the vocab is {1}. Please provide "
                       "a number between 0 and {2}".format(
                           idx, vocab_len, vocab_len - 1))
            raise ValueError(message)

    def get_idx_from_token(self, token: str) -> int:
        """
        :param token: type: str
        :return: type: int
        The idx of ``token``, or the idx of ``unk_token`` if it is not known
        :raises ValueError: if the vocab is not built
        """
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        if not self.token2idx:
            self.token2idx = self.get_token2idx_mapping()

        try:
            return self.token2idx[token]
        except KeyError:
            return self.token2idx[self.unk_token]

    def get_topn_frequent_words(self, n: int = 5) -> List[Tuple[str, int]]:
        """
        :param n: type: int
        :return: type: List[Tuple[str, int]]
        Up to ``n`` ``(token, frequency)`` pairs taken from the indices right
        after the special tokens
        """
        idx2token = self.idx2token
        first = len(self.special_vocab)
        max_n = min(first + n, self.get_vocab_len())

        token_freqs = []
        for idx in range(first, max_n):
            token = idx2token[idx]
            token_freqs.append((token, self.orig_vocab[token][0]))

        return token_freqs

    def print_stats(self) -> None:
        """Print original/clipped vocab lengths and the most frequent words."""
        orig_vocab_len = self.get_orig_vocab_len()
        vocab_len = self.get_vocab_len()
        N = 5
        top_n = self.get_topn_frequent_words(n=N)

        data = [
            ("Original vocab length", orig_vocab_len),
            ("Clipped vocab length", vocab_len),
            ("Top {0} words".format(N), top_n),
        ]
        header = ("Stats Description", "#")
        table_string = wasabi.table(data=data, header=header, divider=True)
        self.msg_printer.divider("VOCAB STATS")
        print(table_string)

    def load_embedding(self) -> torch.FloatTensor:
        """
        :return: type: torch.FloatTensor
        One row per vocab idx, in increasing idx order, read from
        ``EmbeddingLoader.vocab_embedding``
        :raises ValueError: if the vocab is not built
        """
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        embedding_loader = EmbeddingLoader(
            token2idx=self.token2idx,
            embedding_type=self.embedding_type,
            embedding_dimension=self.embedding_dimension,
        )

        embeddings = []
        for idx in sorted(self.idx2token.keys()):
            token = self.idx2token[idx]
            embeddings.append(embedding_loader.vocab_embedding[token])

        embeddings = torch.FloatTensor(embeddings)
        return embeddings

    def set_vocab(self, vocab: Dict[str, Tuple[int, int]]):
        """Replace the clipped vocab."""
        self.vocab = vocab

    def set_orig_vocab(self, orig_vocab: Dict[str, Tuple[int, int]]):
        """Replace the unclipped vocab."""
        self.orig_vocab = orig_vocab

    def set_idx2token(self, idx2token: Dict[int, str]):
        """Replace the idx -> token mapping."""
        self.idx2token = idx2token

    def set_token2idx(self, token2idx: Dict[str, int]):
        """Replace the token -> idx mapping."""
        self.token2idx = token2idx

    def get_disp_sentence_from_indices(self, indices: List[int]) -> str:
        """ Given a set of indices in vocab, it returns a sentence, mapping
        every index to its token

        Pad, start and end tokens are left out of the sentence.

        Parameters
        ----------
        indices : List[int]
            A list of indices into the vocab

        Returns
        -------
        str
            The tokens joined by a single space
        """
        special_indices = [
            self.get_idx_from_token(self.pad_token),
            self.get_idx_from_token(self.start_token),
            self.get_idx_from_token(self.end_token),
        ]

        token = [
            self.get_token_from_idx(idx) for idx in indices
            if idx not in special_indices
        ]
        sentence = " ".join(token)
        return sentence
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    bilstm_depth=2,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
    init_tok2vec=None,
    epoch_start=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.

    texts_loc: path of a JSONL file with the input texts, or "-" for stdin.
    vectors_model: name or path of the model to load.
    output_dir: directory for config.json, log.jsonl and the model*.bin files.
    init_tok2vec: optional weights file to continue training from.
    epoch_start: first epoch number. Only used together with an
        `init_tok2vec` file whose name does not match 'model<N>.bin'; must
        then be an integer >= 0 (0 is accepted).
    """
    # Must stay the first statement: it snapshots exactly the arguments.
    config = dict(locals())
    for key in config:
        if isinstance(config[key], Path):
            config[key] = str(config[key])
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:
            # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
        else:
            # Compare against None: 0 is a valid epoch and must not be
            # reported as a missing argument.
            if epoch_start is None:
                msg.fail(
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
                    "'--init-tok2vec'",
                    exits=True,
                )
            elif epoch_start < 0:
                msg.fail(
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
                    % epoch_start,
                    exits=True,
                )
    else:
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
        epoch_start = 0

    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
    row_settings = {
        "widths": (3, 10, 10, 6, 4),
        "aligns": ("r", "r", "r", "r", "r")
    }
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        # Write the tok2vec weights for `epoch` and append one line to
        # log.jsonl; `is_temp` adds a '.temp' marker to the file name.
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" %
                                (epoch, is_temp_str))).open("wb") as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
                util.minibatch_by_words(((text, None) for text in texts),
                                        size=batch_size)):
            docs, count = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            skip_counter += count
            loss = make_update(model,
                               docs,
                               optimizer,
                               objective=loss_func,
                               drop=dropout)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                # stdin has no natural end of epoch, so cut it at 10**7 words
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10**7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
        msg.warn(
            "Skipped {count} empty values".format(count=str(skip_counter)))
    msg.good("Successfully finished pretrain")
import sys
from wasabi import Printer
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data

# Command line entry point: dispatch the first argument to a sub-command.
msg = Printer()

# Sub-command name -> callable. The insertion order is the order in which the
# names are listed in the messages below.
commands = dict(
    [
        ("download", download),
        ("link", link),
        ("info", info),
        ("train", train),
        ("pretrain", pretrain),
        ("debug-data", debug_data),
        ("evaluate", evaluate),
        ("convert", convert),
        ("package", package),
        ("init-model", init_model),
        ("profile", profile),
        ("validate", validate),
    ]
)

# No sub-command given: list what is available (passes exits=1).
if len(sys.argv) == 1:
    msg.info("Available commands", ", ".join(commands), exits=1)

# Remove the sub-command from argv so the remaining arguments belong to it.
command = sys.argv.pop(1)
sys.argv[0] = "spacy %s" % command

if command not in commands:
    available = "Available: {}".format(", ".join(commands))
    msg.fail("Unknown command: {}".format(command), available, exits=1)
else:
    plac.call(commands[command], sys.argv[1:])
# Packaging script body: collects the arguments for setup() and calls it.
# name, version, description, etc. are defined outside this fragment.

# Pinned dependency, handed to setup() as install_requires.
requirements = ["pyarmor==6.2.0"]
# NOTE(review): obfuscate_package is defined elsewhere; it is used here as
# returning (source directory, package_data) - confirm against its definition.
source_path, package_data = obfuscate_package(name)
# The root package ("") is looked up in source_path.
package_dir = {"": source_path}
ext_modules = None
# Use the project's build_py command for the build_py step.
cmdclass = {"build_py": build_py}
setup(
    name=name,
    version=version,
    description=description,
    long_description=long_description,
    long_description_content_type=long_description_content_type,
    url=url,
    author=author,
    author_email=author_email,
    #license = "MIT",
    keywords=keywords,
    install_requires=requirements,
    package_dir=package_dir,
    packages=find_packages(source_path),
    package_data=package_data,
    zip_safe=False,
    ext_modules=ext_modules,
    cmdclass=cmdclass,
)
# Report the value of IS_OBFUSCATE once setup() has returned.
msg.info("Mode: Obfuscation = {}".format(IS_OBFUSCATE))
def package(
    input_dir: Path,
    output_dir: Path,
    meta_path: Optional[Path] = None,
    code_paths: List[Path] = [],
    name: Optional[str] = None,
    version: Optional[str] = None,
    create_meta: bool = False,
    create_sdist: bool = True,
    create_wheel: bool = False,
    force: bool = False,
    silent: bool = True,
) -> None:
    """Generate an installable Python package from a pipeline data directory.

    A directory named '<lang>_<name>-<version>' is created inside
    `output_dir`, holding the copied pipeline data, meta.json, setup.py,
    MANIFEST.in and README.md. Optionally an sdist and/or a wheel is built
    from it.

    input_dir: Directory with the pipeline data.
    output_dir: Existing directory in which the package directory is created.
    meta_path: meta.json to use; defaults to 'meta.json' inside `input_dir`.
    code_paths: Python files to import and copy into the package. The default
        list is only read, never modified.
    name: Overrides the 'name' entry of the meta.
    version: Overrides the 'version' entry of the meta.
    create_meta: Build the meta via `generate_meta` instead of only loading it.
    create_sdist: Run 'setup.py sdist' in the package directory.
    create_wheel: Run 'setup.py bdist_wheel' in the package directory.
    force: Remove an existing package directory instead of failing.
    silent: Passed to the Printer as `no_print`.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if create_wheel and not has_wheel():
        err = "Generating a binary .whl file requires wheel to be installed"
        msg.fail(err, "pip install wheel", exits=1)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate pipeline data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if create_sdist or create_wheel:
        opts = ["sdist" if create_sdist else "", "wheel" if create_wheel else ""]
        msg.info(f"Building package artifacts: {', '.join(opt for opt in opts if opt)}")
    for code_path in code_paths:
        if not code_path.exists():
            msg.fail("Can't find code file", code_path, exits=1)
        # Import the code here so it's available when model is loaded (via
        # get_meta helper). Also verifies that everything works
        util.import_file(code_path.stem, code_path)
    if code_paths:
        msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
    if meta_path and not meta_path.exists():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    # From here on only the normalized input_path / output_path are used, so
    # every path operation works on the values that were validated above.
    meta_path = meta_path or input_path / "meta.json"
    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    meta = get_meta(input_path, meta)
    if meta["requirements"]:
        msg.good(
            f"Including {len(meta['requirements'])} package requirement(s) from "
            f"meta and config",
            ", ".join(meta["requirements"]),
        )
    if name is not None:
        if not name.isidentifier():
            msg.fail(
                f"Model name ('{name}') is not a valid module name. "
                "This is required so it can be imported as a module.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
                exits=1,
            )
        if not _is_permitted_package_name(name):
            msg.fail(
                f"Model name ('{name}') is not a permitted package name. "
                "This is required to correctly load the model with spacy.load.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
                exits=1,
            )
        meta["name"] = name
    if version is not None:
        meta["version"] = version
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(meta, msg)
    errors = validate(ModelMetaSchema, meta)
    if errors:
        msg.fail("Invalid pipeline meta.json")
        print("\n".join(errors))
        sys.exit(1)
    model_name = meta["name"]
    # The package name always carries the language prefix.
    if not model_name.startswith(meta["lang"] + "_"):
        model_name = f"{meta['lang']}_{model_name}"
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name
    if package_path.exists():
        if force:
            shutil.rmtree(str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    package_path.mkdir(parents=True)
    shutil.copytree(str(input_path), str(package_path / model_name_v))
    # Documentation files shipped with the pipeline are also placed at the
    # top level of the package directory.
    for file_name in FILENAMES_DOCS:
        file_path = package_path / model_name_v / file_name
        if file_path.exists():
            shutil.copy(str(file_path), str(main_path))
    readme_path = main_path / "README.md"
    if not readme_path.exists():
        readme = generate_readme(meta)
        create_file(readme_path, readme)
        create_file(package_path / model_name_v / "README.md", readme)
        msg.good("Generated README.md from meta.json")
    else:
        msg.info("Using existing README.md from pipeline directory")
    imports = []
    for code_path in code_paths:
        imports.append(code_path.stem)
        shutil.copy(str(code_path), str(package_path))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    init_py = TEMPLATE_INIT.format(
        imports="\n".join(f"from . import {m}" for m in imports)
    )
    create_file(package_path / "__init__.py", init_py)
    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
        msg.good("Successfully created zipped Python package", zip_file)
    if create_wheel:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
        # The expected wheel file name has runs of underscores collapsed.
        wheel_name_squashed = re.sub("_+", "_", model_name_v)
        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
        msg.good("Successfully created binary wheel", wheel)
    if "__" in model_name:
        msg.warn(
            f"Model name ('{model_name}') contains a run of underscores. "
            "Runs of underscores are not significant in installed package names.",
        )
def debug_data(
    lang,
    train_path,
    dev_path,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    ignore_validation=False,
    verbose=False,
    no_format=False,
):
    """Analyze training and development data and print a diagnostic report.

    lang: Language identifier, used to create a blank pipeline when no
        base model is given.
    train_path: Location of the training data; must expose ``.exists()``.
    dev_path: Location of the development data; must expose ``.exists()``.
    base_model: Optional model to load instead of a blank pipeline.
    pipeline (str): Comma-separated names of the components to check.
    ignore_warnings (bool): Passed to the printer to silence warnings.
    ignore_validation (bool): Don't exit when format validation fails.
    verbose (bool): Also print the detailed, optional messages.
    no_format (bool): Disable pretty-printing of the output.

    The process exits with status 1 when an input path is missing, the
    corpus cannot be loaded, or at least one check reported a failure.
    """
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exist if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()

    msg.divider("Data format validation")

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # TODO: move validation to GoldCorpus in order to be able to load from dir
    train_data_errors = []  # TODO: validate_json
    dev_data_errors = []  # TODO: validate_json
    if not train_data_errors:
        msg.good("Training data JSON format is valid")
    if not dev_data_errors:
        msg.good("Development data JSON format is valid")
    for error in train_data_errors:
        msg.fail("Training data: {}".format(error))
    for error in dev_data_errors:
        msg.fail("Development data: {}".format(error))
    if (train_data_errors or dev_data_errors) and not ignore_validation:
        sys.exit(1)

    # Create the gold corpus to be able to better analyze data
    loading_train_error_message = ""
    loading_dev_error_message = ""
    with msg.loading("Loading corpus..."):
        corpus = GoldCorpus(train_path, dev_path)
        try:
            train_docs = list(corpus.train_docs(nlp))
            train_docs_unpreprocessed = list(
                corpus.train_docs_without_preprocessing(nlp)
            )
        except ValueError as e:
            loading_train_error_message = (
                "Training data cannot be loaded: {}".format(str(e))
            )
        try:
            dev_docs = list(corpus.dev_docs(nlp))
        except ValueError as e:
            loading_dev_error_message = (
                "Development data cannot be loaded: {}".format(str(e))
            )
    if loading_train_error_message or loading_dev_error_message:
        if loading_train_error_message:
            msg.fail(loading_train_error_message)
        if loading_dev_error_message:
            msg.fail(loading_dev_error_message)
        sys.exit(1)
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_train_data = _compile_gold(train_docs, pipeline)
    gold_train_unpreprocessed_data = _compile_gold(
        train_docs_unpreprocessed, pipeline
    )
    gold_dev_data = _compile_gold(dev_docs, pipeline)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn("{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs)
        )
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
            ),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        "{} total {} in the data ({} unique)".format(
            n_words,
            "word" if n_words == 1 else "words",
            len(gold_train_data["words"]),
        )
    )
    if gold_train_data["n_misaligned_words"] > 0:
        msg.warn(
            "{} misaligned tokens in the training data".format(
                gold_train_data["n_misaligned_words"]
            )
        )
    if gold_dev_data["n_misaligned_words"] > 0:
        msg.warn(
            "{} misaligned tokens in the dev data".format(
                gold_dev_data["n_misaligned_words"]
            )
        )
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)
        ),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            "{} vectors ({} unique keys, {} dimensions)".format(
                len(nlp.vocab.vectors),
                nlp.vocab.vectors.n_keys,
                nlp.vocab.vectors_length,
            )
        )
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = {
            label for label in gold_train_data["ner"] if label not in ("O", "-")
        }
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False

        msg.divider("Named Entity Recognition")
        msg.info(
            "{} new {}, {} existing {}".format(
                len(new_labels),
                "label" if len(new_labels) == 1 else "labels",
                len(existing_labels),
                "label" if len(existing_labels) == 1 else "labels",
            )
        )
        missing_values = label_counts["-"]
        msg.text(
            "{} missing {} (tokens with '-' label)".format(
                missing_values, "value" if missing_values == 1 else "values"
            )
        )
        if new_labels:
            labels_with_counts = [
                (label, count)
                for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)),
                show=verbose,
            )

        if gold_train_data["ws_ents"]:
            msg.fail(
                "{} invalid whitespace entity spans".format(
                    gold_train_data["ws_ents"]
                )
            )
            has_ws_ents_error = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]
                    )
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(label)
                    )
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = list(gold_train_data["textcat"])
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info(
            "Text Classification: {} new label(s), {} existing label(s)".format(
                len(new_labels), len(existing_labels)
            )
        )
        if new_labels:
            labels_with_counts = _format_labels(
                gold_train_data["textcat"].most_common(), counts=True
            )
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)),
                show=verbose,
            )

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = list(gold_train_data["tags"])
        tag_map = nlp.Defaults.tag_map
        msg.info(
            "{} {} in data ({} {} in tag map)".format(
                len(labels),
                "label" if len(labels) == 1 else "labels",
                len(tag_map),
                "label" if len(tag_map) == 1 else "labels",
            )
        )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good(
                "All labels present in tag map for language '{}'".format(nlp.lang)
            )
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang
                )
            )

    if "parser" in pipeline:
        # Reset the flag for this section: it is otherwise only assigned by
        # the NER section or when a rare label is found, which left it
        # undefined (or stale) when it is read further down.
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        n_sents = gold_train_data["n_sents"]
        # Guard against data without any sentence boundaries.
        avg_sent_length = gold_train_data["n_words"] / n_sents if n_sents else 0.0
        msg.info(
            "Found {} sentence{} with an average length of {:.1f} words.".format(
                n_sents, "s" if n_sents > 1 else "", avg_sent_length
            )
        )

        # profile labels
        train_deps = gold_train_data["deps"]
        raw_train_deps = gold_train_unpreprocessed_data["deps"]
        labels_train = list(train_deps)
        labels_train_unpreprocessed = list(raw_train_deps)
        labels_dev = list(gold_dev_data["deps"])

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            msg.info(
                "Found {} nonprojective train sentence{}".format(
                    gold_train_unpreprocessed_data["n_nonproj"],
                    "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
                )
            )
        if gold_dev_data["n_nonproj"] > 0:
            msg.info(
                "Found {} nonprojective dev sentence{}".format(
                    gold_dev_data["n_nonproj"],
                    "s" if gold_dev_data["n_nonproj"] > 1 else "",
                )
            )

        msg.info(
            "{} {} in train data".format(
                len(labels_train_unpreprocessed),
                "label" if len(labels_train_unpreprocessed) == 1 else "labels",
            )
        )
        msg.info(
            "{} {} in projectivized train data".format(
                len(labels_train), "label" if len(labels_train) == 1 else "labels"
            )
        )

        labels_with_counts = _format_labels(
            raw_train_deps.most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in raw_train_deps:
            if raw_train_deps[label] <= DEP_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for label '{}' ({})".format(
                        label, raw_train_deps[label]
                    )
                )
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = [
            "{}: {}".format(label, str(train_deps[label]))
            for label in train_deps
            if train_deps[label] <= DEP_LABEL_THRESHOLD and "||" in label
        ]
        if rare_projectivized_labels:
            msg.warn(
                "Low number of examples for {} label{} in the "
                "projectivized dependency trees used for training. You may "
                "want to projectivize labels such as punct before "
                "training in order to improve parser performance.".format(
                    len(rare_projectivized_labels),
                    "s" if len(rare_projectivized_labels) > 1 else "",
                )
            )
            msg.warn(
                "Projectivized labels with low numbers of examples: "
                "{}".format("\n".join(rare_projectivized_labels)),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train (sorted so the report is deterministic)
        only_in_train = sorted(set(labels_train) - set(labels_dev))
        if only_in_train:
            msg.warn(
                "The following labels were found only in the train data: "
                "{}".format(", ".join(only_in_train)),
                show=verbose,
            )

        # labels only in dev
        only_in_dev = sorted(set(labels_dev) - set(labels_train))
        if only_in_dev:
            msg.warn(
                "The following labels were found only in the dev data: "
                + ", ".join(only_in_dev),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                "To train a parser, your data should include at "
                "least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                "Multiple root labels ({}) ".format(
                    ", ".join(gold_train_unpreprocessed_data["roots"])
                )
                + "found in training data. spaCy's parser uses a single root "
                "label ROOT so this distinction will not be available."
            )

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                "Found {} nonprojective projectivized train sentence{}".format(
                    gold_train_data["n_nonproj"],
                    "s" if gold_train_data["n_nonproj"] > 1 else "",
                )
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                "Found {} projectivized train sentence{} with cycles".format(
                    gold_train_data["n_cycles"],
                    "s" if gold_train_data["n_cycles"] > 1 else "",
                )
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            "{} {} passed".format(
                good_counts, "check" if good_counts == 1 else "checks"
            )
        )
    if warn_counts:
        msg.warn(
            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
        )
    if fail_counts:
        msg.fail(
            "{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors")
        )
        # Signal failure to the caller / shell.
        sys.exit(1)
def train(
    nlp: "Language",
    output_path: Optional[Path] = None,
    *,
    use_gpu: int = -1,
    stdout: IO = sys.stdout,
    stderr: IO = sys.stderr,
) -> Tuple["Language", Optional[Path]]:
    """Train a pipeline.

    nlp (Language): The initialized nlp object with the full config.
    output_path (Path): Optional output path to save trained model to. When
        None, no checkpoints are written.
    use_gpu (int): Whether to train on GPU. The configured GPU allocator is
        only set when this is >= 0. Make sure to call require_gpu before
        calling this function.
    stdout (file): A file-like object to write output messages. To disable
        printing, set to io.StringIO.
    stderr (file): A second file-like object to write output messages. To disable
        printing, set to io.StringIO.

    RETURNS (tuple): The final nlp object and the path to the exported model
        (None when no output_path was given).
    RAISES: Any exception raised by the training loop is re-raised after the
        logger has been finalized and, if output_path is set, a last
        checkpoint has been saved.
    """
    # We use no_print here so we can respect the stdout/stderr options.
    msg = Printer(no_print=True)
    # Resolve variable references in the config before reading settings.
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # T holds the resolved [training] section of the config.
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    score_weights = T["score_weights"]
    batcher = T["batcher"]
    train_logger = T["logger"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])

    # Helper function to save checkpoints. This is a closure for convenience,
    # to avoid passing in all the args all the time.
    def save_checkpoint(is_best):
        """Write the current pipeline to DIR_MODEL_LAST under output_path and,
        if is_best is truthy, copy that directory over DIR_MODEL_BEST.
        Only called when output_path is not None.
        """
        with nlp.use_params(optimizer.averages):
            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
        if is_best:
            # Avoid saving twice (saving will be more expensive than
            # the dir copy)
            if (output_path / DIR_MODEL_BEST).exists():
                shutil.rmtree(output_path / DIR_MODEL_BEST)
            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)

    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Components that should set annotations on update
    annotating_components = T["annotating_components"]
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        create_train_batches(nlp, train_corpus, batcher, T["max_epochs"]),
        create_evaluation_callback(nlp, dev_corpus, score_weights),
        dropout=T["dropout"],
        accumulate_gradient=T["accumulate_gradient"],
        patience=T["patience"],
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
        annotating_components=annotating_components,
    )
    clean_output_dir(output_path)
    # The printer was created with no_print=True, so its methods return the
    # formatted text, which is written to the caller-provided stdout.
    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
    if frozen_components:
        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
    if annotating_components:
        stdout.write(
            msg.info(f"Set annotations on update for: {annotating_components}")
            + "\n"
        )
    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
    # The logger is set up with the frozen components disabled.
    with nlp.select_pipes(disable=frozen_components):
        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
    try:
        for batch, info, is_best_checkpoint in training_step_iterator:
            # is_best_checkpoint is None on steps without a checkpoint
            # (presumably steps without an evaluation -- confirm against
            # train_while_improving).
            if is_best_checkpoint is not None:
                with nlp.select_pipes(disable=frozen_components):
                    update_meta(T, nlp, info)
                if output_path is not None:
                    save_checkpoint(is_best_checkpoint)
                    info["output_path"] = str(output_path / DIR_MODEL_LAST)
            # Non-checkpoint steps are logged as None.
            log_step(info if is_best_checkpoint is not None else None)
    except Exception as e:
        if output_path is not None:
            stdout.write(
                msg.warn(
                    f"Aborting and saving the final best model. "
                    f"Encountered exception: {repr(e)}"
                )
                + "\n"
            )
        raise e
    finally:
        # Runs on success and on error: close the logger and persist the
        # latest state (never marked as best).
        finalize_logger()
        if output_path is not None:
            save_checkpoint(False)
    # This will only run if we didn't hit an error
    # NOTE(review): use_params is used as a context manager in
    # save_checkpoint above; calling it here without `with` may have no
    # lasting effect -- confirm the intended behavior.
    if optimizer.averages:
        nlp.use_params(optimizer.averages)
    if output_path is not None:
        stdout.write(
            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
            + "\n"
        )
        return (nlp, output_path / DIR_MODEL_LAST)
    else:
        return (nlp, None)
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer with an approximate
    language-modelling objective: a tok2vec component is trained to predict
    the pre-trained vectors of the loaded model.

    The settings are written to config.json in the output directory, the
    tok2vec weights are written after every epoch (and, when n_save_every
    is set, as ".temp" files every n_save_every batches), and one line per
    save is appended to log.jsonl.

    Input texts are read as JSONL from texts_loc, or streamed from stdin
    when texts_loc is "-". The weight files can later be passed to the
    'spacy train' command; all settings must then match between
    pretraining and training. The technique is experimental.
    """
    # Must stay the first statement: captures exactly the call arguments.
    config = dict(locals())
    msg = Printer()
    util.fix_random_seed(seed)
    if prefer_gpu():
        msg.info("Using GPU")
    else:
        msg.info("Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Decide once whether the texts are streamed from stdin or read from disk
    from_stdin = texts_loc == "-"
    if from_stdin:
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")
    else:
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))

    pretrained_vectors = nlp.vocab.vectors.name if use_vectors else None
    tok2vec = Tok2Vec(
        width,
        embed_rows,
        conv_depth=depth,
        pretrained_vectors=pretrained_vectors,
        bilstm_depth=0,  # Requires PyTorch. Experimental.
        cnn_maxout_pieces=3,  # You can try setting this higher
        subword_features=True,  # Set to False for Chinese etc
    )
    model = create_pretraining_model(nlp, tok2vec)
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)

    msg.divider("Pre-training tok2vec layer")
    row_settings = {
        "widths": (3, 10, 10, 6, 4),
        "aligns": ("r", "r", "r", "r", "r"),
    }
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        # Write the averaged tok2vec weights and append a progress record.
        suffix = ".temp" if is_temp else ""
        weights_path = output_dir / ("model%d%s.bin" % (epoch, suffix))
        log_path = output_dir / "log.jsonl"
        with model.use_params(optimizer.averages):
            with weights_path.open("wb") as weights_file:
                weights_file.write(model.tok2vec.to_bytes())
            record = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with log_path.open("a") as log_file:
                log_file.write(srsly.json_dumps(record) + "\n")

    for epoch in range(n_iter):
        word_batches = util.minibatch_by_words(
            ((text, None) for text in texts), size=batch_size
        )
        for batch_id, batch in enumerate(word_batches):
            docs = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                # A stdin stream has no natural end of epoch: cap it by words
                if from_stdin and tracker.words_per_epoch[epoch] >= 10**7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if not from_stdin:
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
class Vocab:
    """Vocabulary mapping tokens to ``(frequency, index)`` pairs.

    The vocabulary is built from flat token instances, optionally prefixed
    with special tokens (unknown, padding, start, end), clipped by minimum
    count and maximum size, and can be stored to / restored from JSON.
    """

    def __init__(
        self,
        instances: Optional[List[List[str]]] = None,
        max_num_tokens: Optional[int] = None,
        min_count: int = 1,
        unk_token: str = "<UNK>",
        pad_token: str = "<PAD>",
        start_token: str = "<SOS>",
        end_token: str = "<EOS>",
        special_token_freq: float = 1e10,
        store_location: Optional[str] = None,
        max_instance_length: int = 100,
        include_special_vocab: bool = True,
        preprocessing_pipeline: Optional[List[Callable]] = None,
    ):
        """

        Parameters
        ----------
        instances : Optional[List[List[str]]]
            A list of tokenized instances
        max_num_tokens : int
            The maximum number of tokens to be used in the vocab
            All the other tokens above this number will be replaced by UNK.
            If this is not passed then the maximum possible number will be used
        min_count : int
            All words that do not have min count will be mapped to `unk_token`
        unk_token : str
            This token will be used for unknown words
        pad_token : str
            This token will be used for <PAD> words
        start_token : str
            This token will be used for start of line indicator
        end_token : str
            This token will be used for end of sentence indicator
        special_token_freq : float
            special tokens should have high frequency.
        store_location : str
            The users can provide a store location optionally.
            The vocab will be stored in the location
            If the file exists then, the vocab will be restored from the file,
            rather than building it.
        max_instance_length : int
            Every vocab is related to a namespace.
            Every instance in that namespace will be clipped or padded to
            this length
        include_special_vocab : bool
            Boolean value to indicate whether special vocab should be
            included or no
            If this is false, you will have to set add_start_end_token to
            False and you cannot pad your instances. This is mostly set for
            labels - such as for classification that require no padding.
            For such cases please make sure that min_count is always 1 and
            max_num_tokens is always None. Otherwise some of the labels will
            be missed and it might result in error
        preprocessing_pipeline: List[Callable]
            You can add a set of callables that take in a list of str and
            return a list of str for pre-processing. For example methods
            look at instance_preprocessing module in sciwing.preprocessing
        """
        self.instances = instances
        self.max_num_tokens = max_num_tokens
        self.min_count = min_count
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.start_token = start_token
        self.end_token = end_token
        self.special_token_freq = special_token_freq
        self.vocab = None
        self.orig_vocab = None
        self.idx2token = None
        self.token2idx = None
        self.store_location = store_location
        self.max_instance_length = max_instance_length
        self.include_special_vocab = include_special_vocab
        self.preprocessing_pipeline = preprocessing_pipeline
        self.msg_printer = Printer()

        # store the special tokens
        if self.include_special_vocab:
            # Decreasing frequencies keep the special tokens at indices 0-3
            # once the vocab is sorted by frequency.
            self.special_vocab = {
                self.unk_token: (self.special_token_freq + 3, 0),
                self.pad_token: (self.special_token_freq + 2, 1),
                self.start_token: (self.special_token_freq + 1, 2),
                self.end_token: (self.special_token_freq, 3),
            }
        else:
            if self.min_count != 1:
                self.msg_printer.warn(
                    "Warning: You are building vocab without special vocab. "
                    "Please make sure that min_count is 1"
                )
            if self.max_num_tokens is not None:
                self.msg_printer.warn(
                    "You are building vocab without special vocab. Please make "
                    "sure that max_num_tokens is None"
                )
            self.special_vocab = {}

        if instances is not None:
            # just flatten the entire instance
            self.instances = list(flatten(instances))
            # Guard against empty input before peeking at the first element.
            if self.instances and isinstance(self.instances[0], Token):
                self.instances = [tok.text for tok in self.instances]
            if self.preprocessing_pipeline:
                self.instances = self.apply_preprocessing()

    def apply_preprocessing(self):
        """Run every preprocessor, in order, over a copy of the instances."""
        instances = deepcopy(self.instances)
        for preprocessor in self.preprocessing_pipeline:
            instances = preprocessor(instances)
        return instances

    def map_tokens_to_freq_idx(self) -> Dict[str, Tuple[int, int]]:
        """
        Build vocab from instances
        return the word -> (freq, idx)
        :return:
        """
        # Counter does not mutate its input, so no copy is needed.
        counter = Counter(self.instances)

        # tokens in decreasing order of their frequencies
        vocab = {}
        for idx, (token, freq) in enumerate(counter.most_common()):
            vocab[token] = (freq, len(self.special_vocab) + idx)

        # Special tokens override regular tokens of the same name.
        vocab = {**vocab, **self.special_vocab}

        # A regular token sharing its name with a special token leaves a gap
        # in the indices, so recalculate them from the (freq, idx) ordering.
        ordered = sorted(vocab.items(), key=itemgetter(1), reverse=True)
        return {
            token: (freq, idx) for idx, (token, (freq, _)) in enumerate(ordered)
        }

    def clip_on_mincount(
        self, vocab: Dict[str, Tuple[int, int]]
    ) -> Dict[str, Tuple[int, int]]:
        """
        Clip the vocab based on min count
        We decide to keep the word and it count
        We just change the idx of the token to idx of the unknown token
        :return: vocab: type: Dict[str, Tuple[int, int]]
        The new vocab (the same dictionary, modified in place)
        """
        for key, (freq, idx) in vocab.items():
            if freq < self.min_count:
                vocab[key] = (freq, vocab[self.unk_token][1])
        return vocab

    def clip_on_max_num(
        self, vocab: Dict[str, Tuple[int, int]]
    ) -> Dict[str, Tuple[int, int]]:
        """
        Clip the vocab based on the maximum number of words
        We return `max_num_words + len(self.special_vocab)` words effectively
        The rest of them will be mapped to `self.unk_token`

        Parameters
        ----------
        vocab : Dict[str, Tuple[int, int]]
            The mapping from token to idx and frequency

        Returns
        -------
        Dict[str, Tuple[int, int]]
            The new vocab (the same dictionary, modified in place)
        """
        limit = len(self.special_vocab) + self.max_num_tokens
        for key, (freq, idx) in vocab.items():
            if idx >= limit:
                vocab[key] = (freq, vocab[self.unk_token][1])
        return vocab

    def _add_token(self, token: str, save_vocab: bool = False):
        """
        Add token to an already existing vocabulary

        :param token: type str
        :param save_vocab: write the vocab to ``store_location`` afterwards
        :return:
        :raises ValueError: if the vocab has not been built yet
        """
        if self.vocab is None:
            raise ValueError("Build vocab first by calling build_vocab()")
        # Build the lookup tables lazily if only the vocab was set.
        if self.idx2token is None:
            self.idx2token = self.get_idx2token_mapping()
        if self.token2idx is None:
            self.token2idx = self.get_token2idx_mapping()

        if token not in self.vocab:
            highest_idx = max((idx for _, idx in self.vocab.values()), default=-1)
            new_idx = highest_idx + 1
            self.vocab[token] = (1, new_idx)
            self.idx2token[new_idx] = token
            self.token2idx[token] = new_idx

        if save_vocab:
            self.save_to_file(self.store_location)  # this can be expensive.

    def add_tokens(self, tokens: List[str]):
        """Add several tokens and save once if a store location is set.

        :raises ValueError: if the vocab has not been built yet
        """
        if self.vocab is None:
            raise ValueError("Build vocab first by calling build_vocab()")
        for token in tokens:
            self._add_token(token, save_vocab=False)
        if self.store_location:
            self.save_to_file(self.store_location)

    def build_vocab(self) -> Dict[str, Tuple[int, int]]:
        """Load the vocab from ``store_location`` if that file exists,
        otherwise build it from the instances (and save it when a store
        location is configured). Returns the token -> (freq, idx) mapping.
        """
        if self.store_location and os.path.isfile(self.store_location):
            vocab_object = self.load_from_file(self.store_location)
            self.msg_printer.good(
                "Loaded vocab from file {0}".format(self.store_location)
            )
            self.vocab = vocab_object.vocab
            self.orig_vocab = vocab_object.orig_vocab
            self.idx2token = vocab_object.idx2token
            self.token2idx = vocab_object.token2idx
            vocab = vocab_object.vocab
        else:
            self.msg_printer.info("BUILDING VOCAB")
            vocab = self.map_tokens_to_freq_idx()

            # The clipping below mutates `vocab` in place, so keep a copy
            self.orig_vocab = deepcopy(vocab)

            # set max num of tokens to maximum possible if it is not set
            if self.max_num_tokens is None:
                self.max_num_tokens = len(self.orig_vocab.keys())

            vocab = self.clip_on_mincount(vocab)
            vocab = self.clip_on_max_num(vocab)
            self.vocab = vocab
            self.idx2token = self.get_idx2token_mapping()
            self.token2idx = self.get_token2idx_mapping()

            if self.store_location:
                self.msg_printer.info("SAVING VOCAB TO FILE")
                self.save_to_file(self.store_location)

        return vocab

    def get_vocab_len(self) -> int:
        """Number of distinct indices in the (clipped) vocab."""
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")
        return len({idx for _, idx in self.vocab.values()})

    def get_orig_vocab_len(self) -> int:
        """Number of distinct indices in the unclipped vocab."""
        if not self.orig_vocab:
            raise ValueError("Build vocab first by calling build_vocab()")
        return len({idx for _, idx in self.orig_vocab.values()})

    def get_token2idx_mapping(self) -> Dict[str, int]:
        """Return the token -> index mapping of the (clipped) vocab."""
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")
        return {token: idx for token, (_, idx) in self.vocab.items()}

    def get_idx2token_mapping(self) -> Dict[int, str]:
        """Return the index -> token mapping of the (clipped) vocab.

        Clipped tokens share the index of the unknown token; that index
        always maps back to the unknown token, never to a clipped word.
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        idx2token = {}
        for token, (_, idx) in self.vocab.items():
            # First token to claim an index keeps it.
            idx2token.setdefault(idx, token)
        if self.unk_token in self.vocab:
            idx2token[self.vocab[self.unk_token][1]] = self.unk_token
        return idx2token

    def save_to_file(self, filename: str):
        """
        :param filename: str
        The filename where the result to the file will be stored
        The vocab will be stored in the json file name
        Please make sure that this is a json filename
        :return: None
        The whole vocab object will be saved to the file
        A missing parent directory is reported on stdout, not raised.
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        vocab_state = {
            "options": {
                "max_num_words": self.max_num_tokens,
                "min_count": self.min_count,
                "unk_token": self.unk_token,
                "pad_token": self.pad_token,
                "start_token": self.start_token,
                "end_token": self.end_token,
                "special_token_freq": self.special_token_freq,
                "special_vocab": self.special_vocab,
            },
            "vocab": self.vocab,
            "orig_vocab": self.orig_vocab,
        }
        try:
            with open(filename, "w") as fp:
                json.dump(vocab_state, fp)
        except FileNotFoundError:
            print(
                "You passed {0} for the filename. Please check whether "
                "the path exists and try again".format(filename)
            )

    @classmethod
    def load_from_file(cls, filename: str) -> "Vocab":
        """Restore a vocab object from a JSON file written by `save_to_file`.

        If the file does not exist, a message is printed and None is
        returned.
        """
        try:
            with open(filename, "r") as fp:
                vocab_state = json.load(fp)
        except FileNotFoundError:
            print(
                "You passed {0} for the filename. Please check whether "
                "the path exists and try again. Please pass "
                "a json file".format(filename)
            )
            return None

        vocab_options = vocab_state["options"]

        # restore the object with the property values from the file
        vocab = cls(
            max_num_tokens=vocab_options["max_num_words"],
            min_count=vocab_options["min_count"],
            unk_token=vocab_options["unk_token"],
            pad_token=vocab_options["pad_token"],
            start_token=vocab_options["start_token"],
            end_token=vocab_options["end_token"],
            instances=None,
            special_token_freq=vocab_options["special_token_freq"],
            store_location=filename,
        )
        # An empty saved special vocab means the vocab was built without
        # special tokens; restore that state.
        saved_special_vocab = vocab_options.get("special_vocab")
        if saved_special_vocab is not None and not saved_special_vocab:
            vocab.include_special_vocab = False
            vocab.special_vocab = {}

        # instead of building the vocab, set the vocab from the file
        vocab.set_vocab(vocab=vocab_state["vocab"])
        vocab.set_orig_vocab(vocab_state["orig_vocab"])
        vocab.set_idx2token(vocab.get_idx2token_mapping())
        vocab.set_token2idx(vocab.get_token2idx_mapping())
        return vocab

    def get_token_from_idx(self, idx: int) -> str:
        """Return the token stored at ``idx``.

        :raises ValueError: if the vocab is not built or ``idx`` is beyond
            the last index
        """
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        if not self.idx2token:
            self.idx2token = self.get_idx2token_mapping()

        vocab_len = self.get_vocab_len()
        if idx > vocab_len - 1:
            message = (
                f"You tried to access idx {idx} of the vocab The length of the vocab is "
                f"{vocab_len}. Please Provide Number between 0 and {vocab_len - 1}"
            )
            raise ValueError(message)
        return self.idx2token.get(idx)

    def get_idx_from_token(self, token: str) -> int:
        """Return the index of ``token``; unseen tokens get the index of the
        unknown token (None if there is no unknown token).
        """
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        if not self.token2idx:
            self.token2idx = self.get_token2idx_mapping()

        try:
            return self.token2idx[token]
        except KeyError:
            return self.token2idx.get(self.unk_token, None)

    def get_topn_frequent_words(self, n: int = 5) -> List[Tuple[str, int]]:
        """Return up to ``n`` ``(token, frequency)`` pairs, skipping the
        special tokens, in index order.
        """
        first_idx = len(self.special_vocab)
        last_idx = min(first_idx + n, self.get_vocab_len())
        token_freqs = []
        for idx in range(first_idx, last_idx):
            token = self.idx2token[idx]
            token_freqs.append((token, self.orig_vocab[token][0]))
        return token_freqs

    def print_stats(self) -> None:
        """Print a table with the vocab sizes and the most frequent words."""
        orig_vocab_len = self.get_orig_vocab_len()
        vocab_len = self.get_vocab_len()
        N = 5
        top_n = self.get_topn_frequent_words(n=N)

        data = [
            ("Original vocab length", orig_vocab_len),
            ("Clipped vocab length", vocab_len),
            ("Top {0} words".format(N), top_n),
        ]
        header = ("Stats Description", "#")
        table_string = wasabi.table(data=data, header=header, divider=True)
        self.msg_printer.divider("VOCAB STATS")
        print(table_string)

    def set_vocab(self, vocab: Dict[str, Tuple[int, int]]):
        self.vocab = vocab

    def set_orig_vocab(self, orig_vocab: Dict[str, Tuple[int, int]]):
        self.orig_vocab = orig_vocab

    def set_idx2token(self, idx2token: Dict[int, str]):
        self.idx2token = idx2token

    def set_token2idx(self, token2idx: Dict[str, int]):
        self.token2idx = token2idx

    def get_disp_sentence_from_indices(self, indices: List[int]) -> str:
        """ Given a set of indices in vocab, it returns a sentence mapping
        the index to string. Padding, start and end tokens are left out.

        Parameters
        ----------
        indices : List[int]
            A list of indices where every index is a valid vocab index

        Returns
        -------
        str
            The tokens for the indices, joined by single spaces

        """
        if self.special_vocab:
            special_indices = [
                self.get_idx_from_token(self.pad_token),
                self.get_idx_from_token(self.start_token),
                self.get_idx_from_token(self.end_token),
            ]
        else:
            special_indices = []

        tokens = [
            self.get_token_from_idx(idx)
            for idx in indices
            if idx not in special_indices
        ]
        return " ".join(tokens)

    @property
    def token2idx(self):
        return self._token2idx

    @token2idx.setter
    def token2idx(self, value):
        self._token2idx = value

    @property
    def idx2token(self):
        return self._idx2token

    @idx2token.setter
    def idx2token(self, value):
        self._idx2token = value
def train_evaluate(self):
    """
    Fine-tune a transformer text classifier and report the best checkpoints.

    Loads the spaCy model named by ``self.model_name``, appends a
    ``trf_textcat`` pipe with the labels ``POSITIVE`` and ``NEGATIVE``,
    trains it in mini-batches with a cyclic triangular learning rate and
    evaluates on the dev set every ``self.eval_every`` steps. Training stops
    once ``self.patience`` consecutive checkpoints have passed without a new
    best F-score. Finally the ten best checkpoints are printed/logged, one
    dev text is classified as a smoke test, and, when ``self.output_path``
    is not ``None``, the pipeline is written to disk and reloaded.

    Reads the following attributes: ``model_name``, ``pos_label``, ``alpha``,
    ``weight_decay``, ``l2``, ``lr``, ``batch_size``, ``dropout_rate``,
    ``eval_every``, ``patience`` and ``output_path``.

    Raises
    ------
    ValueError
        If ``self.model_name`` is not one of the supported transformer
        models, or if reading a user supplied dataset fails.
    """
    # check GPU
    spacy.util.fix_random_seed(0)
    is_using_gpu = spacy.prefer_gpu()
    if is_using_gpu:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        print('GPU usage')

    print('Loading model...')
    nlp = spacy.load(self.model_name)
    print(nlp.pipe_names)
    print('Loaded model {}'.format(self.model_name))

    # A chained ``== a or b`` is always truthy; membership is what was meant.
    supported_models = ('en_trf_bertbaseuncased_lg',
                        'en_trf_xlnetbasecased_lg')
    if self.model_name in supported_models:
        textcat = nlp.create_pipe(
            "trf_textcat", config={"architecture": "softmax_class_vector"})
    else:
        raise ValueError('Choose a supported transformer!')

    # Add labels to text classifier
    textcat.add_label('POSITIVE')
    textcat.add_label('NEGATIVE')
    nlp.add_pipe(textcat, last=True)

    # Always bind the local name: it is needed by the evaluation call below
    # whether or not a positive label was configured.
    pos_label = self.pos_label or 'POSITIVE'
    logger.info('Labels: {}'.format(textcat.labels))
    logger.info('Positive label for evaluation: {}'.format(pos_label))

    print('Loading data...')
    # NOTE(review): both paths are force-reset here, so the "own datasets"
    # branch below can never run and the default data is always used.
    # Looks like a debugging left-over - confirm before removing.
    self.train_path = False
    self.dev_path = False
    if self.train_path and self.dev_path:
        # using own datasets
        try:
            train_texts, train_cats = ClassificationDataReader(
            )._prepare_partition(self.train_path)
            dev_texts, dev_cats = ClassificationDataReader(
            )._prepare_partition(self.dev_path)
        except ValueError:
            print('Data path is not valid!')
            # Without data the code below would only fail later with an
            # unrelated NameError, so surface the real problem.
            raise
    else:
        # using IMDB data here
        (train_texts, train_cats), (dev_texts, dev_cats) = \
            ClassificationDataReader().load_data()

    usage_message = 'Using {} training docs, {} evaluations'.format(
        len(train_texts), len(dev_texts))
    print(usage_message)
    logger.info(usage_message)

    # NOTE(review): spaCy's own examples use the annotation key 'cats' for
    # text categories - confirm 'categories' is what this pipeline expects.
    train_data = list(
        zip(train_texts, [{
            'categories': cats
        } for cats in train_cats]))

    # Initialize the TextCategorizer, and create an optimizer
    optimizer = nlp.resume_training()
    optimizer.alpha = self.alpha
    optimizer.trf_weight_decay = self.weight_decay
    optimizer.L2 = self.l2
    lrs = cyclic_triangular_rate(self.lr / 3, self.lr * 3,
                                 2 * len(train_data) // self.batch_size)

    print('Training the model...')
    logger.info('Training the model...')
    pbar = tqdm.tqdm(total=100, leave=False)  # 100 expected iterations
    results = []
    epoch = 0
    step = 0
    while True:
        # train and evaluate
        losses = Counter()
        random.shuffle(train_data)
        batches = minibatch(train_data, size=self.batch_size)
        for batch in batches:
            optimizer.trf_lr = next(lrs)
            texts, annotations = zip(*batch)
            # ``losses`` has to be handed to update(), otherwise the
            # counter stays empty and the printed loss is always zero.
            nlp.update(texts,
                       annotations,
                       sgd=optimizer,
                       drop=self.dropout_rate,
                       losses=losses)
            pbar.update(1)
            if step and (step % self.eval_every) == 0:
                pbar.close()
                # Evaluate with the optimizer's averaged parameters.
                with nlp.use_params(optimizer.averages):
                    scores = Evaluate.f1_evaluate(nlp, dev_texts, dev_cats,
                                                  pos_label)
                # Add score to results
                results.append((scores['textcat_f'], step, epoch))
                print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'.format(
                    losses['trf_textcat'], scores['textcat_p'],
                    scores['textcat_r'], scores['textcat_f']))
                pbar = tqdm.tqdm(total=self.eval_every, leave=False)
            step += 1
        epoch += 1
        if results:
            # Stop if no improvement within patience checkpoints
            _, best_step, _ = max(results)
            if (step - best_step) // self.eval_every >= self.patience:
                break

    # Print messages
    msg = Printer()
    msg.info('Best scoring checkpoints')
    table_widths = [2, 4, 6]
    msg.row(['Epoch', 'Step', 'Score'], widths=table_widths)
    msg.row(['-' * w for w in table_widths])
    for score, ckpt_step, ckpt_epoch in sorted(results, reverse=True)[:10]:
        score_text = '%.2f' % (score * 100)
        msg.row([ckpt_epoch, ckpt_step, score_text], widths=table_widths)
        logger.info('Epoch {}; Step {}; Score {}'.format(
            ckpt_epoch, ckpt_step, score_text))

    # Test the trained model
    test_text = dev_texts[0]
    doc = nlp(test_text)
    logger.info('The tested text is {}, the prediction is {}'.format(
        test_text, doc.cats))
    print(test_text, doc.cats)

    # Save the model
    if self.output_path is not None:
        nlp.to_disk(self.output_path)
        print('Save model to', self.output_path)
        print('Test the saved model')
        print('Loading from', self.output_path)
        nlp2 = spacy.load(self.output_path)
        doc2 = nlp2(test_text)
        logger.info('The tested text is {}, the prediction is {}'.format(
            test_text, doc2.cats))
        print(test_text, doc2.cats)
class Engine(ClassNursery):
    """Runs training, validation and testing of a model end to end."""

    def __init__(
        self,
        model: nn.Module,
        datasets_manager: DatasetsManager,
        optimizer: optim,
        batch_size: int,
        save_dir: str,
        num_epochs: int,
        save_every: int,
        log_train_metrics_every: int,
        train_metric: BaseMetric,
        validation_metric: BaseMetric,
        test_metric: BaseMetric,
        experiment_name: Optional[str] = None,
        experiment_hyperparams: Optional[Dict[str, Any]] = None,
        tensorboard_logdir: str = None,
        track_for_best: str = "loss",
        collate_fn=list,
        device: Union[torch.device, str] = torch.device("cpu"),
        gradient_norm_clip_value: Optional[float] = 5.0,
        lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
        use_wandb: bool = False,
        sample_proportion: float = 1.0,
        seeds: Dict[str, int] = None,
    ):
        """ Engine runs the models end to end.

        It iterates through the train dataset and passes it through the
        model. During training it helps in tracking a lot of parameters for
        the run and saving the parameters. It also reports validation and
        test parameters from time to time. Many utilities required for
        end-end running of the model is here.

        Parameters
        ----------
        model : nn.Module
            A pytorch module defining a model to be run
        datasets_manager : DatasetsManager
            A datasets manager that handles all the different datasets
        optimizer : torch.optim
            Any Optimizer object instantiated using ``torch.optim``
        batch_size : int
            Batch size for the dataset. The same batch size is used for
            ``train``, ``valid`` and ``test`` dataset
        save_dir : str
            The experiments are saved in ``save_dir``. We save checkpoints,
            the best model, logs and other information into the save dir
        num_epochs : int
            The number of epochs to run the training
        save_every : int
            The model will be checkpointed every ``save_every`` number of
            epochs
        log_train_metrics_every : int
            The train metrics will be reported every
            ``log_train_metrics_every`` iterations during training
        train_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating
            training metrics
        validation_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating
            validation metrics
        test_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating
            test metrics
        experiment_name : str
            The experiment should be given a name for ease of tracking.
            If the experiment name is not given, we generate a 10 character
            sha1 prefix (derived from the current time) for the experiment.
        experiment_hyperparams : Dict[str, Any]
            This is mostly used for tracking the different hyper-params of
            the experiment being run. This may be used by ``wandb`` to save
            the hyper-params. They are also written to
            ``hyperparams.json`` inside ``save_dir``.
        tensorboard_logdir : str
            The directory where all the tensorboard runs are stored. If
            ``None`` is passed then it defaults to the tensorboard default
            of storing the log in the current directory.
        track_for_best : str
            Which metric should be tracked for deciding the best model?.
            Anything that the metric emits and is a single value can be used
            for tracking. The default value is ``loss``. If its loss, then
            the best value will be the lowest one. For some other metrics
            like ``macro_fscore``, the best metric might be the one that has
            the highest value
        collate_fn : Callable[[List[Any]], List[Any]]
            Collates the different examples into a single batch of examples.
            This is the same terminology adopted from ``pytorch``.
        device : torch.device
            The device on which the model will be placed. If this is "cpu",
            then the model and the tensors will all be on cpu. If this is
            "cuda:0", then the model and the tensors will be placed on cuda
            device 0. You can mention any other cuda device that is suitable
            for your environment
        gradient_norm_clip_value : float
            To avoid gradient explosion, the gradients will be clipped if
            the gradient norm exceeds this value. ``None`` disables
            clipping.
        lr_scheduler : torch.optim.lr_scheduler
            Any pytorch ``lr_scheduler`` can be used for reducing the
            learning rate if the performance on the validation set reduces.
        use_wandb : bool
            wandb or weights and biases is a tool that is used to track
            experiments online. Sciwing comes with inbuilt functionality to
            track experiments on weights and biases
        sample_proportion : float
            Fraction of every dataset that is sampled (without replacement)
            when the data loaders are built.
        seeds: Dict[str, int]
            The dict of seeds to be set. Set the random_seed, pytorch_seed
            and numpy_seed. Found in
            https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py
        """
        if isinstance(device, str):
            device = torch.device(device)

        if seeds is None:
            seeds = {}
        self.seeds = seeds
        self._set_seeds()

        self.model = model
        self.datasets_manager = datasets_manager
        self.train_dataset = self.datasets_manager.train_dataset
        self.validation_dataset = self.datasets_manager.dev_dataset
        self.test_dataset = self.datasets_manager.test_dataset
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.save_dir = pathlib.Path(save_dir)
        self.num_epochs = num_epochs
        self.msg_printer = Printer()
        self.save_every = save_every
        self.log_train_metrics_every = log_train_metrics_every
        self.tensorboard_logdir = tensorboard_logdir
        self.train_metric_calc = train_metric
        self.validation_metric_calc = validation_metric
        self.test_metric_calc = test_metric
        self.summaryWriter = SummaryWriter(log_dir=tensorboard_logdir)
        self.track_for_best = track_for_best
        self.collate_fn = collate_fn
        self.device = device
        self.best_track_value = None
        self.set_best_track_value(self.best_track_value)
        self.gradient_norm_clip_value = gradient_norm_clip_value
        self.lr_scheduler = lr_scheduler
        self.lr_scheduler_is_plateau = isinstance(
            self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
        # Falsy when the module-level ``wandb`` name itself is falsy.
        self.use_wandb = wandb and use_wandb
        self.sample_proportion = sample_proportion
        self.label_namespaces = self.datasets_manager.label_namespaces
        self.datasets_manager.print_stats()

        if experiment_name is None:
            hash_ = hashlib.sha1()
            hash_.update(str(time.time()).encode("utf-8"))
            digest = hash_.hexdigest()
            experiment_name = digest[:10]

        self.experiment_name = experiment_name
        self.experiment_hyperparams = experiment_hyperparams or {}

        if self.use_wandb:
            wandb.init(
                project="project-scwing",
                name=self.experiment_name,
                config=self.experiment_hyperparams,
            )

        if not self.save_dir.is_dir():
            self.save_dir.mkdir(parents=True)

        with open(self.save_dir.joinpath("hyperparams.json"), "w") as fp:
            json.dump(self.experiment_hyperparams, fp)

        self.num_workers = 1

        self.model.to(self.device)

        self.train_loader = self.get_loader(self.train_dataset)
        self.validation_loader = self.get_loader(self.validation_dataset)
        self.test_loader = self.get_loader(self.test_dataset)

        # refresh the iters at the beginning of every epoch
        self.train_iter = None
        self.validation_iter = None
        self.test_iter = None

        # initializing loss meters
        self.train_loss_meter = LossMeter()
        self.validation_loss_meter = LossMeter()

        self.msg_printer.divider("ENGINE STARTING")
        time.sleep(3)

        # get the loggers ready
        self.train_log_filename = self.save_dir.joinpath("train.log")
        self.validation_log_filename = self.save_dir.joinpath("validation.log")
        self.test_log_filename = self.save_dir.joinpath("test.log")

        self.train_logger = logzero.setup_logger(
            name="train-logger",
            logfile=self.train_log_filename,
            level=logging.INFO)
        self.validation_logger = logzero.setup_logger(
            name="valid-logger",
            logfile=self.validation_log_filename,
            level=logging.INFO,
        )
        self.test_logger = logzero.setup_logger(
            name="test-logger",
            logfile=self.test_log_filename,
            level=logging.INFO)

        if self.lr_scheduler_is_plateau:
            # The scheduler mode has to agree with the *name* of the tracked
            # metric (``track_for_best``); ``best_track_value`` holds a
            # number and can never equal a metric name.
            scheduler_mode = self.lr_scheduler.mode
            if self.track_for_best == "loss" and scheduler_mode == "max":
                self.msg_printer.warn(
                    "You are optimizing loss and lr schedule mode is max instead of min"
                )
            if (self.track_for_best in ("macro_fscore", "fscore")
                    and scheduler_mode == "min"):
                self.msg_printer.warn(
                    "You are optimizing for macro_fscore and lr scheduler mode is min instead of max"
                )
            if self.track_for_best == "micro_fscore" and scheduler_mode == "min":
                self.msg_printer.warn(
                    "You are optimizing for micro_fscore and lr scheduler mode is min instead of max"
                )

    def get_loader(self, dataset: Dataset) -> DataLoader:
        """ Returns the DataLoader for the Dataset

        A random subset containing ``sample_proportion`` of the dataset is
        drawn without replacement and served through a
        ``SubsetRandomSampler``.

        Parameters
        ----------
        dataset : Dataset

        Returns
        -------
        DataLoader
            A pytorch DataLoader

        """
        dataset_size = len(dataset)
        sample_size = int(np.floor(dataset_size * self.sample_proportion))
        indices = np.random.choice(range(dataset_size),
                                   size=sample_size,
                                   replace=False)
        sampler = SubsetRandomSampler(indices=indices)
        loader = DataLoader(
            dataset=dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
            pin_memory=True,
            sampler=sampler,
        )
        return loader

    def is_best_lower(self, current_best=None):
        """ Returns True if the current value of the metric is lower than
        the best metric. This is useful for tracking metrics like loss
        where, lower the value, the better it is

        Parameters
        ----------
        current_best : float
            The current value for the metric that is being tracked

        Returns
        -------
        bool

        """
        return bool(current_best < self.best_track_value)

    def is_best_higher(self, current_best=None):
        """ Returns ``True`` if the current value of the metric is HIGHER
        than (or equal to) the best metric. This is useful for tracking
        metrics like FSCORE where, higher the value, the better it is

        Parameters
        ----------
        current_best : float
            The current value for the metric that is being tracked

        Returns
        -------
        bool

        """
        return bool(current_best >= self.best_track_value)

    def set_best_track_value(self, current_best=None):
        """ Set the best value of the value being tracked

        When ``current_best`` is ``None`` the value is initialised to the
        worst possible one for the tracked metric (``inf`` for ``loss``,
        ``0`` for the f-scores). Other metric names leave the stored value
        untouched.

        Parameters
        ----------
        current_best : float
            The current value that is best

        Returns
        -------

        """
        if self.track_for_best == "loss":
            self.best_track_value = np.inf if current_best is None else current_best
        elif self.track_for_best in ("macro_fscore", "fscore", "micro_fscore"):
            self.best_track_value = 0 if current_best is None else current_best

    def run(self):
        """
        Run the engine: train and validate for ``num_epochs`` epochs and
        then run the test set once.
        :return:
        """
        for epoch_num in range(self.num_epochs):
            self.train_epoch(epoch_num)
            self.validation_epoch(epoch_num)

        self.test_epoch(epoch_num)

    @staticmethod
    def _split_batch(batch):
        """ Transposes a batch of examples and returns ``(lines, labels)`` """
        columns = list(zip(*batch))
        return columns[0], columns[1]

    def train_epoch(self, epoch_num: int):
        """
        Run the training for one epoch
        :param epoch_num: type: int
        The current epoch number
        """

        # refresh everything necessary before training begins
        num_iterations = 0
        train_iter = self.get_iter(self.train_loader)
        self.train_loss_meter.reset()
        self.train_metric_calc.reset()
        self.model.train()

        self.msg_printer.info(
            f"Starting Training Epoch: {epoch_num+1}/{self.num_epochs}")

        for batch in train_iter:
            # N*T, N * 1, N * 1
            lines, labels = self._split_batch(batch)
            batch_size = len(lines)
            model_forward_out = self.model(
                lines=lines,
                labels=labels,
                is_training=True,
                is_validation=False,
                is_test=False,
            )
            self.train_metric_calc.calc_metric(
                lines=lines,
                labels=labels,
                model_forward_dict=model_forward_out)

            # Only the dictionary lookup is guarded, so a KeyError raised
            # inside backward()/step() is not mistaken for a missing loss.
            try:
                loss = model_forward_out["loss"]
            except KeyError:
                self.msg_printer.fail(
                    "The model output dictionary does not have "
                    "a key called loss. Please check to have "
                    "loss in the model output")
            else:
                self.optimizer.zero_grad()
                loss.backward()
                # ``None`` is an accepted value and means "do not clip".
                if self.gradient_norm_clip_value is not None:
                    torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(),
                        max_norm=self.gradient_norm_clip_value)
                self.optimizer.step()
                self.train_loss_meter.add_loss(loss.item(), batch_size)

            num_iterations += 1
            # ``num_iterations`` already counts the batch just processed.
            if num_iterations % self.log_train_metrics_every == 0:
                metrics = self.train_metric_calc.report_metrics()
                for label_namespace, table in metrics.items():
                    self.msg_printer.divider(
                        text=f"Train Metrics for {label_namespace.upper()}"
                    )
                    print(table)

        self.train_epoch_end(epoch_num)

    def train_epoch_end(self, epoch_num: int):
        """ Performs house-keeping at the end of a training epoch

        At the end of the training epoch, it does some house-keeping.
        It reports the average loss, the average metric and other
        information.

        Parameters
        ----------
        epoch_num : int
            The current epoch number (0 based)

        """
        self.msg_printer.divider(f"Training end @ Epoch {epoch_num + 1}")
        average_loss = self.train_loss_meter.get_average()
        self.msg_printer.text("Average Loss: {0}".format(average_loss))
        self.train_logger.info(
            f"Average loss @ Epoch {epoch_num+1} - {average_loss}")
        metric = self.train_metric_calc.get_metric()

        if self.use_wandb:
            wandb.log({"train_loss": average_loss}, step=epoch_num + 1)
            if self.track_for_best != "loss":
                for label_namespace in self.label_namespaces:
                    wandb.log(
                        {
                            f"train_{self.track_for_best}_{label_namespace}":
                            metric[label_namespace][self.track_for_best]
                        },
                        step=epoch_num + 1,
                    )

        # save the model after every `self.save_every` epochs
        if (epoch_num + 1) % self.save_every == 0:
            torch.save(
                {
                    "epoch_num": epoch_num,
                    "optimizer_state": self.optimizer.state_dict(),
                    "model_state": self.model.state_dict(),
                    "loss": average_loss,
                },
                self.save_dir.joinpath(f"model_epoch_{epoch_num+1}.pt"),
            )

        # log loss to tensor board
        self.summaryWriter.add_scalars(
            "train_validation_loss",
            {"train_loss": average_loss or np.inf},
            epoch_num + 1,
        )

    def validation_epoch(self, epoch_num: int):
        """ Runs one validation epoch on the validation dataset

        Parameters
        ----------
        epoch_num : int
            0-based epoch number

        """
        self.model.eval()
        valid_iter = self.get_iter(self.validation_loader)
        self.validation_loss_meter.reset()
        self.validation_metric_calc.reset()

        self.msg_printer.info(
            f"Starting Validation Epoch: {epoch_num + 1}/{self.num_epochs}")

        for batch in valid_iter:
            lines, labels = self._split_batch(batch)
            batch_size = len(lines)

            with torch.no_grad():
                model_forward_out = self.model(
                    lines=lines,
                    labels=labels,
                    is_training=False,
                    is_validation=True,
                    is_test=False,
                )
            loss = model_forward_out["loss"]
            # Store a plain number, exactly like the training loop does,
            # instead of keeping the loss tensor alive in the meter.
            self.validation_loss_meter.add_loss(loss.item(), batch_size)
            self.validation_metric_calc.calc_metric(
                lines=lines,
                labels=labels,
                model_forward_dict=model_forward_out)

        self.validation_epoch_end(epoch_num)

    def validation_epoch_end(self, epoch_num: int):
        """Performs house-keeping at the end of validation epoch

        Reports the metrics and the average loss, steps the learning rate
        scheduler and saves ``best_model.pt`` when the tracked value
        improved.

        Parameters
        ----------
        epoch_num : int
            The current epoch number

        """
        self.msg_printer.divider(f"Validation @ Epoch {epoch_num+1}")

        metric_report = self.validation_metric_calc.report_metrics()
        average_loss = self.validation_loss_meter.get_average()

        for label_namespace, table in metric_report.items():
            self.msg_printer.divider(
                text=f"Validation Metrics for {label_namespace.upper()}")
            print(table)

        self.msg_printer.text(f"Average Loss: {average_loss}")
        self.validation_logger.info(
            f"Validation Loss @ Epoch {epoch_num+1} - {average_loss}")

        if self.use_wandb:
            wandb.log({"validation_loss": average_loss}, step=epoch_num + 1)
            metric = self.validation_metric_calc.get_metric()
            if self.track_for_best != "loss":
                for label_namespace in self.label_namespaces:
                    wandb.log(
                        {
                            f"validation_{self.track_for_best}_{label_namespace}":
                            metric[label_namespace][self.track_for_best]
                        },
                        step=epoch_num + 1,
                    )

        self.summaryWriter.add_scalars(
            "train_validation_loss",
            {"validation_loss": average_loss or np.inf},
            epoch_num + 1,
        )

        is_best: Optional[bool] = None
        value_tracked: Optional[float] = None

        if self.track_for_best == "loss":
            value_tracked = average_loss
            is_best = self.is_best_lower(average_loss)
        elif self.track_for_best in ("micro_fscore", "macro_fscore", "fscore"):
            # If there are multiple namespaces for the metric
            # we decide the best model based on the average score
            metrics = self.validation_metric_calc.get_metric()
            values_tracked = [
                metrics[label_namespace][self.track_for_best]
                for label_namespace in self.label_namespaces
            ]
            value_tracked = sum(values_tracked) / len(values_tracked)
            is_best = self.is_best_higher(current_best=value_tracked)

        if self.lr_scheduler is not None:
            # Only ReduceLROnPlateau consumes a metric; for every other
            # scheduler a positional argument to step() is taken as the
            # epoch number and would corrupt the schedule.
            if self.lr_scheduler_is_plateau:
                self.lr_scheduler.step(value_tracked)
            else:
                self.lr_scheduler.step()

        if is_best:
            self.set_best_track_value(current_best=value_tracked)
            self.msg_printer.good(f"Found Best Model @ epoch {epoch_num + 1}")
            torch.save(
                {
                    "epoch_num": epoch_num,
                    "optimizer_state": self.optimizer.state_dict(),
                    "model_state": self.model.state_dict(),
                    "loss": average_loss,
                },
                self.save_dir.joinpath("best_model.pt"),
            )

    def test_epoch(self, epoch_num: int):
        """Runs the test epoch for ``epoch_num``

        Loads the best model that is saved during the training
        and runs the test dataset.

        Parameters
        ----------
        epoch_num : int
            zero based epoch number for which the test dataset is run
            This is after the last training epoch.

        """
        self.msg_printer.divider("Running on Test Batch")
        self.load_model_from_file(self.save_dir.joinpath("best_model.pt"))
        self.model.eval()
        test_iter = self.get_iter(self.test_loader)

        for batch in test_iter:
            lines, labels = self._split_batch(batch)

            with torch.no_grad():
                model_forward_out = self.model(
                    lines=lines,
                    labels=labels,
                    is_training=False,
                    is_validation=False,
                    is_test=True,
                )
            self.test_metric_calc.calc_metric(
                lines=lines,
                labels=labels,
                model_forward_dict=model_forward_out)

        self.test_epoch_end(epoch_num)

    def test_epoch_end(self, epoch_num: int):
        """ Performs house-keeping at the end of the test epoch

        It reports the metric that is being traced at the end of the test
        epoch and closes the tensorboard summary writer.

        Parameters
        ----------
        epoch_num : int
            Epoch num after which the test dataset is run

        """
        metric_report = self.test_metric_calc.report_metrics()
        for label_namespace, table in metric_report.items():
            self.msg_printer.divider(
                text=f"Test Metrics for {label_namespace.upper()}")
            print(table)

        precision_recall_fmeasure = self.test_metric_calc.get_metric()
        self.msg_printer.divider(f"Test @ Epoch {epoch_num+1}")
        self.test_logger.info(
            f"Test Metrics @ Epoch {epoch_num+1} - {precision_recall_fmeasure}"
        )
        if self.use_wandb:
            wandb.log({"test_metrics": str(precision_recall_fmeasure)})
        self.summaryWriter.close()

    def get_train_dataset(self):
        """ Returns the train dataset of the experiment

        Returns
        -------
        Dataset
            Anything that conforms to the pytorch style dataset.

        """
        return self.train_dataset

    def get_validation_dataset(self):
        """ Returns the validation dataset of the experiment

        Returns
        -------
        Dataset
            Anything that conforms to the pytorch style dataset.

        """
        return self.validation_dataset

    def get_test_dataset(self):
        """ Returns the test dataset of the experiment

        Returns
        -------
        Dataset
            Anything that conforms to the pytorch style dataset.

        """
        return self.test_dataset

    @staticmethod
    def get_iter(loader: DataLoader) -> Iterator:
        """ Returns the iterator for a pytorch data loader.

        The ``loader`` is a pytorch DataLoader that iterates over the
        dataset in batches and employs many strategies to do so. We want an
        iterator that returns the dataset in batches. The end of the
        iterator would signify the end of an epoch and then we can use that
        information to perform house-keeping.

        Parameters
        ----------
        loader : DataLoader
            a pytorch data loader

        Returns
        -------
        Iterator
            An iterator over the data loader

        """
        iterator = iter(loader)
        return iterator

    def load_model_from_file(self, filename: str):
        """ Loads a checkpoint and restores its ``model_state`` into the model

        Parameters
        ----------
        filename : str
            Path of a checkpoint written by ``torch.save`` that contains a
            ``model_state`` entry

        """
        self.msg_printer.divider("LOADING MODEL FROM FILE")
        with self.msg_printer.loading(
                f"Loading Pytorch Model from file {filename}"):
            model_chkpoint = torch.load(filename)

        self.msg_printer.good("Finished Loading the Model")

        model_state = model_chkpoint["model_state"]
        self.model.load_state_dict(model_state)

    def _set_seeds(self):
        """ Seeds ``random``, ``numpy`` and ``torch`` from ``self.seeds``

        Missing keys fall back to fixed defaults; a seed explicitly set to
        ``None`` leaves the corresponding generator unseeded.
        """
        seed = self.seeds.get("random_seed", 17290)
        numpy_seed = self.seeds.get("numpy_seed", 1729)
        torch_seed = self.seeds.get("pytorch_seed", 172)

        if seed is not None:
            random.seed(seed)
        if numpy_seed is not None:
            np.random.seed(numpy_seed)
        if torch_seed is not None:
            torch.manual_seed(torch_seed)
            # Seed all GPUs with the same seed if available.
            if torch.cuda.is_available():
                torch.cuda.manual_seed_all(torch_seed)
def convert(
    input_file,
    output_dir="-",
    file_type="json",
    n_sents=1,
    seg_sents=False,
    model=None,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe them forward to a JSON file:
    $ spacy convert some_file.conllu > some_file.json

    input_file: path of the file to convert; it is read as UTF-8 text.
    output_dir: existing directory to write to, or "-" for stdout.
    file_type: output format; must be listed in FILE_TYPES (and in
        FILE_TYPES_STDOUT when writing to stdout).
    converter: key into CONVERTERS; "auto" derives it from the input file
        suffix. For "ner" / "iob" the concrete flavour is auto-detected
        from the file content.
    n_sents, seg_sents, model, morphology, lang: forwarded unchanged to the
        selected converter function (``morphology`` as ``use_morphology``).

    Every validation failure is reported through ``msg.fail(..., exits=1)``.
    """
    no_print = output_dir == "-"
    msg = Printer(no_print=no_print)
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)

    # Read inside a context manager so the handle is closed right away
    # instead of being left to the garbage collector.
    with input_path.open("r", encoding="utf-8") as input_handle:
        input_data = input_handle.read()

    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter in ("ner", "iob"):
        converter_autodetect = autodetect_ner_format(input_data)
        if converter_autodetect == "ner":
            msg.info("Auto-detected token-per-line NER format")
            converter = converter_autodetect
        elif converter_autodetect == "iob":
            msg.info("Auto-detected sentence-per-line NER format")
            converter = converter_autodetect
        else:
            msg.warn(
                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
            )
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)

    # Use converter function to convert data
    func = CONVERTERS[converter]
    data = func(
        input_data,
        n_sents=n_sents,
        seg_sents=seg_sents,
        use_morphology=morphology,
        lang=lang,
        model=model,
        no_print=no_print,
    )

    if output_dir != "-":
        # Export data to a file named after the input, with the new suffix
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(
            input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents): {}".format(
            len(data), output_file))
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)