def create_dirs(package_path, force):
    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            prints(package_path, Messages.M045, title=Messages.M044, exits=1)
    Path.mkdir(package_path, parents=True)
def generate_meta(model_path, existing_meta):
    meta = existing_meta or {}
    settings = [('lang', 'Model language', meta.get('lang', 'en')),
                ('name', 'Model name', meta.get('name', 'model')),
                ('version', 'Model version', meta.get('version', '0.0.0')),
                ('spacy_version', 'Required spaCy version',
                 '>=%s,<3.0.0' % about.__version__),
                ('description', 'Model description', meta.get('description', False)),
                ('author', 'Author', meta.get('author', False)),
                ('email', 'Author email', meta.get('email', False)),
                ('url', 'Author website', meta.get('url', False)),
                ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
    nlp = util.load_model_from_path(Path(model_path))
    meta['pipeline'] = nlp.pipe_names
    meta['vectors'] = {'width': nlp.vocab.vectors_length,
                       'vectors': len(nlp.vocab.vectors),
                       'keys': nlp.vocab.vectors.n_keys}
    prints(Messages.M047, title=Messages.M046)
    for setting, desc, default in settings:
        response = util.get_raw_input(desc, default)
        meta[setting] = default if response == '' and default else response
    if about.__title__ != 'spacy':
        meta['parent_package'] = about.__title__
    return meta
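# For reference, the dict returned by generate_meta() has roughly the shape
# below. This is a sketch with placeholder values only; the interactive
# prompts and the loaded model fill in the real ones.
example_meta = {
    'lang': 'en',
    'name': 'model',
    'version': '0.0.0',
    'spacy_version': '>=2.0.0,<3.0.0',   # placeholder version range
    'description': 'Example description',
    'author': 'Example Author',
    'email': 'author@example.com',
    'url': 'https://example.com',
    'license': 'CC BY-SA 3.0',
    'pipeline': ['tagger', 'parser', 'ner'],          # taken from nlp.pipe_names
    'vectors': {'width': 300, 'vectors': 20000, 'keys': 20000},
}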
def create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                 expand_vectors, prune_vectors):
    print("Creating model...")
    lang_class = get_lang_class(lang)
    nlp = lang_class()
    for lexeme in nlp.vocab:
        lexeme.rank = 0
    lex_added = 0
    for i, (word, prob) in enumerate(
            tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))):
        lexeme = nlp.vocab[word]
        lexeme.rank = i
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = 0
        lex_added += 1
    nlp.vocab.cfg.update({'oov_prob': oov_prob})

    if vector_keys is not None:
        new_keys = []
        new_indices = []
        for i, word in enumerate(vector_keys):
            if word not in nlp.vocab and expand_vectors:
                lexeme = nlp.vocab[word]
                lexeme.is_oov = False
                lex_added += 1
            elif word in nlp.vocab and not expand_vectors:
                new_keys.append(word)
                new_indices.append(i)

        if len(vectors_data):
            if expand_vectors:
                nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
            else:
                nlp.vocab.vectors = Vectors(data=vectors_data[new_indices],
                                            keys=new_keys)
        if prune_vectors >= 1:
            nlp.vocab.prune_vectors(prune_vectors)

    vec_added = len(nlp.vocab.vectors)
    prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
           title=Messages.M038)
    return nlp
def init_model(lang, output_dir, freqs_loc=None, vectors_loc=None,
               no_expand_vectors=False, meta_overrides=None,
               prune_vectors=-1, min_word_frequency=50):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)
    if freqs_loc is not None and not freqs_loc.exists():
        prints(freqs_loc, title=Messages.M037, exits=1)

    probs, oov_prob = (read_freqs(freqs_loc, min_freq=min_word_frequency)
                       if freqs_loc is not None else ({}, -20))
    vectors_data, vector_keys = (read_vectors(vectors_loc)
                                 if vectors_loc else (None, None))
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                       not no_expand_vectors, prune_vectors)

    # Insert our custom tokenizer into the base model.
    nlp.tokenizer = combined_rule_tokenizer(nlp)

    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
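# A minimal usage sketch for init_model(). All paths below are hypothetical
# placeholders; substitute your own frequency counts, vector file, and
# output directory.
nlp = init_model(
    lang='en',
    output_dir='models/base_model',       # hypothetical output directory
    freqs_loc='data/word_freqs.txt',      # hypothetical word-frequency file
    vectors_loc='data/vectors.txt.gz',    # hypothetical word-vector file
    min_word_frequency=50,
)
# The initialized model is also serialized to output_dir via nlp.to_disk().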
if __name__ == '__main__':
    import plac
    import sys
    from spacy.cli import download, link, info, nlp, package, train, convert
    from spacy.cli import vocab, init_model, profile, evaluate, validate
    from spacy.util import prints

    commands = {
        'download': download,
        'link': link,
        'info': info,
        'nlp': nlp,
        'train': train,
        'evaluate': evaluate,
        'convert': convert,
        'package': package,
        'vocab': vocab,
        'init-model': init_model,
        'profile': profile,
        'validate': validate,
    }
    if len(sys.argv) == 1:
        prints(', '.join(commands), title="Available commands", exits=1)
    command = sys.argv.pop(1)
    sys.argv[0] = 'spacy %s' % command
    if command in commands:
        plac.call(commands[command], sys.argv[1:])
    else:
        prints("Available: %s" % ', '.join(commands),
               title="Unknown command: %s" % command, exits=1)
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over.
    """
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        prints(input_path, title=Messages.M008, exits=1)
    if not output_path or not output_path.exists():
        prints(output_path, title=Messages.M040, exits=1)
    if meta_path and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)

    meta_path = meta_path or input_path / 'meta.json'
    if meta_path.is_file():
        meta = util.read_json(meta_path)
        if not create_meta:  # only print this if user doesn't want to overwrite
            prints(meta_path, title=Messages.M041)
        else:
            meta = generate_meta(input_dir, meta)
    meta = validate_meta(meta, ['lang', 'name', 'version'])

    model_name = meta['lang'] + '_' + meta['name']
    model_name_v = model_name + '-' + meta['version']
    main_path = output_path / model_name_v
    package_path = main_path / model_name
    bin_path = main_path / 'bin'
    include_path = main_path / 'include'
    orig_nc_path = Path(__file__).parent.parent
    nc_path = package_path / 'neuralcoref'

    create_dirs(package_path, force)
    create_dirs(bin_path, force)
    create_dirs(nc_path, force)

    shutil.copytree(path2str(input_path),
                    path2str(package_path / model_name_v))

    orig_include_path = path2str(Path(__file__).parent / 'include')
    shutil.copytree(path2str(orig_include_path), path2str(include_path))

    nc1_path = path2str(orig_nc_path / 'neuralcoref.pyx')
    nc2_path = path2str(orig_nc_path / 'neuralcoref.pxd')
    shutil.copyfile(path2str(nc1_path), path2str(nc_path / 'neuralcoref.pyx'))
    shutil.copyfile(path2str(nc2_path), path2str(nc_path / 'neuralcoref.pxd'))
    create_file(nc_path / '__init__.py', TEMPLATE_INIT_NC)
    create_file(nc_path / '__init__.pxd', TEMPLATE_INIT_PXD)

    orig_bin_path = path2str(
        Path(__file__).parent.parent.parent / 'bin' / 'cythonize.py')
    shutil.copyfile(path2str(orig_bin_path), path2str(bin_path / 'cythonize.py'))

    create_file(main_path / 'meta.json', json_dumps(meta))
    create_file(main_path / 'setup.py', TEMPLATE_SETUP)
    create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
    create_file(package_path / '__init__.py', TEMPLATE_INIT.format(model_name))
    create_file(package_path / '__init__.pxd', TEMPLATE_INIT_PXD)
    prints(main_path, Messages.M043,
           title=Messages.M042.format(name=model_name_v))
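# A minimal usage sketch for package(), with hypothetical directories.
# `input_dir` should contain a serialized model (and optionally meta.json),
# and `output_dir` must already exist.
package(
    input_dir='models/base_model',   # hypothetical serialized-model directory
    output_dir='packages',           # hypothetical, must already exist
    create_meta=False,               # reuse the existing meta.json as-is
    force=True,                      # overwrite a previous package directory
)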
def validate_meta(meta, keys):
    for key in keys:
        if key not in meta or meta[key] == '':
            prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
    return meta
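# Usage sketch: validate_meta() exits via prints(..., exits=1) if a required
# key is missing or empty; otherwise it returns the meta dict unchanged.
meta = validate_meta({'lang': 'en', 'name': 'model', 'version': '0.0.0'},
                     ['lang', 'name', 'version'])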
import plac
import sys
from spacy.cli import download, link, info, package, train, convert
from spacy.cli import vocab, init_model, profile, evaluate, validate
from spacy.util import prints

commands = {
    'download': download,
    'link': link,
    'info': info,
    'train': train,
    'evaluate': evaluate,
    'convert': convert,
    'package': package,
    'vocab': vocab,
    'init-model': init_model,
    'profile': profile,
    'validate': validate,
}
if len(sys.argv) == 1:
    prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1)
sys.argv[0] = 'spacy %s' % command
if command in commands:
    plac.call(commands[command], sys.argv[1:])
else:
    prints("Available: %s" % ', '.join(commands),
           title="Unknown command: %s" % command, exits=1)
def train(pretrained, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
          parser_multitasks='', entity_multitasks='', use_gpu=-1,
          no_tagger=False, no_parser=False, no_entities=False,
          gold_preproc=False, version="0.0.0", meta_path=None, verbose=False):
    """
    Re-train a pre-trained model. Expects data in spaCy's JSON format.
    This code is based on
    https://github.com/explosion/spaCy/blob/master/spacy/cli/train.py.
    """
    # There is a bug that prevents me from using the GPU when resuming
    # training from a saved model. See
    # https://github.com/explosion/spaCy/issues/1806.
    if use_gpu >= 0:
        msg = "\nWARNING: using GPU may require re-installing thinc. "
        msg += "See https://github.com/explosion/spaCy/issues/1806.\n"
        print(msg)

    util.fix_random_seed()
    util.set_env_log(True)
    n_sents = n_sents or None
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title=Messages.M050, exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title=Messages.M051, exits=1)
    if meta_path is not None and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        prints(Messages.M053.format(meta_type=type(meta)),
               title=Messages.M052, exits=1)

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    max_doc_len = util.env_opt('max_doc_len', 5000)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()

    # Load pre-trained model. Remove components that we are not
    # re-training.
    nlp = load(pretrained)
    if no_tagger and 'tagger' in nlp.pipe_names:
        nlp.remove_pipe('tagger')
    if no_parser and 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    if no_entities and 'ner' in nlp.pipe_names:
        nlp.remove_pipe('ner')

    meta.setdefault('name', 'unnamed')
    meta['pipeline'] = nlp.pipe_names
    meta.setdefault('lang', nlp.lang)
    nlp.meta.update(meta)

    # Add multi-task objectives
    if parser_multitasks:
        for objective in parser_multitasks.split(','):
            nlp.parser.add_multitask_objective(objective)
    if entity_multitasks:
        for objective in entity_multitasks.split(','):
            nlp.entity.add_multitask_objective(objective)

    # Get optimizer
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None

    print(nlp.pipe_names)
    print(nlp.pipeline)
    print("Itn.  Dep Loss  NER Loss  UAS     NER P.  NER R.  NER F.  "
          "Tag %   Token %  CPU WPS  GPU WPS")
    try:
        train_docs = corpus.train_docs(nlp, projectivize=True,
                                       noise_level=0.0,
                                       gold_preproc=gold_preproc,
                                       max_length=0)
        train_docs = list(train_docs)
        for i in range(n_iter):
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(
                    corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs, verbose)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords / (end_time - start_time)
                else:
                    gpu_wps = nwords / (end_time - start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(epoch_model_path)
                        dev_docs = list(
                            corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords / (end_time - start_time)
                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
                                 'gpu': gpu_wps}
                meta['vectors'] = {'width': nlp.vocab.vectors_length,
                                   'vectors': len(nlp.vocab.vectors),
                                   'keys': nlp.vocab.vectors.n_keys}
                meta['lang'] = nlp.lang
                meta['pipeline'] = nlp.pipe_names
                meta['spacy_version'] = '>=%s' % about.__version__
                meta.setdefault('name', 'model%d' % i)
                meta.setdefault('version', version)
                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
                           gpu_wps=gpu_wps)
    finally:
        print("Saving model...")
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / 'model-final'
            nlp.to_disk(final_model_path)
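# A minimal usage sketch for train(), with hypothetical model name and paths.
# Training and dev data must be in spaCy's JSON format; `pretrained` is the
# model to resume training from.
train(
    pretrained='en_core_web_sm',      # hypothetical base model to re-train
    output_dir='models/retrained',    # hypothetical output directory
    train_data='data/train.json',     # hypothetical spaCy-format training data
    dev_data='data/dev.json',         # hypothetical spaCy-format dev data
    n_iter=10,
)
# Each epoch writes model%d/, accuracy.json, and meta.json under output_dir;
# the final weights land in output_dir/model-final.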