Ejemplo n.º 1
0
def create_dirs(package_path, force):
    """
    Make sure `package_path` ends up as a newly created directory.

    When the path already exists it is removed first if `force` is true;
    otherwise the conflict is reported through `prints` with `exits=1`.
    Missing parent directories are created along the way.
    """
    already_there = package_path.exists()
    if already_there and force:
        shutil.rmtree(path2str(package_path))
    elif already_there:
        prints(package_path, Messages.M045, title=Messages.M044, exits=1)
    package_path.mkdir(parents=True)
Ejemplo n.º 2
0
def generate_meta(model_path, existing_meta):
    """
    Interactively build the meta dict for the model stored at `model_path`.

    `existing_meta` (if truthy) is updated in place and supplies the
    defaults offered at each prompt. The pipeline names and vector
    statistics are read from the loaded model. The resulting dict is
    returned.
    """
    meta = existing_meta or {}
    spacy_requirement = '>=%s,<3.0.0' % about.__version__
    # Each entry is (meta key, prompt text, default value).
    prompts = [
        ('lang', 'Model language', meta.get('lang', 'en')),
        ('name', 'Model name', meta.get('name', 'model')),
        ('version', 'Model version', meta.get('version', '0.0.0')),
        ('spacy_version', 'Required spaCy version', spacy_requirement),
        ('description', 'Model description', meta.get('description', False)),
        ('author', 'Author', meta.get('author', False)),
        ('email', 'Author email', meta.get('email', False)),
        ('url', 'Author website', meta.get('url', False)),
        ('license', 'License', meta.get('license', 'CC BY-SA 3.0')),
    ]

    model = util.load_model_from_path(Path(model_path))
    vocab = model.vocab
    meta['pipeline'] = model.pipe_names
    meta['vectors'] = {
        'width': vocab.vectors_length,
        'vectors': len(vocab.vectors),
        'keys': vocab.vectors.n_keys,
    }

    prints(Messages.M047, title=Messages.M046)
    for key, prompt, fallback in prompts:
        answer = util.get_raw_input(prompt, fallback)
        # An empty answer keeps the default, but only when the default is
        # truthy; otherwise the (empty) answer itself is stored.
        if answer == '' and fallback:
            meta[key] = fallback
        else:
            meta[key] = answer

    if about.__title__ != 'spacy':
        meta['parent_package'] = about.__title__
    return meta
Ejemplo n.º 3
0
def create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                 expand_vectors, prune_vectors):
    """
    Instantiate the language class for `lang` and fill in its vocabulary.

    `probs` maps words to probabilities; words are ranked by descending
    probability. `oov_prob` is stored in the vocab config. When
    `vector_keys` is given, vectors are attached: with `expand_vectors`
    every key is kept (and unknown words are added to the vocab),
    otherwise only the rows whose key is already in the vocab are kept.
    Vectors are pruned when `prune_vectors >= 1`. Returns the `nlp` object.
    """
    print("Creating model...")
    nlp = get_lang_class(lang)()
    vocab = nlp.vocab

    # Reset the rank of every lexeme already present in the fresh vocab.
    for existing in vocab:
        existing.rank = 0

    by_prob = sorted(probs.items(), key=lambda pair: pair[1], reverse=True)
    lex_added = 0
    for rank, (word, prob) in enumerate(tqdm(by_prob)):
        entry = vocab[word]
        entry.rank = rank
        entry.prob = prob
        entry.is_oov = False
        # No cluster information is read here; every lexeme gets cluster 0.
        entry.cluster = 0
        lex_added += 1
    vocab.cfg.update({'oov_prob': oov_prob})

    if vector_keys is not None:
        kept_keys = []
        kept_rows = []
        for row, key in enumerate(vector_keys):
            known = key in vocab
            if expand_vectors:
                if not known:
                    # Looking the word up creates its lexeme.
                    entry = vocab[key]
                    entry.is_oov = False
                    lex_added += 1
            elif known:
                kept_keys.append(key)
                kept_rows.append(row)

        if len(vectors_data):
            if expand_vectors:
                vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
            else:
                vocab.vectors = Vectors(data=vectors_data[kept_rows],
                                        keys=kept_keys)

        if prune_vectors >= 1:
            vocab.prune_vectors(prune_vectors)

    vec_added = len(vocab.vectors)
    prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
           title=Messages.M038)
    return nlp
Ejemplo n.º 4
0
def init_model(lang,
               output_dir,
               freqs_loc=None,
               vectors_loc=None,
               no_expand_vectors=False,
               meta_overrides=None,
               prune_vectors=-1,
               min_word_frequency=50):
    """
    Create a new model from raw data such as word frequencies and word
    vectors, save it to `output_dir` and return it.

    lang: language identifier forwarded to `create_model`.
    output_dir: directory the model is written to; created if missing.
    freqs_loc: optional location of the word frequencies. It is resolved
        through `cached_path`; when omitted, no probabilities are used and
        the OOV probability is -20.
    vectors_loc: optional location of the word vectors, resolved through
        `cached_path`.
    no_expand_vectors: negated and passed to `create_model` as its
        `expand_vectors` argument.
    meta_overrides: optional path to a JSON file whose content is merged
        into `nlp.meta`; the meta version is then set to `VERSION`.
    prune_vectors: forwarded to `create_model`.
    min_word_frequency: passed to `read_freqs` as `min_freq`.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)

    if freqs_loc is not None and not freqs_loc.exists():
        prints(freqs_loc, title=Messages.M037, exits=1)
    probs, oov_prob = read_freqs(
        freqs_loc,
        min_freq=min_word_frequency) if freqs_loc is not None else ({}, -20)
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (
        None, None)
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                       not no_expand_vectors, prune_vectors)

    # Insert our custom tokenizer into the base model.
    nlp.tokenizer = combined_rule_tokenizer(nlp)

    if meta_overrides is not None:
        # Use a context manager so the file handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
Ejemplo n.º 5
0
if __name__ == '__main__':
    # Script entry point: dispatch the first command-line argument to the
    # matching CLI function.
    import plac
    import sys
    from spacy.cli import download, link, info, nlp, package, train, convert
    from spacy.cli import vocab, init_model, profile, evaluate, validate
    from spacy.util import prints

    # Sub-command name -> function implementing it.
    commands = {
        'download': download,
        'link': link,
        'info': info,
        'nlp': nlp,
        'train': train,
        'evaluate': evaluate,
        'convert': convert,
        'package': package,
        'vocab': vocab,
        'init-model': init_model,
        'profile': profile,
        'validate': validate
    }
    available = ', '.join(commands)

    # Called without a sub-command: list what is available.
    if len(sys.argv) == 1:
        prints(available, title="Available commands", exits=1)

    # Take the sub-command out of argv so only its own arguments remain.
    command = sys.argv.pop(1)
    sys.argv[0] = 'spacy %s' % command

    if command not in commands:
        prints("Available: %s" % available,
               title="Unknown command: %s" % command,
               exits=1)
    else:
        plac.call(commands[command], sys.argv[1:])
Ejemplo n.º 6
0
def package(input_dir,
            output_dir,
            meta_path=None,
            create_meta=False,
            force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over.

    input_dir: directory containing the model data; must exist.
    output_dir: existing directory in which the package directory
        `<lang>_<name>-<version>` is created.
    meta_path: optional path to a meta JSON file; defaults to `meta.json`
        inside `input_dir`.
    create_meta: when true and a meta file was loaded, its values are
        passed through `generate_meta` to be confirmed or overwritten.
    force: when true, package directories left over from a previous run
        are removed and recreated instead of causing an error.
    """
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        prints(input_path, title=Messages.M008, exits=1)
    if not output_path or not output_path.exists():
        prints(output_path, title=Messages.M040, exits=1)
    if meta_path and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)

    meta_path = meta_path or input_path / 'meta.json'
    # Start from an empty dict so that a missing meta.json is reported by
    # validate_meta below rather than surfacing as a NameError.
    meta = {}
    if meta_path.is_file():
        meta = util.read_json(meta_path)
        if not create_meta:  # only print this if user doesn't want to overwrite
            prints(meta_path, title=Messages.M041)
        else:
            meta = generate_meta(input_dir, meta)
    meta = validate_meta(meta, ['lang', 'name', 'version'])

    model_name = meta['lang'] + '_' + meta['name']
    model_name_v = model_name + '-' + meta['version']
    main_path = output_path / model_name_v
    package_path = main_path / model_name
    bin_path = main_path / 'bin'
    include_path = main_path / 'include'
    orig_nc_path = Path(__file__).parent.parent
    nc_path = package_path / 'neuralcoref'

    create_dirs(package_path, force)
    create_dirs(bin_path, force)
    create_dirs(nc_path, force)

    # Model data goes into <package>/<lang>_<name>-<version>.
    shutil.copytree(path2str(input_path),
                    path2str(package_path / model_name_v))

    # copytree refuses to write into an existing directory, so a forced
    # re-run has to clear the previous copy of the include files first.
    if force and include_path.exists():
        shutil.rmtree(path2str(include_path))
    orig_include_path = Path(__file__).parent / 'include'
    shutil.copytree(path2str(orig_include_path), path2str(include_path))

    # Ship the neuralcoref Cython sources inside the package.
    for source_name in ('neuralcoref.pyx', 'neuralcoref.pxd'):
        shutil.copyfile(path2str(orig_nc_path / source_name),
                        path2str(nc_path / source_name))
    create_file(nc_path / '__init__.py', TEMPLATE_INIT_NC)
    create_file(nc_path / '__init__.pxd', TEMPLATE_INIT_PXD)

    orig_bin_path = (
        Path(__file__).parent.parent.parent / 'bin' / 'cythonize.py')
    shutil.copyfile(path2str(orig_bin_path),
                    path2str(bin_path / 'cythonize.py'))

    create_file(main_path / 'meta.json', json_dumps(meta))
    create_file(main_path / 'setup.py', TEMPLATE_SETUP)
    create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
    create_file(package_path / '__init__.py', TEMPLATE_INIT.format(model_name))
    create_file(package_path / '__init__.pxd', TEMPLATE_INIT_PXD)
    prints(main_path,
           Messages.M043,
           title=Messages.M042.format(name=model_name_v))
Ejemplo n.º 7
0
def validate_meta(meta, keys):
    """
    Check that every entry of `keys` is present in `meta` with a value
    other than the empty string.

    A missing or empty entry is reported through `prints` with `exits=1`.
    The `meta` object itself is returned.
    """
    for required in keys:
        if required in meta and meta[required] != '':
            continue
        prints(Messages.M049, title=Messages.M048.format(key=required), exits=1)
    return meta
Ejemplo n.º 8
0
    # Imports are local to this block.
    import plac
    import sys
    from spacy.cli import download, link, info, package, train, convert
    from spacy.cli import vocab, init_model, profile, evaluate, validate
    from spacy.util import prints

    # Maps each command-line sub-command name to the function implementing it.
    commands = {
        'download': download,
        'link': link,
        'info': info,
        'train': train,
        'evaluate': evaluate,
        'convert': convert,
        'package': package,
        'vocab': vocab,
        'init-model': init_model,
        'profile': profile,
        'validate': validate
    }
    # No sub-command given: list the known ones (prints is called with
    # exits=1).
    if len(sys.argv) == 1:
        prints(', '.join(commands), title="Available commands", exits=1)
    # Remove the sub-command from argv so only its own arguments remain.
    command = sys.argv.pop(1)
    sys.argv[0] = 'spacy %s' % command
    if command in commands:
        # Dispatch to the selected function with the remaining arguments.
        plac.call(commands[command], sys.argv[1:])
    else:
        prints(
            "Available: %s" % ', '.join(commands),
            title="Unknown command: %s" % command,
            exits=1)
Ejemplo n.º 9
0
def train(pretrained,
          output_dir,
          train_data,
          dev_data,
          n_iter=30,
          n_sents=0,
          parser_multitasks='',
          entity_multitasks='',
          use_gpu=-1,
          no_tagger=False,
          no_parser=False,
          no_entities=False,
          gold_preproc=False,
          version="0.0.0",
          meta_path=None,
          verbose=False):
    """
    Re-train a pre-trained model. Expects data in spaCy's JSON
    format. This code is based on
    https://github.com/explosion/spaCy/blob/master/spacy/cli/train.py.

    pretrained: model identifier passed to `load`.
    output_dir: directory that receives one `model<i>` sub-directory per
        iteration plus `model-final`. It is created if missing (parent
        directories are not created).
    train_data, dev_data: paths handed to `GoldCorpus`.
    n_iter: number of passes over the training documents.
    n_sents: limit passed to `GoldCorpus`; 0 is converted to None first.
    parser_multitasks, entity_multitasks: comma-separated objectives added
        to `nlp.parser` / `nlp.entity`.
    use_gpu: device passed to `nlp.begin_training`. Values >= 0 print a
        warning and trigger an additional CPU timing pass per iteration.
    no_tagger, no_parser, no_entities: remove the corresponding component
        from the loaded pipeline before training.
    gold_preproc: forwarded to `corpus.train_docs` and `corpus.dev_docs`.
    version: default value for `meta['version']`.
    meta_path: optional JSON file with meta data; its content must be a
        dict.
    verbose: forwarded to the first `evaluate` call of each iteration.
    """
    # There is a bug that prevents me from using the GPU when resuming
    # training from a saved model. See
    # https://github.com/explosion/spaCy/issues/1806.
    if use_gpu >= 0:
        msg = "\nWARNING: using GPU may require re-installing thinc. "
        msg += "See https://github.com/explosion/spaCy/issues/1806.\n"
        print(msg)

    util.fix_random_seed()
    util.set_env_log(True)
    n_sents = n_sents or None
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
    # Validate the paths; every failure is reported via prints(exits=1).
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title=Messages.M050, exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title=Messages.M051, exits=1)
    if meta_path is not None and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        prints(Messages.M053.format(meta_type=type(meta)),
               title=Messages.M052,
               exits=1)

    # Dropout and batch size are drawn from generators whose parameters
    # come from util.env_opt, with the defaults given below.
    # NOTE(review): with these defaults dropout_from == dropout_to, so
    # dropout presumably stays constant at 0.2 -- confirm against
    # util.decaying. Batch size starts at 1 and compounds up to 16.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    max_doc_len = util.env_opt('max_doc_len', 5000)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()

    # Load pre-trained model. Remove components that we are not
    # re-training.
    nlp = load(pretrained)
    if no_tagger and 'tagger' in nlp.pipe_names:
        nlp.remove_pipe('tagger')
    if no_parser and 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    if no_entities and 'ner' in nlp.pipe_names:
        nlp.remove_pipe('ner')
    # Fill in meta defaults and copy them onto the loaded model.
    meta.setdefault('name', 'unnamed')
    meta['pipeline'] = nlp.pipe_names
    meta.setdefault('lang', nlp.lang)
    nlp.meta.update(meta)

    # Add multi-task objectives
    if parser_multitasks:
        for objective in parser_multitasks.split(','):
            nlp.parser.add_multitask_objective(objective)
    if entity_multitasks:
        for objective in entity_multitasks.split(','):
            nlp.entity.add_multitask_objective(objective)

    # Get optimizer
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    # NOTE(review): presumably clears the optimizer reference kept on the
    # nlp object by begin_training -- confirm why this is needed.
    nlp._optimizer = None

    print(nlp.pipe_names)
    print(nlp.pipeline)

    # Header of the per-iteration progress table.
    print(
        "Itn.  Dep Loss  NER Loss  UAS     NER P.  NER R.  NER F.  Tag %   Token %  CPU WPS  GPU WPS"
    )
    try:
        # The training documents are materialised once and reused in
        # every iteration.
        train_docs = corpus.train_docs(nlp,
                                       projectivize=True,
                                       noise_level=0.0,
                                       gold_preproc=gold_preproc,
                                       max_length=0)
        train_docs = list(train_docs)
        for i in range(n_iter):
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    # Drop documents of max_doc_len tokens or more; skip
                    # the batch entirely if nothing is left.
                    batch = [(d, g) for (d, g) in batch
                             if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs,
                               golds,
                               sgd=optimizer,
                               drop=next(dropout_rates),
                               losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

            # Save and evaluate this iteration's model with the optimizer's
            # averaged parameters.
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                # Evaluation runs on the model reloaded from disk.
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(
                    corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs, verbose)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords / (end_time - start_time)
                else:
                    # The timing above was taken on the GPU; reload and
                    # evaluate again on the CPU to get a CPU words/second
                    # figure. This second run also replaces `scorer`.
                    gpu_wps = nwords / (end_time - start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(
                            epoch_model_path)
                        dev_docs = list(
                            corpus.dev_docs(nlp_loaded,
                                            gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords / (end_time - start_time)
                # Write the scores next to the saved model.
                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                # Refresh the meta data and overwrite the model's meta.json.
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {
                    'nwords': nwords,
                    'cpu': cpu_wps,
                    'gpu': gpu_wps
                }
                meta['vectors'] = {
                    'width': nlp.vocab.vectors_length,
                    'vectors': len(nlp.vocab.vectors),
                    'keys': nlp.vocab.vectors.n_keys
                }
                meta['lang'] = nlp.lang
                meta['pipeline'] = nlp.pipe_names
                meta['spacy_version'] = '>=%s' % about.__version__
                # 'name' was already given a default before training, so
                # this setdefault does not change it.
                meta.setdefault('name', 'model%d' % i)
                meta.setdefault('version', version)

                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i,
                           losses,
                           scorer.scores,
                           cpu_wps=cpu_wps,
                           gpu_wps=gpu_wps)
    finally:
        # Runs even when training is interrupted or fails, so the current
        # averaged parameters are always written to 'model-final'.
        print("Saving model...")
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / 'model-final'
            nlp.to_disk(final_model_path)