Example #1
def train_model(
    model,
    train_path,
    eval_path,
    n_iter=10,
    output=None,
    tok2vec=None,
):
    """
    Train a model from Prodigy annotations and optionally save out the best
    model to disk.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    ner = nlp.create_pipe("ner")
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner)
    t2v_cfg = {
        "embed_rows": 10000,
        "token_vector_width": 128,
        "conv_depth": 8,
        "nr_feature_tokens": 3,
    }
    optimizer = nlp.begin_training(
        component_cfg={"ner": t2v_cfg} if tok2vec else {})
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annots = zip(*batch)
            nlp.update(texts, annots, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            sc = nlp.evaluate(eval_data)
            if sc.ents_f > best_acc:
                best_acc = sc.ents_f
                if output:
                    best_model = nlp.to_bytes()
        acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
        msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
Example #2
    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        depr_patterns_path = path.with_suffix(".jsonl")
        if depr_patterns_path.is_file():
            patterns = srsly.read_jsonl(depr_patterns_path)
            self.add_patterns(patterns)
        else:
            cfg = {}
            deserializers = {
                "patterns": lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix(".jsonl"))),
                "cfg": lambda p: cfg.update(srsly.read_json(p)),
            }
            from_disk(path, deserializers, {})
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

            if self.phrase_matcher_attr is not None:
                self.phrase_matcher = PhraseMatcher(
                    self.nlp.vocab, attr=self.phrase_matcher_attr)
        return self
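The patterns file loaded above is plain newline-delimited JSON. A minimal sketch of writing and reading one with srsly (the file name is illustrative):

import srsly

# Each line is one pattern: "pattern" may be a phrase string or a list of
# token-attribute dicts.
patterns = [
    {"label": "ORG", "pattern": "Acme Corp"},
    {"label": "GPE", "pattern": [{"LOWER": "new"}, {"LOWER": "york"}]},
]
srsly.write_jsonl("patterns.jsonl", patterns)
print(list(srsly.read_jsonl("patterns.jsonl")))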
Example #3
    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        if path.is_file():
            patterns = srsly.read_jsonl(path)
            self.add_patterns(patterns)
        else:
            cfg = {}
            deserializers = {
                'patterns': lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix('.jsonl'))),
                'cfg': lambda p: cfg.update(srsly.read_json(p)),
            }
            from_disk(path, deserializers, {})
            self.overwrite = cfg.get('overwrite', False)
            self.ent_id_sep = cfg.get('ent_id_sep', DEFAULT_ENT_ID_SEP)
        return self
Example #4
def combine():
    total_he = 0
    total_combined = 0
    he_data = srsly.read_jsonl(f"{DATA_LOC}/he_mentions.jsonl")
    en_data = srsly.read_jsonl(f"{DATA_LOC}/en_mentions.jsonl")
    he_ref_map = defaultdict(list)
    en_ref_map = defaultdict(list)
    for he_row in he_data:
        he_ref_map[he_row["Ref"]] += [he_row]
    for en_row in en_data:
        en_ref_map[en_row["Ref"]] += [en_row]
    combined_data = []
    missing_data = []
    for tref, he_rows in he_ref_map.items():
        en_rows = en_ref_map[he_rows[0]["Ref"]]
        he_ids = {int(he_row["Bonayich ID"]) for he_row in he_rows}
        new_row = {
            "Book": he_rows[0]["Book"],
            "Ref": he_rows[0]["Ref"],
            "He Mentions": [{
                "Start": he_row["Start"],
                "End": he_row["End"],
                "Bonayich ID": int(he_row["Bonayich ID"]),
                "Mention": he_row["Mention"],
            } for he_row in he_rows],
            "En Mentions": [{
                "Start": en_row["Start"],
                "End": en_row["End"],
                "Bonayich ID": int(en_row["Bonayich ID"])
                if en_row["Bonayich ID"] is not None else None,
                "Mention": en_row["Mention"],
            } for en_row in en_rows],
        }
        new_row["En Mentions Filtered"] = list(
            filter(lambda x: x["Bonayich ID"] in he_ids,
                   new_row["En Mentions"]))
        en_filtered_ids = {
            int(en_row["Bonayich ID"])
            for en_row in new_row["En Mentions Filtered"]
        }
        new_row["He Mentions Filtered"] = list(
            filter(lambda x: x["Bonayich ID"] in en_filtered_ids,
                   new_row["He Mentions"]))
        total_he += len(new_row["He Mentions"])
        total_combined += len(new_row["En Mentions Filtered"])
        if len(new_row["He Mentions"]) > len(new_row["En Mentions Filtered"]):
            missing_data += [new_row]
        combined_data += [new_row]
    srsly.write_jsonl(f"{DATA_LOC}/combined_mentions.jsonl", combined_data)
    with open(f"{DATA_LOC}/missing_mentions.jsonl", "w") as fout:
        json.dump(missing_data, fout, ensure_ascii=False, indent=2)
    print(total_he, total_combined)
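Note that the two outputs above are written differently: combined_mentions.jsonl is true newline-delimited JSON via srsly.write_jsonl, while missing_mentions.jsonl is a single pretty-printed JSON array written with json.dump, despite its .jsonl extension. A small sketch of the difference (file names are illustrative):

import json
import srsly

rows = [{"Ref": "Berakhot 2a", "He Mentions": []}]

# JSONL: one compact JSON object per line, readable back with srsly.read_jsonl.
srsly.write_jsonl("rows.jsonl", rows)

# Pretty-printed JSON: a single indented array, not line-delimited.
with open("rows.json", "w") as fout:
    json.dump(rows, fout, ensure_ascii=False, indent=2)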
Example #5
def train_model(model,
                train_path,
                eval_path,
                n_iter=10,
                output="./model2/",
                tok2vec=None):
    spacy.util.fix_random_seed(0)

    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat")

        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")
    for label in labels:
        textcat.add_label(label)
    optimizer = nlp.begin_training(component_cfg={"textcat": {"exclusive_classes": True}})
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8)
    msg.row(("#", "L", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            #texts = [text for text, entities in batch]

            #annotations = [entities for text, entities in batch]
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            scorer = nlp.evaluate(eval_data)
            if scorer.textcat_score > best_acc:
                best_acc = scorer.textcat_score
                if output:
                    best_model = nlp.to_bytes()
        acc = f"{scorer.textcat_score:.3f}"
        msg.row((i + 1, f"{losses['textcat']:.2f}", acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
Example #6
def guess_most_likely_transliteration():
    """
    Creates a CSV mapping spellings of rabbis in English to most common spellings in Hebrew
    """
    prefixes = "בכ|וב|וה|וכ|ול|ומ|וש|כב|ככ|כל|כמ|כש|לכ|מב|מה|מכ|מל|מש|שב|שה|שכ|של|שמ|ב|כ|ל|מ|ש|ה|ו|ד".split('|')
    en_ents = srsly.read_jsonl('./research/prodigy/output/evaluation_results/talmud_en.jsonl')
    he_segs_by_ref = {ent['ref']: ent for ent in srsly.read_jsonl('./research/prodigy/output/evaluation_results/talmud_he.jsonl')}
    en_to_he_spellings = defaultdict(lambda: defaultdict(int))
    en_to_refs = defaultdict(list)
    for en_seg in en_ents:
        for start, end, _ in en_seg['tp']:
            en_mention = en_seg['text'][start:end]
            en_to_refs[en_mention] += [en_seg['ref']]
            he_seg = he_segs_by_ref[en_seg['ref']]
            for he_start, he_end, _ in he_seg['tp']:
                he_mention = he_seg['text'][he_start:he_end]
                en_to_he_spellings[en_mention][he_mention] += 1
    out_rows = []
    max_he = 0

    # repetitious but I'm lazy. calculate total he count first
    total_he_count = defaultdict(int)
    for en_mention, he_mention_count in en_to_he_spellings.items():
        for he_mention, count in he_mention_count.items():
            total_he_count[he_mention] += count
            for prefix in prefixes:
                if he_mention.startswith(prefix):
                    total_he_count[he_mention[len(prefix):]] += count
    # then use it to normalize counts
    for en_mention, he_mention_count in en_to_he_spellings.items():
        sans_prefix_count = defaultdict(int)
        for he_mention, count in he_mention_count.items():
            sans_prefix_count[he_mention] += count
            for prefix in prefixes:
                if he_mention.startswith(prefix):
                    sans_prefix_count[he_mention[len(prefix):]] += count
        for he_mention, count in sans_prefix_count.items():
            sans_prefix_count[he_mention] = math.log(count) - math.log(total_he_count[he_mention])
        best_hebrew = sorted(sans_prefix_count.items(), key=lambda x: x[1], reverse=True)[:5]
        out_row = {
            "En": en_mention,
            "Example Refs": " | ".join(en_to_refs[en_mention][:10])
        }
        for i, (he, count) in enumerate(best_hebrew):
            out_row[f"He {i+1}"] = he
            if i > max_he:
                max_he = i
        out_rows += [out_row]
    with open('/home/nss/sefaria/datasets/ner/sefaria/yerushalmi_title_possibilities.csv', 'w') as fout:
        cout = csv.DictWriter(fout, ['En', 'Example Refs'] + [f'He {i+1}' for i in range(max_he+1)])
        cout.writeheader()
        cout.writerows(out_rows)
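The normalization above scores each Hebrew spelling for a given English mention as log(pair count) - log(total count of that Hebrew spelling), i.e. the log of the fraction of the spelling's occurrences that co-occur with this English mention. A tiny worked sketch (numbers are illustrative):

import math

pair_count = 3    # Hebrew spelling seen with this English mention 3 times
total_count = 10  # the same Hebrew spelling appears 10 times overall
score = math.log(pair_count) - math.log(total_count)
print(score, math.log(pair_count / total_count))  # both are log(0.3) ~ -1.204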
Example #7
    def from_disk(self: SpaczzRuler, path: Union[str, Path],
                  **kwargs: Any) -> SpaczzRuler:
        """Load the spaczz ruler from a file.

        Expects a file containing newline-delimited JSON (JSONL)
        with one entry per line.

        Args:
            path: The JSONL file to load.
            **kwargs: Other config parameters, mostly for consistency.

        Returns:
            The loaded spaczz ruler.

        Example:
            >>> import os
            >>> import tempfile
            >>> import spacy
            >>> from spaczz.pipeline import SpaczzRuler
            >>> nlp = spacy.blank("en")
            >>> ruler = SpaczzRuler(nlp)
            >>> ruler.add_patterns([{"label": "AUTHOR", "pattern": "Kerouac",
                "type": "fuzzy"}])
            >>> with tempfile.TemporaryDirectory() as tmpdir:
            >>>     ruler.to_disk(f"{tmpdir}/ruler")
            >>>     new_ruler = SpaczzRuler(nlp)
            >>>     new_ruler = new_ruler.from_disk(f"{tmpdir}/ruler")
            >>> "AUTHOR" in new_ruler
            True
        """
        path = ensure_path(path)
        depr_patterns_path = path.with_suffix(".jsonl")
        if depr_patterns_path.is_file():
            patterns = srsly.read_jsonl(depr_patterns_path)
            self.add_patterns(patterns)
        else:
            cfg = {}
            deserializers_patterns = {
                "spaczz_patterns": lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix(".jsonl")))
            }
            deserializers_cfg = {
                "cfg": lambda p: cfg.update(srsly.read_json(p))
            }
            read_from_disk(path, deserializers_cfg, {})
            self.overwrite = cfg.get("spaczz_overwrite", False)
            self.defaults = cfg.get("spaczz_defaults", {})
            self.ent_id_sep = cfg.get("spaczz_ent_id_sep", DEFAULT_ENT_ID_SEP)
            read_from_disk(path, deserializers_patterns, {})
        return self
Example #8
def interpret_file(path, encoding='utf-8', readers: dict = None):
    """Read a file's using the proper loader from the extension"""
    path = Path(path).expanduser().resolve()
    s = path.suffix.lower()
    if readers is None:
        readers = {}
    elif not isinstance(readers, dict):
        assert callable(readers)
        readers = {s: readers}
    if s in readers:
        func = readers[s]
        assert callable(func)
        return func(path)
    elif s == '.json':
        return srsly.read_json(path)
    elif s == '.jsonl':
        return srsly.read_jsonl(path)
    elif s in ('.yml', '.yaml'):
        # return yaml.load(path.read_bytes(), Loader=YamlLoader)
        return yaml.load(path.read_bytes(), Loader=yaml.SafeLoader)
    elif s in ('.pkl', '.bin', '.pickle'):
        return srsly.pickle_loads(path.read_text(encoding=encoding))
    elif s not in _TEXT_EXT:
        return path.read_bytes()
    else:
        return path.read_text(encoding=encoding)
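A short usage sketch for interpret_file as defined above; the file name and the override are illustrative, and srsly must be installed:

from pathlib import Path

Path("example.jsonl").write_text('{"text": "hello"}\n{"text": "world"}\n')

# Default routing: .jsonl is read with srsly.read_jsonl (a generator).
rows = list(interpret_file("example.jsonl"))

# Override a single extension with your own callable.
raw = interpret_file("example.jsonl", readers={".jsonl": lambda p: p.read_text()})
print(rows)
print(raw)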
Example #9
def convert(json_path, output):
    db = DocBin()
    for line in srsly.read_jsonl(json_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output)
Example #10
def init_vocab(
    nlp: "Language",
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
) -> "Language":
    if lookups:
        nlp.vocab.lookups = lookups
        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        if len(nlp.vocab):
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    logger.info("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        logger.info(f"Added vectors: {vectors}")
    logger.info("Finished initializing nlp object")
Example #11
def convert_jsonl_to_csv(filename):
    j = srsly.read_jsonl(filename)
    rows = []
    for d in j:
        algo_guesses = {(s, e) for s, e, _ in (d['fp'] + d['tp'])}
        false_negs = {(s, e) for s, e, _ in d['fn']}
        all_algo_inds = set()
        for start, end in algo_guesses:
            all_algo_inds |= set(range(start, end))
        missed_tags = set()
        for start, end in false_negs:
            temp_inds = set(range(start, end))
            if len(temp_inds & all_algo_inds) == 0:
                missed_tags.add((start, end))
        for algo_missed, temp_data in zip(['n', 'y'], [algo_guesses, missed_tags]):
            for start, end in temp_data:
                before, after = get_window_around_match(start, end, d['text'])
                match = d['text'][start:end]
                rows += [{
                    "Before": before,
                    "After": after,
                    "Citation": match,
                    "Algorithm Missed": algo_missed
                }]

    with open(filename[:-5] + '.csv', 'w') as fout:
        c = csv.DictWriter(fout, ['Type', 'Correct?', 'Algorithm Missed', 'After', 'Citation', 'Before'])
        c.writeheader()
        c.writerows(rows)
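The record shape consumed here is inferred from the code rather than documented: each line has "text" plus "tp", "fp" and "fn" lists of (start, end, label) triples. A hedged sketch of one such input line (the spans and file name are illustrative):

import srsly

record = {
    "text": "See Berakhot 2a and Shabbat 31b for details.",
    "tp": [[4, 15, "CITATION"]],    # spans the algorithm found correctly
    "fp": [[20, 31, "CITATION"]],   # spans the algorithm found in error
    "fn": [],                       # gold spans the algorithm missed
}
srsly.write_jsonl("eval_results.jsonl", [record])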
Example #12
def test_git_extract_ignore_errors(mock_load_diffs):
    mock_load_diffs.side_effect = ValueError("simulated error")

    with tempfile.TemporaryDirectory() as tmpdir:
        runner = CliRunner()
        result = runner.invoke(
            cli,
            [
                "extract",
                "git-repo",
                "--customer-id=test",
                "--source-id=test",
                "--output-dir=" + tmpdir,
                "--forced-repo-name=test-repo",
                "--log-level=debug",
                "--log-to-file",
                "--ignore-errors",
                "--use-non-native-repo-db",
                ".",
            ],
            catch_exceptions=False,
        )
        assert result.exit_code == 0, result.output

        commits_file = [f for f in os.listdir(tmpdir) if "git_commits" in f][0]
        assert (list(srsly.read_jsonl(os.path.join(
            tmpdir,
            commits_file,
        ))) == [])
Example #13
def test_ingest_repo_to_jsonl(mock_name_getter):
    mock_name_getter.return_value = "repo-name"

    with tempfile.TemporaryDirectory() as tmpdir:
        git.ingest_repo_to_jsonl("customer-id",
                                 "source-id",
                                 ".",
                                 branch="master",
                                 output_dir=tmpdir)

        files = [f for f in os.listdir(tmpdir)]
        assert len(files) == 3

        for file_name in files:
            assert file_name.startswith("customer-id__source-id__")

            data = list(srsly.read_jsonl(os.path.join(tmpdir, file_name)))
            assert len(data)

            if git.GIT_REPO_TYPE in file_name:
                assert data[0]["name"] == "repo-name"
            elif git.GIT_COMMIT_TYPE in file_name:
                assert data[0]["tm_id"].startswith("gic")
            elif git.GIT_COMMIT_DIFF_TYPE in file_name:
                assert data[0]["tm_id"].startswith("gdf")
            else:
                pytest.fail("unexpected file type")
Example #14
def parse_annotation(path_to_annotation):
    annotation = srsly.read_jsonl(path_to_annotation)

    spans = []
    texts = []

    for entry in annotation:
        texts.append(entry["text"])
        temp = []
        if "spans" in entry:
            for span in entry["spans"]:
                temp.append([span["start"], span["end"], span["label"]])
        spans.append(temp)

    # build list of unique entries and list of empty annotation dictionaries for each
    annot_ls = []

    for text in texts:
        annot_ls.append({"entities": []})

    # populate annotation dictionaries
    for i in range(len(texts)):
        for span in spans[i]:
            annot_ls[i]["entities"].append(
                (int(span[0]), int(span[1]), span[2]))

    # build list of tuples
    tuples = []
    for i in range(len(texts)):
        tuples.append((texts[i], annot_ls[i]))

    return tuples
Example #15
    def _read(self, file_path: str) -> Iterable[Instance]:
        # Reset truncated/skipped counts
        self._source_max_truncated = 0
        self._source_max_skipped = 0
        self._num_images_skipped = 0

        lines = srsly.read_jsonl(file_path)
        lines = list(self.shard_iterable(lines))

        if self.produce_featurized_images:
            filenames = [
                ntpath.basename(info_dict["img"]) for info_dict in lines
            ]
            image_paths = [self.images[filename] for filename in filenames]
        else:
            image_paths = [None] * len(lines)

        batch = []
        for index, (line, image_path) in enumerate(zip(lines, image_paths)):
            text = line["text"]
            line_is_valid = self._validate_line(text, image_path)
            if not line_is_valid:
                continue
            batch.append((image_path, text, line.get("label", None), line))
            if len(batch) == self._batch_size or index == len(lines) - 1:
                # It would be much easier to just process one image at a time, but it's faster to process
                # them in batches. So this code gathers up instances until it has enough to fill up a batch
                # that needs processing, and then processes them all.
                batch_image_paths = [item[0] for item in batch]
                if batch_image_paths == [None] * len(batch_image_paths):
                    processed_images = batch_image_paths  # all nones
                else:
                    processed_images = self._process_image_paths(
                        batch_image_paths, use_cache=self._use_cache)

                for item, processed_image in zip(batch, processed_images):
                    yield self.text_to_instance(
                        image=processed_image,
                        text=item[1],
                        label=item[2],
                        metadata=item[3],
                    )

                # initialize batch items for next batch
                batch = []

        if self._source_max_tokens and (self._source_max_truncated
                                        or self._source_max_skipped):
            logger.info(
                "In %d instances, the source token length exceeded the max limit (%d) and were %s.",
                self._source_max_truncated
                if self._truncate_long_sequences else self._source_max_skipped,
                self._source_max_tokens,
                "truncated" if self._truncate_long_sequences else "skipped",
            )
        if self._num_images_skipped:
            logger.info(
                "In %d instances, the image was non RGB image.",
                self._num_images_skipped,
            )
Example #16
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for line in srsly.read_jsonl(input_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
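A hedged usage sketch for the converter above, assuming spaCy v3 and srsly are installed (file names are illustrative). Each JSONL line needs "text" and a "cats" dict mapping category names to 0.0/1.0:

import srsly

srsly.write_jsonl("train.jsonl", [
    {"text": "I loved this film", "cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}},
    {"text": "Terrible plot", "cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}},
])
convert("en", "train.jsonl", "train.spacy")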
Example #17
def convert(output_path):
    global nlp
    db = DocBin()
    for line in srsly.read_jsonl("db.json"):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
Example #18
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    # Mostly used for backwards-compatibility and may be removed in the future
    lex_attrs = srsly.read_jsonl(jsonl_loc)
    for attrs in lex_attrs:
        if "settings" in attrs:
            continue
        lexeme = nlp.vocab[attrs["orth"]]
        lexeme.set_attrs(**attrs)
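The JSONL consumed here (and by init_vocab above) has one object per line: an optional "settings" entry that is skipped, then records keyed by "orth" whose remaining keys are lexeme attributes passed to set_attrs. A minimal sketch; the exact attribute set depends on the spaCy version, so treat these fields as assumptions:

import srsly

lex_attrs = [
    {"settings": {"oov_prob": -20.0}},                # skipped by update_lexemes
    {"orth": "apple", "prob": -10.5, "cluster": 42},  # set on nlp.vocab["apple"]
]
srsly.write_jsonl("lex_attrs.jsonl", lex_attrs)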
Example #19
def convert_training_to_displacy(jsonl_loc):
    out = []
    for text, tags in srsly.read_jsonl(jsonl_loc):
        out += [{
            'text': text,
            'ents': sorted([{'start': s, 'end': e, 'label': l} for s, e, l in tags['entities']], key=lambda x: x['start'])
        }]
    srsly.write_jsonl(jsonl_loc + '.displacy', out)
Example #20
    def from_disk(
        self,
        path: Union[str, Path],
        *,
        exclude: Iterable[str] = SimpleFrozenList()) -> "EntityRuler":
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (str / Path): The JSONL file to load.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        self.clear()
        depr_patterns_path = path.with_suffix(".jsonl")
        if path.suffix == ".jsonl":  # user provides a jsonl
            if path.is_file():
                patterns = srsly.read_jsonl(path)
                self.add_patterns(patterns)
            else:
                raise ValueError(Errors.E1023.format(path=path))
        elif depr_patterns_path.is_file():
            patterns = srsly.read_jsonl(depr_patterns_path)
            self.add_patterns(patterns)
        elif path.is_dir():  # path is a valid directory
            cfg = {}
            deserializers_patterns = {
                "patterns": lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix(".jsonl")))
            }
            deserializers_cfg = {
                "cfg": lambda p: cfg.update(srsly.read_json(p))
            }
            from_disk(path, deserializers_cfg, {})
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

            self.phrase_matcher = PhraseMatcher(self.nlp.vocab,
                                                attr=self.phrase_matcher_attr)
            from_disk(path, deserializers_patterns, {})
        else:  # path is not a valid directory or file
            raise ValueError(Errors.E146.format(path=path))
        return self
Example #21
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt file or zipped as a .zip or .tar.gz.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
            if freqs_loc:
                settings.append("-f")
            if clusters_loc:
                settings.append("-c")
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name)
    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors,
                    vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Sucessfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
Example #22
def create_data(cfg: Config) -> Tuple[InputData, InputData]:
    data = list(srsly.read_jsonl(Path(cfg.path).expanduser()))
    if cfg.ndata > 0:
        data = random.sample(data, k=cfg.ndata)
    else:
        cfg.ndata = len(data)
    train, val = train_test_split(data, test_size=cfg.val_size)
    srsly.write_jsonl(Path.cwd() / f"train-data.jsonl", train)
    srsly.write_jsonl(Path.cwd() / f"val-data.jsonl", val)
    return train, val
Example #23
def evaluate_model(model, eval_path):
    """
    Evaluate a trained model on Prodigy annotations and print the accuracy.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    data, _ = format_data(srsly.read_jsonl(eval_path))
    sc = nlp.evaluate(data)
    result = [("F-Score", f"{sc.textcat_score:.3f}")]
    msg.table(result)
Example #24
def func():
    nlp = spacy_load("en_core_web_sm")
    doc = nlp(text_path.read_text())
    matcher = Matcher(doc.vocab)
    patterns = [
        p["pattern"] if "pattern" in p else p
        for p in read_jsonl(patterns_path)
    ]
    matcher.add("Profile", patterns)
    matcher(doc)
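A hedged sketch of the patterns file read above: each line is either a bare token-pattern list or an object with a "pattern" key, which is the wrapper form the comprehension unwraps (file name and patterns are illustrative):

import srsly

patterns = [
    [{"LOWER": "python"}, {"LOWER": "developer"}],             # bare token pattern
    {"label": "SKILL", "pattern": [{"LOWER": "kubernetes"}]},  # wrapped; only "pattern" is used
]
srsly.write_jsonl("patterns.jsonl", patterns)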
Example #25
def read_gold_data(nlp, gold_loc):
    docs = []
    golds = []
    for json_obj in srsly.read_jsonl(gold_loc):
        doc = nlp.make_doc(json_obj["text"])
        ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]]
        gold = GoldParse(doc, entities=ents)
        docs.append(doc)
        golds.append(gold)
    return list(zip(docs, golds))
Example #27
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    prune_vectors=-1,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt file or zipped as a .zip or .tar.gz.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
            if freqs_loc:
                settings.append("-f")
            if clusters_loc:
                settings.append("-c")
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs)
    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, prune_vectors)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Sucessfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
Example #28
def read_json(path: Path) -> List[Example]:
    """Read annotations in JSON file format
    
    Args:
        path (Path): Path to data
    
    Returns:
        List[Example]: List of examples
    """
    data = srsly.read_jsonl(path)
    examples = json_to_examples(data)
    return examples
Example #29
def _main(cfg):
    cfg = parse(cfg)
    if cfg.seed:
        set_seed(cfg.seed)
    org_cwd = hydra.utils.get_original_cwd()
    logger.info(cfg.pretty())
    nlp = cast(TorchLanguage, create_model(cfg.model))
    train_data = list(
        srsly.read_jsonl(os.path.join(org_cwd, cfg.train.data.train)))
    cfg.train.data.ndata = len(train_data)
    val_data = list(srsly.read_jsonl(os.path.join(org_cwd,
                                                  cfg.train.data.val)))
    logger.info("output dir: {}".format(os.getcwd()))
    if torch.cuda.is_available():
        logger.info("CUDA enabled")
        nlp.to(torch.device("cuda"))
    savedir = Path.cwd() / "models"
    srsly.write_jsonl(Path.cwd() / f"train-data.jsonl", train_data)
    srsly.write_jsonl(Path.cwd() / f"val-data.jsonl", val_data)
    savedir.mkdir(exist_ok=True)
    train(cfg.train, nlp, train_data, val_data, savedir)
Example #30
def main(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin()
    data_tuples = ((eg["text"], eg) for eg in srsly.read_jsonl(input_path))
    for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
        # doc.cats = {category: 0 for category in CATEGORIES}
        doc.cats[eg["label"]] = 1
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example #31
def load_data(filepath):
    examples = list(srsly.read_jsonl(filepath))
    rows = []
    n_total_ents = 0
    n_no_ents = 0
    labels = set()
    for eg in examples:
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        labels.update([span["label"] for span in row["ents"]])
        rows.append(row)
    return rows, labels, n_total_ents, n_no_ents
Example #32
    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        patterns = srsly.read_jsonl(path)
        self.add_patterns(patterns)
        return self
Example #34
def _load_file(file_path, msg):
    file_name = file_path.parts[-1]
    if file_path.suffix == ".json":
        with msg.loading("Loading {}...".format(file_name)):
            data = srsly.read_json(file_path)
        msg.good("Loaded {}".format(file_name))
        return data
    elif file_path.suffix == ".jsonl":
        with msg.loading("Loading {}...".format(file_name)):
            data = srsly.read_jsonl(file_path)
        msg.good("Loaded {}".format(file_name))
        return data
    msg.fail(
        "Can't load file extension {}".format(file_path.suffix),
        "Expected .json or .jsonl",
        exits=1,
    )
Example #35
def read_raw_data(nlp, jsonl_loc):
    for json_obj in srsly.read_jsonl(jsonl_loc):
        if json_obj["text"].strip():
            doc = nlp.make_doc(json_obj["text"])
            yield doc
Example #36
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe(pipe))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            nlp.add_pipe(nlp.create_pipe(pipe))

    if learn_tokens:
        nlp.add_pipe(nlp.create_pipe("merge_subtokens"))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(i - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #37
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc. to predict
    vectors which match the pre-trained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pre-trained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=0,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    for epoch in range(n_iter):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
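A hedged sketch of the raw-text JSONL this command reads: one object per line with a "text" key (spaCy v2's pretrain docs also accept a "tokens" key with pre-tokenized words). The file name is illustrative:

import srsly

texts = [
    {"text": "This is a raw sentence for language-model pretraining."},
    {"text": "Another line of unlabelled text."},
]
srsly.write_jsonl("texts.jsonl", texts)
# Then, roughly: python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain_out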