def train_model(
    model, train_path, eval_path, n_iter=10, output=None, tok2vec=None,
):
    """
    Train a model from Prodigy annotations and optionally save out the best
    model to disk.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    ner = nlp.create_pipe("ner")
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner)
    t2v_cfg = {
        "embed_rows": 10000,
        "token_vector_width": 128,
        "conv_depth": 8,
        "nr_feature_tokens": 3,
    }
    optimizer = nlp.begin_training(component_cfg={"ner": t2v_cfg} if tok2vec else {})
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annots = zip(*batch)
            nlp.update(texts, annots, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            sc = nlp.evaluate(eval_data)
            if sc.ents_f > best_acc:
                best_acc = sc.ents_f
                if output:
                    best_model = nlp.to_bytes()
        acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
        msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
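# Illustrative invocation of the NER train_model above; the "blank:en" base
# model and the file paths are placeholders, not values from the original project.
train_model(
    "blank:en",
    train_path="./annotations/train.jsonl",
    eval_path="./annotations/eval.jsonl",
    n_iter=20,
    output="./ner_model",
)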
def from_disk(self, path, **kwargs):
    """Load the entity ruler from a file. Expects a file containing
    newline-delimited JSON (JSONL) with one entry per line.

    path (unicode / Path): The JSONL file to load.
    **kwargs: Other config parameters, mostly for consistency.
    RETURNS (EntityRuler): The loaded entity ruler.

    DOCS: https://spacy.io/api/entityruler#from_disk
    """
    path = ensure_path(path)
    depr_patterns_path = path.with_suffix(".jsonl")
    if depr_patterns_path.is_file():
        patterns = srsly.read_jsonl(depr_patterns_path)
        self.add_patterns(patterns)
    else:
        cfg = {}
        deserializers = {
            "patterns": lambda p: self.add_patterns(
                srsly.read_jsonl(p.with_suffix(".jsonl"))
            ),
            "cfg": lambda p: cfg.update(srsly.read_json(p)),
        }
        from_disk(path, deserializers, {})
        self.overwrite = cfg.get("overwrite", False)
        self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
        self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
        if self.phrase_matcher_attr is not None:
            self.phrase_matcher = PhraseMatcher(
                self.nlp.vocab, attr=self.phrase_matcher_attr
            )
    return self
def from_disk(self, path, **kwargs):
    """Load the entity ruler from a file. Expects a file containing
    newline-delimited JSON (JSONL) with one entry per line.

    path (unicode / Path): The JSONL file to load.
    **kwargs: Other config parameters, mostly for consistency.
    RETURNS (EntityRuler): The loaded entity ruler.

    DOCS: https://spacy.io/api/entityruler#from_disk
    """
    path = ensure_path(path)
    if path.is_file():
        patterns = srsly.read_jsonl(path)
        self.add_patterns(patterns)
    else:
        cfg = {}
        deserializers = {
            'patterns': lambda p: self.add_patterns(
                srsly.read_jsonl(p.with_suffix('.jsonl'))
            ),
            'cfg': lambda p: cfg.update(srsly.read_json(p)),
        }
        from_disk(path, deserializers, {})
        self.overwrite = cfg.get('overwrite', False)
        self.ent_id_sep = cfg.get('ent_id_sep', DEFAULT_ENT_ID_SEP)
    return self
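# A minimal sketch of how the from_disk implementations above are exercised
# through spaCy v2.x's public EntityRuler API; the pattern and file name are
# made up for illustration.
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp)
ruler.add_patterns([{"label": "ORG", "pattern": "Explosion AI"}])
ruler.to_disk("patterns.jsonl")       # writes newline-delimited JSON
new_ruler = EntityRuler(nlp).from_disk("patterns.jsonl")
assert "ORG" in new_ruler.labels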
def combine():
    total_he = 0
    total_combined = 0
    he_data = srsly.read_jsonl(f"{DATA_LOC}/he_mentions.jsonl")
    en_data = srsly.read_jsonl(f"{DATA_LOC}/en_mentions.jsonl")
    he_ref_map = defaultdict(list)
    en_ref_map = defaultdict(list)
    for he_row in he_data:
        he_ref_map[he_row["Ref"]] += [he_row]
    for en_row in en_data:
        en_ref_map[en_row["Ref"]] += [en_row]
    combined_data = []
    missing_data = []
    for tref, he_rows in he_ref_map.items():
        en_rows = en_ref_map[he_rows[0]["Ref"]]
        he_ids = {int(he_row["Bonayich ID"]) for he_row in he_rows}
        new_row = {
            "Book": he_rows[0]["Book"],
            "Ref": he_rows[0]["Ref"],
            "He Mentions": [{
                "Start": he_row["Start"],
                "End": he_row["End"],
                "Bonayich ID": int(he_row["Bonayich ID"]),
                "Mention": he_row["Mention"],
            } for he_row in he_rows],
            "En Mentions": [{
                "Start": en_row["Start"],
                "End": en_row["End"],
                "Bonayich ID": int(en_row["Bonayich ID"]) if en_row["Bonayich ID"] is not None else None,
                "Mention": en_row["Mention"],
            } for en_row in en_rows],
        }
        new_row["En Mentions Filtered"] = list(
            filter(lambda x: x["Bonayich ID"] in he_ids, new_row["En Mentions"]))
        en_filtered_ids = {
            int(he_row["Bonayich ID"]) for he_row in new_row["En Mentions Filtered"]
        }
        new_row["He Mentions Filtered"] = list(
            filter(lambda x: x["Bonayich ID"] in en_filtered_ids, new_row["He Mentions"]))
        total_he += len(new_row["He Mentions"])
        total_combined += len(new_row["En Mentions Filtered"])
        if len(new_row["He Mentions"]) > len(new_row["En Mentions Filtered"]):
            missing_data += [new_row]
        combined_data += [new_row]
    srsly.write_jsonl(f"{DATA_LOC}/combined_mentions.jsonl", combined_data)
    with open(f"{DATA_LOC}/missing_mentions.jsonl", "w") as fout:
        json.dump(missing_data, fout, ensure_ascii=False, indent=2)
    print(total_he, total_combined)
def train_model(model, train_path, eval_path, n_iter=10, output="./model2/", tok2vec=None):
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat")
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")
    for label in labels:
        textcat.add_label(label)
    # component_cfg maps component names to their config options
    optimizer = nlp.begin_training(component_cfg={"textcat": {"exclusive_classes": True}})
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8)
    msg.row(("#", "L", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            scorer = nlp.evaluate(eval_data)
            if scorer.textcat_score > best_acc:
                best_acc = scorer.textcat_score
                if output:
                    best_model = nlp.to_bytes()
        acc = f"{scorer.textcat_score:.3f}"
        msg.row((i + 1, f"{losses['textcat']:.2f}", acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
def guess_most_likely_transliteration():
    """
    Creates a CSV mapping spellings of rabbis in English to the most common
    spellings in Hebrew
    """
    prefixes = "בכ|וב|וה|וכ|ול|ומ|וש|כב|ככ|כל|כמ|כש|לכ|מב|מה|מכ|מל|מש|שב|שה|שכ|של|שמ|ב|כ|ל|מ|ש|ה|ו|ד".split('|')
    en_ents = srsly.read_jsonl('./research/prodigy/output/evaluation_results/talmud_en.jsonl')
    he_segs_by_ref = {ent['ref']: ent for ent in srsly.read_jsonl('./research/prodigy/output/evaluation_results/talmud_he.jsonl')}
    en_to_he_spellings = defaultdict(lambda: defaultdict(int))
    en_to_refs = defaultdict(list)
    for en_seg in en_ents:
        for start, end, _ in en_seg['tp']:
            en_mention = en_seg['text'][start:end]
            en_to_refs[en_mention] += [en_seg['ref']]
            he_seg = he_segs_by_ref[en_seg['ref']]
            for he_start, he_end, _ in he_seg['tp']:
                he_mention = he_seg['text'][he_start:he_end]
                en_to_he_spellings[en_mention][he_mention] += 1
    out_rows = []
    max_he = 0
    # repetitious, but calculate the total Hebrew counts first...
    total_he_count = defaultdict(int)
    for en_mention, he_mention_count in en_to_he_spellings.items():
        for he_mention, count in he_mention_count.items():
            total_he_count[he_mention] += count
            for prefix in prefixes:
                if he_mention.startswith(prefix):
                    total_he_count[he_mention[len(prefix):]] += count
    # ...then use them to normalize the per-English-spelling counts
    for en_mention, he_mention_count in en_to_he_spellings.items():
        sans_prefix_count = defaultdict(int)
        for he_mention, count in he_mention_count.items():
            sans_prefix_count[he_mention] += count
            for prefix in prefixes:
                if he_mention.startswith(prefix):
                    sans_prefix_count[he_mention[len(prefix):]] += count
        for he_mention, count in sans_prefix_count.items():
            sans_prefix_count[he_mention] = math.log(count) - math.log(total_he_count[he_mention])
        best_hebrew = sorted(sans_prefix_count.items(), key=lambda x: x[1], reverse=True)[:5]
        out_row = {
            "En": en_mention,
            "Example Refs": " | ".join(en_to_refs[en_mention][:10]),
        }
        for i, (he, count) in enumerate(best_hebrew):
            out_row[f"He {i+1}"] = he
            if i > max_he:
                max_he = i
        out_rows += [out_row]
    with open('/home/nss/sefaria/datasets/ner/sefaria/yerushalmi_title_possibilities.csv', 'w') as fout:
        cout = csv.DictWriter(fout, ['En', 'Example Refs'] + [f'He {i+1}' for i in range(max_he+1)])
        cout.writeheader()
        cout.writerows(out_rows)
def from_disk(
    self: SpaczzRuler, path: Union[str, Path], **kwargs: Any
) -> SpaczzRuler:
    """Load the spaczz ruler from a file.

    Expects a file containing newline-delimited JSON (JSONL)
    with one entry per line.

    Args:
        path: The JSONL file to load.
        **kwargs: Other config parameters, mostly for consistency.

    Returns:
        The loaded spaczz ruler.

    Example:
        >>> import os
        >>> import tempfile
        >>> import spacy
        >>> from spaczz.pipeline import SpaczzRuler
        >>> nlp = spacy.blank("en")
        >>> ruler = SpaczzRuler(nlp)
        >>> ruler.add_patterns([{"label": "AUTHOR", "pattern": "Kerouac", "type": "fuzzy"}])
        >>> with tempfile.TemporaryDirectory() as tmpdir:
        >>>     ruler.to_disk(f"{tmpdir}/ruler")
        >>>     new_ruler = SpaczzRuler(nlp)
        >>>     new_ruler = new_ruler.from_disk(f"{tmpdir}/ruler")
        >>> "AUTHOR" in new_ruler
        True
    """
    path = ensure_path(path)
    depr_patterns_path = path.with_suffix(".jsonl")
    if depr_patterns_path.is_file():
        patterns = srsly.read_jsonl(depr_patterns_path)
        self.add_patterns(patterns)
    else:
        cfg = {}
        deserializers_patterns = {
            "spaczz_patterns": lambda p: self.add_patterns(
                srsly.read_jsonl(p.with_suffix(".jsonl"))
            )
        }
        deserializers_cfg = {
            "cfg": lambda p: cfg.update(srsly.read_json(p))
        }
        read_from_disk(path, deserializers_cfg, {})
        self.overwrite = cfg.get("spaczz_overwrite", False)
        self.defaults = cfg.get("spaczz_defaults", {})
        self.ent_id_sep = cfg.get("spaczz_ent_id_sep", DEFAULT_ENT_ID_SEP)
        read_from_disk(path, deserializers_patterns, {})
    return self
def interpret_file(path, encoding='utf-8', readers: dict = None):
    """Read a file using the loader appropriate for its extension"""
    path = Path(path).expanduser().resolve()
    s = path.suffix.lower()
    if readers is None:
        readers = {}
    elif not isinstance(readers, dict):
        assert callable(readers)
        readers = {s: readers}
    if s in readers:
        func = readers[s]
        assert callable(func)
        return func(path)
    elif s == '.json':
        return srsly.read_json(path)
    elif s == '.jsonl':
        return srsly.read_jsonl(path)
    elif s in ('.yml', '.yaml'):
        # return yaml.load(path.read_bytes(), Loader=YamlLoader)
        return yaml.load(path.read_bytes())
    elif s in ('.pkl', '.bin', '.pickle'):
        return srsly.pickle_loads(path.read_text(encoding=encoding))
    elif s not in _TEXT_EXT:
        return path.read_bytes()
    else:
        return path.read_text(encoding=encoding)
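# A small usage sketch for interpret_file above; the file names and the custom
# CSV reader are illustrative assumptions, not part of the original module.
import csv
from pathlib import Path

def _read_csv(path: Path):
    with open(path, newline='') as f:
        return list(csv.DictReader(f))

records = interpret_file("data/annotations.jsonl")            # dispatches to srsly.read_jsonl
config = interpret_file("config.yaml")                        # dispatches to yaml.load
table = interpret_file("lexicon.csv", readers={".csv": _read_csv})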
def convert(json_path, output):
    db = DocBin()
    for line in srsly.read_jsonl(json_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output)
def init_vocab(
    nlp: "Language",
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
) -> "Language":
    if lookups:
        nlp.vocab.lookups = lookups
        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        if len(nlp.vocab):
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    logger.info("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        logger.info(f"Added vectors: {vectors}")
    logger.info("Finished initializing nlp object")
def convert_jsonl_to_csv(filename):
    j = srsly.read_jsonl(filename)
    rows = []
    for d in j:
        algo_guesses = {(s, e) for s, e, _ in (d['fp'] + d['tp'])}
        false_negs = {(s, e) for s, e, _ in d['fn']}
        all_algo_inds = set()
        for start, end in algo_guesses:
            all_algo_inds |= set(range(start, end))
        missed_tags = set()
        for start, end in false_negs:
            temp_inds = set(range(start, end))
            if len(temp_inds & all_algo_inds) == 0:
                missed_tags.add((start, end))
        for algo_missed, temp_data in zip(['n', 'y'], [algo_guesses, missed_tags]):
            for start, end in temp_data:
                before, after = get_window_around_match(start, end, d['text'])
                match = d['text'][start:end]
                rows += [{
                    "Before": before,
                    "After": after,
                    "Citation": match,
                    "Algorithm Missed": algo_missed,
                }]
    with open(filename[:-5] + '.csv', 'w') as fout:
        c = csv.DictWriter(fout, ['Type', 'Correct?', 'Algorithm Missed', 'After', 'Citation', 'Before'])
        c.writeheader()
        c.writerows(rows)
def test_git_extract_ignore_errors(mock_load_diffs):
    mock_load_diffs.side_effect = ValueError("simulated error")
    with tempfile.TemporaryDirectory() as tmpdir:
        runner = CliRunner()
        result = runner.invoke(
            cli,
            [
                "extract",
                "git-repo",
                "--customer-id=test",
                "--source-id=test",
                "--output-dir=" + tmpdir,
                "--forced-repo-name=test-repo",
                "--log-level=debug",
                "--log-to-file",
                "--ignore-errors",
                "--use-non-native-repo-db",
                ".",
            ],
            catch_exceptions=False,
        )
        assert result.exit_code == 0, result.output
        commits_file = [f for f in os.listdir(tmpdir) if "git_commits" in f][0]
        assert list(srsly.read_jsonl(os.path.join(tmpdir, commits_file))) == []
def test_ingest_repo_to_jsonl(mock_name_getter):
    mock_name_getter.return_value = "repo-name"
    with tempfile.TemporaryDirectory() as tmpdir:
        git.ingest_repo_to_jsonl("customer-id", "source-id", ".", branch="master", output_dir=tmpdir)
        files = [f for f in os.listdir(tmpdir)]
        assert len(files) == 3
        for file_name in files:
            assert file_name.startswith(f"customer-id__source-id__")
            data = list(srsly.read_jsonl(os.path.join(tmpdir, file_name)))
            assert len(data)
            if git.GIT_REPO_TYPE in file_name:
                assert data[0]["name"] == "repo-name"
            elif git.GIT_COMMIT_TYPE in file_name:
                assert data[0]["tm_id"].startswith("gic")
            elif git.GIT_COMMIT_DIFF_TYPE in file_name:
                assert data[0]["tm_id"].startswith("gdf")
            else:
                pytest.fail("unexpected file type")
def parse_annotation(path_to_annotation):
    annotation = srsly.read_jsonl(path_to_annotation)
    spans = []
    texts = []
    for entry in annotation:
        texts.append(entry["text"])
        temp = []
        if "spans" in entry:
            for span in entry["spans"]:
                temp.append([span["start"], span["end"], span["label"]])
        spans.append(temp)
    # build an empty annotation dictionary for each text
    annot_ls = []
    for text in texts:
        annot_ls.append({"entities": []})
    # populate the annotation dictionaries
    for i in range(len(texts)):
        for span in spans[i]:
            annot_ls[i]["entities"].append(
                (int(span[0]), int(span[1]), span[2]))
    # build the list of (text, annotations) tuples
    tuples = []
    for i in range(len(texts)):
        tuples.append((texts[i], annot_ls[i]))
    return tuples
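# A minimal, self-contained check of parse_annotation above, assuming
# Prodigy-style NER annotations; the sample record is made up for illustration.
import os
import tempfile
import srsly

sample = [{"text": "Rabbi Akiva said", "spans": [{"start": 0, "end": 11, "label": "PERSON"}]}]
tmp = os.path.join(tempfile.mkdtemp(), "sample.jsonl")
srsly.write_jsonl(tmp, sample)
assert parse_annotation(tmp) == [("Rabbi Akiva said", {"entities": [(0, 11, "PERSON")]})]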
def _read(self, file_path: str) -> Iterable[Instance]: # Reset truncated/skipped counts self._source_max_truncated = 0 self._source_max_skipped = 0 self._num_images_skipped = 0 lines = srsly.read_jsonl(file_path) lines = list(self.shard_iterable(lines)) if self.produce_featurized_images: filenames = [ ntpath.basename(info_dict["img"]) for info_dict in lines ] image_paths = [self.images[filename] for filename in filenames] else: image_paths = [None] * len(lines) batch = [] for index, (line, image_path) in enumerate(zip(lines, image_paths)): text = line["text"] line_is_valid = self._validate_line(text, image_path) if not line_is_valid: continue batch.append((image_path, text, line.get("label", None), line)) if len(batch) == self._batch_size or index == len(lines) - 1: # It would be much easier to just process one image at a time, but it's faster to process # them in batches. So this code gathers up instances until it has enough to fill up a batch # that needs processing, and then processes them all. batch_image_paths = [item[0] for item in batch] if batch_image_paths == [None] * len(batch_image_paths): processed_images = batch_image_paths # all nones else: processed_images = self._process_image_paths( batch_image_paths, use_cache=self._use_cache) for item, processed_image in zip(batch, processed_images): yield self.text_to_instance( image=processed_image, text=item[1], label=item[2], metadata=item[3], ) # initialize batch items for next batch batch = [] if self._source_max_tokens and (self._source_max_truncated or self._source_max_skipped): logger.info( "In %d instances, the source token length exceeded the max limit (%d) and were %s.", self._source_max_truncated if self._truncate_long_sequences else self._source_max_skipped, self._source_max_tokens, "truncated" if self._truncate_long_sequences else "skipped", ) if self._num_images_skipped: logger.info( "In %d instances, the image was non RGB image.", self._num_images_skipped, )
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for line in srsly.read_jsonl(input_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
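# Sketch of how the convert helper above is typically used for a textcat
# corpus; the file names and category labels are illustrative assumptions.
import spacy
import srsly
from pathlib import Path
from spacy.tokens import DocBin

srsly.write_jsonl("train.jsonl", [
    {"text": "A gripping read", "cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}},
])
convert("en", Path("train.jsonl"), Path("train.spacy"))

# The resulting .spacy file can be inspected, or passed to `spacy train`:
nlp = spacy.blank("en")
docs = list(DocBin().from_disk("train.spacy").get_docs(nlp.vocab))
print(docs[0].cats)  # {'POSITIVE': 1.0, 'NEGATIVE': 0.0}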
def convert(output_path):
    global nlp
    db = DocBin()
    for line in srsly.read_jsonl("db.json"):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    # Mostly used for backwards-compatibility and may be removed in the future
    lex_attrs = srsly.read_jsonl(jsonl_loc)
    for attrs in lex_attrs:
        if "settings" in attrs:
            continue
        lexeme = nlp.vocab[attrs["orth"]]
        lexeme.set_attrs(**attrs)
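# A minimal sketch of feeding update_lexemes above; the lexeme record and its
# "orth"/"norm" values are invented for illustration.
import spacy
import srsly

nlp = spacy.blank("en")
srsly.write_jsonl("lexemes.jsonl", [{"orth": "apple", "norm": "apples"}])
update_lexemes(nlp, "lexemes.jsonl")   # records with a "settings" key are skipped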
def convert_training_to_displacy(jsonl_loc):
    out = []
    for text, tags in srsly.read_jsonl(jsonl_loc):
        out += [{
            'text': text,
            'ents': sorted([
                {'start': s, 'end': e, 'label': l}
                for s, e, l in tags['entities']
            ], key=lambda x: x['start'])
        }]
    srsly.write_jsonl(jsonl_loc + '.displacy', out)
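# The records written above follow displaCy's "manual" entity format, so they
# can be rendered directly; a brief sketch, with the output file name assumed
# to come from converting a hypothetical train.jsonl.
import srsly
from spacy import displacy

examples = list(srsly.read_jsonl("train.jsonl.displacy"))
html = displacy.render(examples, style="ent", manual=True, page=True)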
def from_disk(
    self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":
    """Load the entity ruler from a file. Expects a file containing
    newline-delimited JSON (JSONL) with one entry per line.

    path (str / Path): The JSONL file to load.
    RETURNS (EntityRuler): The loaded entity ruler.

    DOCS: https://spacy.io/api/entityruler#from_disk
    """
    path = ensure_path(path)
    self.clear()
    depr_patterns_path = path.with_suffix(".jsonl")
    if path.suffix == ".jsonl":  # user provides a jsonl
        if path.is_file():
            patterns = srsly.read_jsonl(path)
            self.add_patterns(patterns)
        else:
            raise ValueError(Errors.E1023.format(path=path))
    elif depr_patterns_path.is_file():
        patterns = srsly.read_jsonl(depr_patterns_path)
        self.add_patterns(patterns)
    elif path.is_dir():  # path is a valid directory
        cfg = {}
        deserializers_patterns = {
            "patterns": lambda p: self.add_patterns(
                srsly.read_jsonl(p.with_suffix(".jsonl"))
            )
        }
        deserializers_cfg = {
            "cfg": lambda p: cfg.update(srsly.read_json(p))
        }
        from_disk(path, deserializers_cfg, {})
        self.overwrite = cfg.get("overwrite", False)
        self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
        self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
        self.phrase_matcher = PhraseMatcher(
            self.nlp.vocab, attr=self.phrase_matcher_attr
        )
        from_disk(path, deserializers_patterns, {})
    else:  # path is not a valid directory or file
        raise ValueError(Errors.E146.format(path=path))
    return self
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
            if freqs_loc:
                settings.append("-f")
            if clusters_loc:
                settings.append("-c")
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name)
    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Successfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
def create_data(cfg: Config) -> Tuple[InputData, InputData]:
    data = list(srsly.read_jsonl(Path(cfg.path).expanduser()))
    if cfg.ndata > 0:
        data = random.sample(data, k=cfg.ndata)
    else:
        cfg.ndata = len(data)
    train, val = train_test_split(data, test_size=cfg.val_size)
    srsly.write_jsonl(Path.cwd() / f"train-data.jsonl", train)
    srsly.write_jsonl(Path.cwd() / f"val-data.jsonl", val)
    return train, val
def evaluate_model(model, eval_path):
    """
    Evaluate a trained model on Prodigy annotations and print the accuracy.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    data, _ = format_data(srsly.read_jsonl(eval_path))
    sc = nlp.evaluate(data)
    result = [("F-Score", f"{sc.textcat_score:.3f}")]
    msg.table(result)
def func():
    nlp = spacy_load("en_core_web_sm")
    doc = nlp(text_path.read_text())
    matcher = Matcher(doc.vocab)
    patterns = [
        p["pattern"] if "pattern" in p else p
        for p in read_jsonl(patterns_path)
    ]
    matcher.add("Profile", patterns)
    matcher(doc)
def read_gold_data(nlp, gold_loc):
    docs = []
    golds = []
    for json_obj in srsly.read_jsonl(gold_loc):
        doc = nlp.make_doc(json_obj["text"])
        ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]]
        gold = GoldParse(doc, entities=ents)
        docs.append(doc)
        golds.append(gold)
    return list(zip(docs, golds))
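# Sketch of the gold JSONL format read_gold_data above expects (spaCy v2 /
# GoldParse era); the record and file name are made up for illustration.
import spacy
import srsly

srsly.write_jsonl("gold.jsonl", [
    {"text": "Apple is looking at buying U.K. startup",
     "spans": [{"start": 0, "end": 5, "label": "ORG"}]},
])
nlp = spacy.blank("en")
doc_gold_pairs = read_gold_data(nlp, "gold.jsonl")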
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    prune_vectors=-1,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
            if freqs_loc:
                settings.append("-f")
            if clusters_loc:
                settings.append("-c")
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs)
    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, prune_vectors)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Successfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
def read_json(path: Path) -> List[Example]:
    """Read annotations in newline-delimited JSON (JSONL) format

    Args:
        path (Path): Path to data

    Returns:
        List[Example]: List of examples
    """
    data = srsly.read_jsonl(path)
    examples = json_to_examples(data)
    return examples
def _main(cfg):
    cfg = parse(cfg)
    if cfg.seed:
        set_seed(cfg.seed)
    org_cwd = hydra.utils.get_original_cwd()
    logger.info(cfg.pretty())
    nlp = cast(TorchLanguage, create_model(cfg.model))
    train_data = list(
        srsly.read_jsonl(os.path.join(org_cwd, cfg.train.data.train)))
    cfg.train.data.ndata = len(train_data)
    val_data = list(srsly.read_jsonl(os.path.join(org_cwd, cfg.train.data.val)))
    logger.info("output dir: {}".format(os.getcwd()))
    if torch.cuda.is_available():
        logger.info("CUDA enabled")
        nlp.to(torch.device("cuda"))
    savedir = Path.cwd() / "models"
    srsly.write_jsonl(Path.cwd() / f"train-data.jsonl", train_data)
    srsly.write_jsonl(Path.cwd() / f"val-data.jsonl", val_data)
    savedir.mkdir(exist_ok=True)
    train(cfg.train, nlp, train_data, val_data, savedir)
def main(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin()
    data_tuples = ((eg["text"], eg) for eg in srsly.read_jsonl(input_path))
    for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
        # doc.cats = {category: 0 for category in CATEGORIES}
        doc.cats[eg["label"]] = 1
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
def load_data(filepath):
    examples = list(srsly.read_jsonl(filepath))
    rows = []
    n_total_ents = 0
    n_no_ents = 0
    labels = set()
    for eg in examples:
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        labels.update([span["label"] for span in row["ents"]])
        rows.append(row)
    return rows, labels, n_total_ents, n_no_ents
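# Example use of load_data above for a quick dataset summary; the file name is
# a placeholder.
rows, labels, n_total_ents, n_no_ents = load_data("annotations.jsonl")
print(f"{len(rows)} texts, {n_total_ents} entities, "
      f"{n_no_ents} texts without entities, labels: {sorted(labels)}")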
def from_disk(self, path, **kwargs):
    """Load the entity ruler from a file. Expects a file containing
    newline-delimited JSON (JSONL) with one entry per line.

    path (unicode / Path): The JSONL file to load.
    **kwargs: Other config parameters, mostly for consistency.
    RETURNS (EntityRuler): The loaded entity ruler.

    DOCS: https://spacy.io/api/entityruler#from_disk
    """
    path = ensure_path(path)
    path = path.with_suffix(".jsonl")
    patterns = srsly.read_jsonl(path)
    self.add_patterns(patterns)
    return self
def _load_file(file_path, msg):
    file_name = file_path.parts[-1]
    if file_path.suffix == ".json":
        with msg.loading("Loading {}...".format(file_name)):
            data = srsly.read_json(file_path)
        msg.good("Loaded {}".format(file_name))
        return data
    elif file_path.suffix == ".jsonl":
        with msg.loading("Loading {}...".format(file_name)):
            data = srsly.read_jsonl(file_path)
        msg.good("Loaded {}".format(file_name))
        return data
    msg.fail(
        "Can't load file extension {}".format(file_path.suffix),
        "Expected .json or .jsonl",
        exits=1,
    )
def read_raw_data(nlp, jsonl_loc):
    for json_obj in srsly.read_jsonl(jsonl_loc):
        if json_obj["text"].strip():
            doc = nlp.make_doc(json_obj["text"])
            yield doc
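# read_raw_data above is a generator; a brief usage sketch. The file name is a
# placeholder, and the raw JSONL is assumed to have a "text" field per line.
import spacy

nlp = spacy.blank("en")
for doc in read_raw_data(nlp, "raw_text.jsonl"):
    pass  # e.g. collect into batches for rehearsal updates during training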
def train( lang, output_path, train_path, dev_path, raw_text=None, base_model=None, pipeline="tagger,parser,ner", vectors=None, n_iter=30, n_early_stopping=None, n_examples=0, use_gpu=-1, version="0.0.0", meta_path=None, init_tok2vec=None, parser_multitasks="", entity_multitasks="", noise_level=0.0, eval_beam_widths="", gold_preproc=False, learn_tokens=False, verbose=False, debug=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ msg = Printer() util.fix_random_seed() util.set_env_log(verbose) # Make sure all files and paths exists if they are needed train_path = util.ensure_path(train_path) dev_path = util.ensure_path(dev_path) meta_path = util.ensure_path(meta_path) output_path = util.ensure_path(output_path) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) if meta_path else {} if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: msg.warn( "Output directory is not empty", "This can lead to unintended side effects when saving the model. " "Please use an empty directory or a different path instead. If " "the specified output path doesn't exist, the directory will be " "created for you.", ) if not output_path.exists(): output_path.mkdir() # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. dropout_rates = util.decaying( util.env_opt("dropout_from", 0.2), util.env_opt("dropout_to", 0.2), util.env_opt("dropout_decay", 0.0), ) batch_sizes = util.compounding( util.env_opt("batch_from", 100.0), util.env_opt("batch_to", 1000.0), util.env_opt("batch_compound", 1.001), ) if not eval_beam_widths: eval_beam_widths = [1] else: eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] if 1 not in eval_beam_widths: eval_beam_widths.append(1) eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. 
pipeline = [p.strip() for p in pipeline.split(",")] msg.text("Training pipeline: {}".format(pipeline)) if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) if nlp.lang != lang: msg.fail( "Model language ('{}') doesn't match language specified as " "`lang` argument ('{}') ".format(nlp.lang, lang), exits=1, ) other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline] nlp.disable_pipes(*other_pipes) for pipe in pipeline: if pipe not in nlp.pipe_names: nlp.add_pipe(nlp.create_pipe(pipe)) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) nlp = lang_cls() for pipe in pipeline: nlp.add_pipe(nlp.create_pipe(pipe)) if learn_tokens: nlp.add_pipe(nlp.create_pipe("merge_subtokens")) if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) # Multitask objectives multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] for pipe_name, multitasks in multitask_options: if multitasks: if pipe_name not in pipeline: msg.fail( "Can't use multitask objective without '{}' in the " "pipeline".format(pipe_name) ) pipe = nlp.get_pipe(pipe_name) for objective in multitasks.split(","): pipe.add_multitask_objective(objective) # Prepare training corpus msg.text("Counting training words (limit={})".format(n_examples)) corpus = GoldCorpus(train_path, dev_path, limit=n_examples) n_train_words = corpus.count_train() if base_model: # Start with an existing model, use default optimizer optimizer = create_default_optimizer(Model.ops) else: # Start with a blank model, call begin_training optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) nlp._optimizer = None # Load in pre-trained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # fmt: off row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"] row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7] if has_beam_widths: row_head.insert(1, "Beam W.") row_widths.insert(1, 7) row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2} # fmt: on print("") msg.row(row_head, **row_settings) msg.row(["-" * width for width in row_settings["widths"]], **row_settings) try: iter_since_best = 0 best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0 ) if raw_text: random.shuffle(raw_text) raw_batches = util.minibatch( (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 ) words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in util.minibatch_by_words(train_docs, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) nlp.update( docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses, ) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. 
raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ("model%d" % i) nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc) ) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, debug) end_time = timer() if use_gpu < 0: gpu_wps = None cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path(epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc) ) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names meta["spacy_version"] = ">=%s" % about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["accuracy"] = scorer.scores else: meta.setdefault("beam_accuracy", {}) meta.setdefault("beam_speed", {}) meta["beam_accuracy"][beam_width] = scorer.scores meta["beam_speed"][beam_width] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) meta_loc = output_path / ("model%d" % i) / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) progress = _get_progress( i, losses, scorer.scores, beam_width=beam_width if has_beam_widths else None, cpu_wps=cpu_wps, gpu_wps=gpu_wps, ) msg.row(progress, **row_settings) # Early stopping if n_early_stopping is not None: current_score = _score_for_model(meta) if current_score < best_score: iter_since_best += 1 else: iter_since_best = 0 best_score = current_score if iter_since_best >= n_early_stopping: msg.text( "Early stopping, best iteration " "is: {}".format(i - iter_since_best) ) msg.text( "Best score = {}; Final iteration " "score = {}".format(best_score, current_score) ) break finally: with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names) msg.good("Created best model", best_model_path)
def pretrain( texts_loc, vectors_model, output_dir, width=96, depth=4, embed_rows=2000, loss_func="cosine", use_vectors=False, dropout=0.2, n_iter=1000, batch_size=3000, max_length=500, min_length=5, seed=0, n_save_every=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate language-modelling objective. Specifically, we load pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the pre-trained ones. The weights are saved to a directory after each epoch. You can then pass a path to one of these pre-trained weights files to the 'spacy train' command. This technique may be especially helpful if you have little labelled data. However, it's still quite experimental, so your mileage may vary. To load the weights back in during 'spacy train', you need to ensure all settings are the same between pretraining and training. The API and errors around this need some improvement. """ config = dict(locals()) msg = Printer() util.fix_random_seed(seed) has_gpu = prefer_gpu() msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() msg.good("Created output directory") srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") # Load texts from file or stdin if texts_loc != "-": # reading from a file texts_loc = Path(texts_loc) if not texts_loc.exists(): msg.fail("Input text file doesn't exist", texts_loc, exits=1) with msg.loading("Loading input texts..."): texts = list(srsly.read_jsonl(texts_loc)) msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin msg.text("Reading input text from stdin...") texts = srsly.read_jsonl("-") with msg.loading("Loading model '{}'...".format(vectors_model)): nlp = util.load_model(vectors_model) msg.good("Loaded model '{}'".format(vectors_model)) pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name model = create_pretraining_model( nlp, Tok2Vec( width, embed_rows, conv_depth=depth, pretrained_vectors=pretrained_vectors, bilstm_depth=0, # Requires PyTorch. Experimental. 
cnn_maxout_pieces=3, # You can try setting this higher subword_features=True, # Set to False for Chinese etc ), ) optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker(frequency=10000) msg.divider("Pre-training tok2vec layer") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) def _save_model(epoch, is_temp=False): is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open( "wb" ) as file_: file_.write(model.tok2vec.to_bytes()) log = { "nr_word": tracker.nr_word, "loss": tracker.loss, "epoch_loss": tracker.epoch_loss, "epoch": epoch, } with (output_dir / "log.jsonl").open("a") as file_: file_.write(srsly.json_dumps(log) + "\n") for epoch in range(n_iter): for batch_id, batch in enumerate( util.minibatch_by_words(((text, None) for text in texts), size=batch_size) ): docs = make_docs( nlp, [text for (text, _) in batch], max_length=max_length, min_length=min_length, ) loss = make_update( model, docs, optimizer, objective=loss_func, drop=dropout ) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break if n_save_every and (batch_id % n_save_every == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 if texts_loc != "-": # Reshuffle the texts if texts were loaded from a file random.shuffle(texts)