def main():
    data = pd.read_csv(args.input_file_path, sep=",")
    columns = list(data.columns)
    output_file_path = args.input_file_path.replace(".csv", ".jsonl", 1)
    randomized_output_file_path = path.join(
        path.dirname(output_file_path),
        "randomized_" + path.basename(output_file_path))
    json_data = []
    for index, row in data.iterrows():
        line_contents = {}  # each dictionary holds all information of a single line
        for column in columns:
            # Replace missing values first so the string cleanup below is never
            # called on NaN.
            if pd.isna(row[column]):
                row[column] = ""
            if column == "text":
                row[column] = row[column].replace('\n', '')
            line_contents[column] = row[column]
        json_data.append(line_contents)
    # If you want a randomized copy of the list:
    json_data_shuffled = random.sample(json_data, len(json_data))
    srsly.write_jsonl(output_file_path, json_data)
    srsly.write_jsonl(randomized_output_file_path, json_data_shuffled)
def main():
    dataset_name = args.dataset_name  # the dataset you want to use
    # with open("settings.json", "r") as read_file:
    #     data = json.load(read_file)
    # Connect to the database using the prodigy.json file (can also be found in Slack):
    # db = connect(data["db"], data["db_settings"])
    # Prodigy automatically uses the settings in the 'prodigy.json' file in this
    # script's directory if run from this directory.
    db = connect()
    # The dataset will be returned as an object
    dataset = db.get_dataset(dataset_name)
    file_ext = "jsonl"  # modify this if you want it saved as a different file format
    out_path = args.output_path  # "./"  # where the dataset will be saved; default is the script's directory
    # Name of the file being saved; we use uuid.uuid4() to avoid overwriting files
    outfile = os.path.join(out_path, f"{dataset_name}_download.{uuid.uuid4()}.{file_ext}")
    # If you're writing it as JSON use .write_json instead; refer to the srsly
    # documentation for other formats, or handle file writing yourself.
    srsly.write_jsonl(outfile, dataset)
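# A minimal sketch (not part of the export script above; the file path is
# hypothetical, and filtering on the "answer" field assumes typical Prodigy
# task dicts) showing how the exported JSONL can be read back with srsly:
import srsly

examples = list(srsly.read_jsonl("my_dataset_download.jsonl"))  # hypothetical export path
accepted = [eg for eg in examples if eg.get("answer") == "accept"]
print(f"{len(accepted)} of {len(examples)} examples were accepted")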
def write_sample_jsonl(tmp_dir):
    data = [
        {
            "meta": {"id": "1"},
            "text": "This is the best TV you'll ever buy!",
            "cats": {"pos": 1, "neg": 0},
        },
        {
            "meta": {"id": "2"},
            "text": "I wouldn't buy this again.",
            "cats": {"pos": 0, "neg": 1},
        },
    ]
    file_path = f"{tmp_dir}/text.jsonl"
    srsly.write_jsonl(file_path, data)
    return file_path
def to_disk(self, output_path: Path, force: bool = False, save_examples: bool = True) -> None:
    """Save Corpus to Disk

    Args:
        output_path (Path): Output file path to save data to
        force (bool): Force save to directory. Create parent directories or
            overwrite existing data.
        save_examples (bool): Save the example store along with the state.
    """
    output_path = ensure_path(output_path)
    output_dir = output_path.parent
    state_dir = output_dir / ".recon" / self.name
    if force:
        output_dir.mkdir(parents=True, exist_ok=True)
        if not state_dir.exists():
            state_dir.mkdir(parents=True, exist_ok=True)
    ds_op_state = DatasetOperationsState(
        name=self.name, commit=self.commit_hash, size=len(self), operations=self.operations
    )
    srsly.write_json(state_dir / "state.json", ds_op_state.dict())
    if save_examples:
        self.example_store.to_disk(state_dir / "example_store.jsonl")
    srsly.write_jsonl(output_path, [e.dict() for e in self.data])
def to_disk(self, path, **kwargs):
    """Save the entity ruler patterns to a directory. The patterns will be
    saved as newline-delimited JSON (JSONL).

    path (unicode / Path): The JSONL file to save.
    **kwargs: Other config parameters, mostly for consistency.

    DOCS: https://spacy.io/api/entityruler#to_disk
    """
    path = ensure_path(path)
    cfg = {
        "overwrite": self.overwrite,
        "phrase_matcher_attr": self.phrase_matcher_attr,
        "ent_id_sep": self.ent_id_sep,
    }
    serializers = {
        "patterns": lambda p: srsly.write_jsonl(p.with_suffix(".jsonl"), self.patterns),
        "cfg": lambda p: srsly.write_json(p, cfg),
    }
    if path.suffix == ".jsonl":  # user wants to save only JSONL
        srsly.write_jsonl(path, self.patterns)
    else:
        to_disk(path, serializers, {})
def convert_mishnah_and_tosefta_to_mentions(tractate_prefix, in_file, out_file1, out_file2, vtitle, title_map=None):
    import json
    title_map = title_map or {}
    mentions = []
    crude_mentions = []
    issues = 0
    with open(in_file, "r") as fin:
        c = csv.DictReader(fin)
        for row in c:
            row["Tractate"] = title_map.get(row["Tractate"], row["Tractate"])
            tref = f'{row["Tractate"]} {row["Chapter"]}:{row["Number"]}'
            if not row['Tractate'].startswith('Pirkei Avot'):
                tref = tractate_prefix + tref
            oref = Ref(tref)
            context = row["Context"]
            crude_mentions += [{
                "Book": oref.index.title,
                "Segment": oref.normal(),
                "Bonayich ID": row["rabbi_id"],
                "Context": context,
            }]
    print("Issues", issues)
    spacy_formatted, rabbi_mentions = convert_to_spacy_format(
        crude_mentions,
        vtitle=vtitle,
        norm_regex="[,\\-:;\u0591-\u05bd\u05bf-\u05c5\u05c7]+",
        repl='',
        daf_skips=0,
        rashi_skips=0,
        overall=0)
    srsly.write_jsonl(out_file1, rabbi_mentions)
    convert_to_mentions_file(out_file1, out_file2, only_bonayich_rabbis=False)
    with open(f'{out_file2}', 'r') as fin:
        j = json.load(fin)
    with open(f'{DATA_LOC}/../sefaria/{out_file2}', 'w') as fout:
        json.dump(j, fout, indent=2, ensure_ascii=False)
def to_patterns(dataset, spacy_model, label, output_file="-", case_sensitive=False, dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to token-based
    match patterns that can be used with spaCy's EntityRuler or recipes like
    ner.match. If no output file is specified, the patterns are written to
    stdout. The examples are tokenized so that multi-token terms are
    represented correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        raise ValueError(f"Can't find dataset '{dataset}'")
    examples = DB.get_dataset(dataset)
    terms = [eg["text"] for eg in examples if eg["answer"] == "accept"]
    # Each pattern is a list of token dicts, so multi-token terms match correctly
    if case_sensitive:
        patterns = [[{"text": t.text} for t in nlp.make_doc(term)] for term in terms]
    else:
        patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)] for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns
def to_patterns(dataset=None, label=None, output_file=None):
    """
    Convert a list of seed phrases to a list of match patterns that can be used
    with ner.match. If no output file is specified, each pattern is printed so
    the recipe's output can be piped forward to ner.match.

    This is pretty much an exact copy of terms.to-patterns. The pattern for
    each example is just split on whitespace, so instead of:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]}

    which won't match anything, you'll get:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    if label is None:
        prints(
            "--label is a required argument",
            "This is the label that will be assigned to all patterns "
            "created from terms collected in this dataset. ",
            exits=1,
            error=True,
        )

    DB = connect()

    def get_pattern(term, label):
        return {
            "label": label,
            "pattern": [{"lower": t.lower()} for t in term["text"].split()],
        }

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    if dataset is None:
        log("RECIPE: Reading input terms from sys.stdin")
        terms = (srsly.json_loads(line) for line in sys.stdin)
    else:
        if dataset not in DB:
            prints("Can't find dataset '{}'".format(dataset), exits=1, error=True)
        terms = DB.get_dataset(dataset)
        log("RECIPE: Reading {} input phrases from dataset {}".format(len(terms), dataset))
    if output_file:
        patterns = [
            get_pattern(term, label) for term in terms if term["answer"] == "accept"
        ]
        log("RECIPE: Generated {} patterns".format(len(patterns)))
        srsly.write_jsonl(output_file, patterns)
        prints("Exported {} patterns".format(len(patterns)), output_file)
    else:
        log("RECIPE: Outputting patterns")
        for term in terms:
            if term["answer"] == "accept":
                print(srsly.json_dumps(get_pattern(term, label)))
def jsonl_writer(type_, id_, iter):
    path = os.path.join(
        output_dir,
        generate_git_export_file_name("jsonl", customer_id, source_id, id_, type_),
    )
    srsly.write_jsonl(path, iter)
def make_raw_data(jsonl_loc):
    categories = ['Tanakh', 'Mishnah']
    books = [
        'Midrash Tanchuma', 'Pirkei DeRabbi Eliezer', 'Sifra', 'Sifrei Bamidbar', 'Sifrei Devarim',
        'Mishneh Torah, Foundations of the Torah', 'Mishneh Torah, Human Dispositions',
        'Mishneh Torah, Reading the Shema', 'Mishneh Torah, Sabbath', "Avot D'Rabbi Natan",
        'Guide for the Perplexed', 'Nineteen Letters', 'Collected Responsa in Wartime',
        'Contemporary Halakhic Problems, Vol I', 'Contemporary Halakhic Problems, Vol II',
        'Contemporary Halakhic Problems, Vol III', 'Contemporary Halakhic Problems, Vol IV',
        'Depths of Yonah', 'Likutei Moharan', 'Kedushat Levi', 'Messilat Yesharim',
        'Orchot Tzadikim', 'Shemirat HaLashon',
    ]
    for cat in categories:
        books += library.get_indexes_in_category(cat)
    data = []
    for b in tqdm(books):
        i = library.get_index(b)
        default_en = None
        for v in i.versionSet():
            if v.language == 'en':
                default_en = v
                break
        if default_en is None:
            continue

        def action(data, temp_text, tref, heTref, self):
            data += [normalize_text('en', temp_text)]

        default_en.walk_thru_contents(partial(action, data))
    srsly.write_jsonl(jsonl_loc, data)
def to_disk(
    self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
    """Save the entity ruler patterns to a directory. The patterns will be
    saved as newline-delimited JSON (JSONL).

    path (str / Path): The JSONL file to save.

    DOCS: https://spacy.io/api/entityruler#to_disk
    """
    path = ensure_path(path)
    cfg = {
        "overwrite": self.overwrite,
        "phrase_matcher_attr": self.phrase_matcher_attr,
        "ent_id_sep": self.ent_id_sep,
    }
    serializers = {
        "patterns": lambda p: srsly.write_jsonl(
            p.with_suffix(".jsonl"), self.patterns
        ),
        "cfg": lambda p: srsly.write_json(p, cfg),
    }
    if path.suffix == ".jsonl":  # user wants to save only JSONL
        srsly.write_jsonl(path, self.patterns)
    else:
        to_disk(path, serializers, {})
def convert(
    input_file,
    output_dir="-",
    file_type="jsonl",
    n_sents=1,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with the train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe it forward to a JSONL file:
    $ spacy convert some_file.conllu > some_file.jsonl
    """
    msg = Printer()
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
    input_data = input_path.open("r", encoding="utf-8").read()
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
def convert_training_to_displacy(jsonl_loc):
    out = []
    for text, tags in srsly.read_jsonl(jsonl_loc):
        out += [{
            'text': text,
            'ents': sorted([{'start': s, 'end': e, 'label': l} for s, e, l in tags['entities']],
                           key=lambda x: x['start'])
        }]
    srsly.write_jsonl(jsonl_loc + '.displacy', out)
def combine():
    total_he = 0
    total_combined = 0
    he_data = srsly.read_jsonl(f"{DATA_LOC}/he_mentions.jsonl")
    en_data = srsly.read_jsonl(f"{DATA_LOC}/en_mentions.jsonl")
    he_ref_map = defaultdict(list)
    en_ref_map = defaultdict(list)
    for he_row in he_data:
        he_ref_map[he_row["Ref"]] += [he_row]
    for en_row in en_data:
        en_ref_map[en_row["Ref"]] += [en_row]
    combined_data = []
    missing_data = []
    for tref, he_rows in he_ref_map.items():
        en_rows = en_ref_map[he_rows[0]["Ref"]]
        he_ids = {int(he_row["Bonayich ID"]) for he_row in he_rows}
        new_row = {
            "Book": he_rows[0]["Book"],
            "Ref": he_rows[0]["Ref"],
            "He Mentions": [{
                "Start": he_row["Start"],
                "End": he_row["End"],
                "Bonayich ID": int(he_row["Bonayich ID"]),
                "Mention": he_row["Mention"],
            } for he_row in he_rows],
            "En Mentions": [{
                "Start": en_row["Start"],
                "End": en_row["End"],
                "Bonayich ID": int(en_row["Bonayich ID"]) if en_row["Bonayich ID"] is not None else None,
                "Mention": en_row["Mention"],
            } for en_row in en_rows],
        }
        new_row["En Mentions Filtered"] = list(
            filter(lambda x: x["Bonayich ID"] in he_ids, new_row["En Mentions"]))
        en_filtered_ids = {
            int(he_row["Bonayich ID"]) for he_row in new_row["En Mentions Filtered"]
        }
        new_row["He Mentions Filtered"] = list(
            filter(lambda x: x["Bonayich ID"] in en_filtered_ids, new_row["He Mentions"]))
        total_he += len(new_row["He Mentions"])
        total_combined += len(new_row["En Mentions Filtered"])
        if len(new_row["He Mentions"]) > len(new_row["En Mentions Filtered"]):
            missing_data += [new_row]
        combined_data += [new_row]
    srsly.write_jsonl(f"{DATA_LOC}/combined_mentions.jsonl", combined_data)
    # Note: despite the .jsonl name, the missing mentions are written as a
    # single indented JSON array.
    with open(f"{DATA_LOC}/missing_mentions.jsonl", "w") as fout:
        json.dump(missing_data, fout, ensure_ascii=False, indent=2)
    print(total_he, total_combined)
def make_prodigy_input_by_refs(ref_list, lang, vtitle):
    walker = ProdigyInputWalker([])
    input_list = []
    for tref in ref_list:
        oref = Ref(tref)
        text = walker.normalizer.normalize(oref.text(lang, vtitle=vtitle).text)
        temp_input_list = walker.get_input(text, tref, lang)
        input_list += temp_input_list
    srsly.write_jsonl('data/test_input.jsonl', input_list)
def create_data(cfg: Config) -> Tuple[InputData, InputData]:
    data = list(srsly.read_jsonl(Path(cfg.path).expanduser()))
    if cfg.ndata > 0:
        data = random.sample(data, k=cfg.ndata)
    else:
        cfg.ndata = len(data)
    train, val = train_test_split(data, test_size=cfg.val_size)
    srsly.write_jsonl(Path.cwd() / "train-data.jsonl", train)
    srsly.write_jsonl(Path.cwd() / "val-data.jsonl", val)
    return train, val
def main():
    text = input()
    MAX_LENGTH = 100
    lines = []
    while text:
        length = random.randint(1, min(MAX_LENGTH, len(text)))
        cur, text = text[:length], text[length:]
        ners = gen_ner_span(length)
        lines.append([cur, {"entities": ners}])
    # "-" writes the JSONL lines to stdout
    srsly.write_jsonl("-", lines)
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite
def to_disk(self, path: Path) -> None:
    """Save store to disk

    Args:
        path (Path): Path to save store to
    """
    path = ensure_path(path)
    examples = []
    for example_hash, example in self._map.items():
        examples.append({"example_hash": example_hash, "example": example.dict()})
    srsly.write_jsonl(path, examples)
def to_disk(self, path, **kwargs):
    """Save the entity ruler patterns to a directory. The patterns will be
    saved as newline-delimited JSON (JSONL).

    path (unicode / Path): The JSONL file to save.
    **kwargs: Other config parameters, mostly for consistency.

    DOCS: https://spacy.io/api/entityruler
    """
    path = ensure_path(path)
    path = path.with_suffix(".jsonl")
    srsly.write_jsonl(path, self.patterns)
def to_disk(self, path, **kwargs):
    """Save the entity ruler patterns to a directory. The patterns will be
    saved as newline-delimited JSON (JSONL).

    path (unicode / Path): The JSONL file to save.
    **kwargs: Other config parameters, mostly for consistency.

    DOCS: https://spacy.io/api/entityruler#to_disk
    """
    path = ensure_path(path)
    path = path.with_suffix(".jsonl")
    srsly.write_jsonl(path, self.patterns)
def tag_all(self, start=0, end=None, category='Bavli'):
    talmud = library.get_indexes_in_category(category, full_records=True)
    training = []
    mentions = []
    for mes in tqdm(talmud[start:end], desc='Books'):
        temp_training, temp_mentions = self.tag_index(mes)
        training += temp_training
        mentions += temp_mentions
    srsly.write_jsonl(
        '/home/nss/sefaria/datasets/ner/michael-sperling/en_training.jsonl', training)
    srsly.write_jsonl(
        '/home/nss/sefaria/datasets/ner/michael-sperling/en_mentions.jsonl', mentions)
def _build_patterns(self, skills: dict, create: bool = False):
    """Build all matcher patterns"""
    patterns_path = self.data_path / "skill_patterns.jsonl"
    if not patterns_path.exists() or create:
        # Build up lists of spaCy token patterns for the matcher
        patterns = []
        split_tokens = [".", "/", "-"]
        for skill_id, skill_info in skills.items():
            aliases = skill_info['aliases']
            sources = skill_info['sources']
            skill_names = set()
            for al in aliases:
                skill_names.add(al)
            for source in sources:
                if "displayName" in source:
                    skill_names.add(source["displayName"])
            for name in skill_names:
                if name.upper() == name:
                    skill_name = name
                else:
                    skill_name = name.lower().strip()
                if skill_name not in STOP_WORDS:
                    pattern = self._skill_pattern(skill_name)
                    if pattern:
                        label = f"SKILL|{skill_id}"
                        patterns.append({"label": label, "pattern": pattern})
                        # Also add variant patterns for names containing split
                        # tokens like "." or "/"
                        for t in split_tokens:
                            if t in skill_name:
                                patterns.append({
                                    "label": label,
                                    "pattern": self._skill_pattern(skill_name, t),
                                })
        srsly.write_jsonl(patterns_path, patterns)
        return patterns
    else:
        patterns = srsly.read_jsonl(patterns_path)
        return patterns
def convert(
    lang: str = "en",
    input_path: Path = Path("../assets/docs_doctypes_all.xlsx"),
    sheet_name: str = "Sheet1",
    output_path: Path = Path("../assets/docs_doctypes_all.jsonl"),
    append: bool = False,
):
    # Read the Excel document
    df = pandas.read_excel(input_path, sheet_name=sheet_name)
    # Convert the Excel rows to a list of JSON records
    as_json = df.to_json(orient="records")
    json_input = json.loads(as_json)
    # print(type(json_input))
    srsly.write_jsonl(path=output_path, lines=json_input, append=append, append_new_line=True)
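# A small sketch of the append behaviour used above (the file name is
# hypothetical): calling srsly.write_jsonl with append=True adds records to an
# existing JSONL file instead of overwriting it, and append_new_line controls
# whether a newline is written before the appended records.
import srsly

srsly.write_jsonl("doctypes.jsonl", [{"id": 1}, {"id": 2}])
srsly.write_jsonl("doctypes.jsonl", [{"id": 3}], append=True, append_new_line=False)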
def to_patterns(dataset, spacy_model, label, output_file="-", case_sensitive=False, dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to token-based
    match patterns that can be used with spaCy's EntityRuler or recipes like
    ner.match. If no output file is specified, the patterns are written to
    stdout. The examples are tokenized so that multi-token terms are
    represented correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}

    For tokenization, you can either pass in the name of a spaCy model (e.g. if
    you're using a model with custom tokenization), or "blank:" plus the
    language code you want to use, e.g. blank:en or blank:de. Make sure to use
    the same language / tokenizer you're planning to use at runtime – otherwise
    your patterns may not match.
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    if spacy_model.startswith("blank:"):
        nlp = spacy.blank(spacy_model.replace("blank:", ""))
    else:
        nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)
    examples = DB.get_dataset(dataset)
    terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"])
    if case_sensitive:
        # Preserve the original casing of each token
        patterns = [[{"text": t.text} for t in nlp.make_doc(term)] for term in terms]
    else:
        terms = set([word.lower() for word in terms])
        patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)] for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns
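# A minimal usage sketch (file name and example text are hypothetical): loading
# the JSONL patterns generated by the recipe above into spaCy's EntityRuler,
# assuming spaCy v3 where the ruler is added with nlp.add_pipe("entity_ruler").
import spacy
import srsly

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(list(srsly.read_jsonl("shoe_patterns.jsonl")))
doc = nlp("I just bought new balance sneakers")
print([(ent.text, ent.label_) for ent in doc.ents])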
def test_issue_3526_3(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite
def to_disk(self: SpaczzRuler, path: Union[str, Path], **kwargs: Any) -> None:
    """Save the spaczz ruler patterns to a directory.

    The patterns will be saved as newline-delimited JSON (JSONL).

    Args:
        path: The JSONL file to save.
        **kwargs: Other config parameters, mostly for consistency.

    Example:
        >>> import os
        >>> import tempfile
        >>> import spacy
        >>> from spaczz.pipeline import SpaczzRuler
        >>> nlp = spacy.blank("en")
        >>> ruler = SpaczzRuler(nlp)
        >>> ruler.add_patterns([{"label": "AUTHOR", "pattern": "Kerouac", "type": "fuzzy"}])
        >>> with tempfile.TemporaryDirectory() as tmpdir:
        >>>     ruler.to_disk(f"{tmpdir}/ruler")
        >>>     isdir = os.path.isdir(f"{tmpdir}/ruler")
        >>> isdir
        True
    """
    path = ensure_path(path)
    cfg = {
        "spaczz_overwrite": self.overwrite,
        "spaczz_defaults": self.defaults,
        "spaczz_ent_id_sep": self.ent_id_sep,
    }
    serializers = {
        "spaczz_patterns": lambda p: srsly.write_jsonl(p.with_suffix(".jsonl"), self.patterns),
        "cfg": lambda p: srsly.write_json(p, cfg),
    }
    if path.suffix == ".jsonl":  # user wants to save only JSONL
        srsly.write_jsonl(path, self.patterns)
    else:
        write_to_disk(path, serializers, {})
def save(self):
    os.makedirs(self.data_dir, exist_ok=True)
    self.tokenizer.save_pretrained(self.data_dir)
    with open(self.classes_path, "w") as out_fp:
        json.dump(self.label_to_id, out_fp)
    with open(self.dataset_sizes_path, "w") as out_fp:
        json.dump({section: len(texts) for section, texts in self.texts.items()}, out_fp)
    for section, texts in self.texts.items():
        if section == "train":
            # sort documents by the number of sentences for faster training
            texts = sorted(texts, key=lambda x: len(x["sentences"]), reverse=True)
        srsly.write_jsonl(os.path.join(self.data_dir, section + ".jsonl"), texts)
def make_prodigy_input(title_list, vtitle_list, lang_list, prev_tagged_refs):
    walker = ProdigyInputWalker(prev_tagged_refs)
    for title, vtitle, lang in tqdm(zip(title_list, vtitle_list, lang_list), total=len(title_list)):
        if vtitle is None:
            version = VersionSet({
                "title": title,
                "language": lang
            }, sort=[("priority", -1)], limit=1).array()[0]
        else:
            version = Version().load({
                "title": title,
                "versionTitle": vtitle,
                "language": lang
            })
        version.walk_thru_contents(walker.action)
    walker.make_final_input(400)
    srsly.write_jsonl('data/test_input.jsonl', walker.prodigyInput)
def _main(cfg):
    cfg = parse(cfg)
    if cfg.seed:
        set_seed(cfg.seed)
    org_cwd = hydra.utils.get_original_cwd()
    logger.info(cfg.pretty())
    nlp = cast(TorchLanguage, create_model(cfg.model))
    train_data = list(srsly.read_jsonl(os.path.join(org_cwd, cfg.train.data.train)))
    cfg.train.data.ndata = len(train_data)
    val_data = list(srsly.read_jsonl(os.path.join(org_cwd, cfg.train.data.val)))
    logger.info("output dir: {}".format(os.getcwd()))
    if torch.cuda.is_available():
        logger.info("CUDA enabled")
        nlp.to(torch.device("cuda"))
    savedir = Path.cwd() / "models"
    srsly.write_jsonl(Path.cwd() / "train-data.jsonl", train_data)
    srsly.write_jsonl(Path.cwd() / "val-data.jsonl", val_data)
    savedir.mkdir(exist_ok=True)
    train(cfg.train, nlp, train_data, val_data, savedir)
def make_evaluation_files(evaluation_data, ner_model, output_folder, start=0, lang='he'):
    tp, fp, fn, tn = 0, 0, 0, 0
    data_tuples = [(eg.text, eg) for eg in evaluation_data]
    output_json = []
    # see https://spacy.io/api/language#pipe
    for iexample, (doc, example) in enumerate(tqdm(ner_model.pipe(data_tuples, as_tuples=True))):
        if iexample < start:
            continue
        # correct_ents
        ents_x2y = example.get_aligned_spans_x2y(example.reference.ents)
        correct_ents = {(e.start_char, e.end_char, e.label_) for e in ents_x2y}
        # predicted_ents
        ents_x2y = example.get_aligned_spans_x2y(doc.ents)
        predicted_ents = {(e.start_char, e.end_char, e.label_) for e in ents_x2y}
        # false positives
        temp_fp = [ent for ent in predicted_ents if ent not in correct_ents]
        fp += len(temp_fp)
        # true positives
        temp_tp = [ent for ent in predicted_ents if ent in correct_ents]
        tp += len(temp_tp)
        # false negatives
        temp_fn = [ent for ent in correct_ents if ent not in predicted_ents]
        fn += len(temp_fn)
        # "true negatives": note this is the same set as the true positives;
        # true negatives aren't well defined for span-level NER and tn is not
        # used in the metrics printed below.
        temp_tn = [ent for ent in correct_ents if ent in predicted_ents]
        tn += len(temp_tn)
        output_json += [{
            "text": doc.text,
            "tp": [list(ent) for ent in temp_tp],
            "fp": [list(ent) for ent in temp_fp],
            "fn": [list(ent) for ent in temp_fn],
            "ref": example.predicted.user_data['Ref'],
            "_id": example.predicted.user_data['_id'],
        }]
    srsly.write_jsonl(f"{output_folder}/doc_evaluation.jsonl", output_json)
    make_evaluation_html(output_json, output_folder, 'doc_evaluation.html', lang)
    print('PRECISION', 100 * round(tp / (tp + fp), 4))
    print('RECALL   ', 100 * round(tp / (tp + fn), 4))
    print('F1       ', 100 * round(tp / (tp + 0.5 * (fp + fn)), 4))
    return tp, fp, tn, fn