Example #1
def main():
    data = pd.read_csv(args.input_file_path, sep=",")

    columns = list(data.columns)

    output_file_path = args.input_file_path.replace(".csv", ".jsonl", 1)
    randomized_output_file_path = path.join(
        path.dirname(output_file_path),
        "randomized_" + path.basename(output_file_path))
    json_data = []
    for index, row in data.iterrows():
        line_contents = {}  # each dictionary holds all information of a single line

        for column in columns:
            if pd.isna(row[column]):
                row[column] = ""
            if column == "text":
                row[column] = row[column].replace('\n', '')
            line_contents[column] = row[column]
        json_data.append(line_contents)

    # shuffled copy, in case a randomized version of the list is wanted
    json_data_shuffled = random.sample(json_data, len(json_data))

    srsly.write_jsonl(output_file_path, json_data)
    srsly.write_jsonl(randomized_output_file_path, json_data_shuffled)
Example #2
def main():

    dataset_name = args.dataset_name  # the dataset you want to use

    #with open("settings.json", "r") as read_file:
    #	data = json.load(read_file)

    # Connect to the database using the prodigy.json file (Can also be found in slack)
    #db = connect(data["db"],data["db_settings"])
    # Prodigy automatically uses the settings in the 'prodigy.json' file in
    # this script's directory if the script is run from that directory
    db = connect()

    # The dataset will be returned as an object
    dataset = db.get_dataset(dataset_name)

    file_ext = "jsonl"  # modify this if you want it to be saved as a different file format

    out_path = args.output_path  # location where the dataset will be saved (e.g. "./", the same directory as the script)

    # Name of the file being saved, we use uuid.uuid4() to avoid overwriting files
    outfile = os.path.join(
        out_path, f"{dataset_name}_download.{uuid.uuid4()}.{file_ext}")

    # if you're writing it as JSON, use .write_json instead; refer to the srsly documentation for other formats
    # or handle file writing yourself
    srsly.write_jsonl(outfile, dataset)
Example #3
def write_sample_jsonl(tmp_dir):
    data = [
        {
            "meta": {
                "id": "1"
            },
            "text": "This is the best TV you'll ever buy!",
            "cats": {
                "pos": 1,
                "neg": 0
            },
        },
        {
            "meta": {
                "id": "2"
            },
            "text": "I wouldn't buy this again.",
            "cats": {
                "pos": 0,
                "neg": 1
            },
        },
    ]
    file_path = f"{tmp_dir}/text.jsonl"
    srsly.write_jsonl(file_path, data)
    return file_path
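A minimal round-trip sketch (not part of the scraped example) showing that the file written above can be read back with srsly.read_jsonl, which yields one dict per line; the helper name check_sample_jsonl is made up for illustration.

import srsly

def check_sample_jsonl(tmp_dir):
    file_path = write_sample_jsonl(tmp_dir)  # fixture defined above (Example #3)
    records = list(srsly.read_jsonl(file_path))  # one dict per JSONL line
    assert records[0]["meta"]["id"] == "1"
    assert records[1]["cats"]["neg"] == 1
    return records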
Example #4
    def to_disk(self, output_path: Path, force: bool = False, save_examples: bool = True) -> None:
        """Save Corpus to Disk

        Args:
            output_path (Path): Output file path to save data to
            force (bool): Force save to directory. Create parent directories
                or overwrite existing data.
            save_examples (bool): Save the example store along with the state.
        """
        output_path = ensure_path(output_path)
        output_dir = output_path.parent
        state_dir = output_dir / ".recon" / self.name
        if force:
            output_dir.mkdir(parents=True, exist_ok=True)

            if not state_dir.exists():
                state_dir.mkdir(parents=True, exist_ok=True)

        ds_op_state = DatasetOperationsState(
            name=self.name, commit=self.commit_hash, size=len(self), operations=self.operations
        )
        srsly.write_json(state_dir / "state.json", ds_op_state.dict())

        if save_examples:
            self.example_store.to_disk(state_dir / "example_store.jsonl")

        srsly.write_jsonl(output_path, [e.dict() for e in self.data])
Example #5
    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to save.
        **kwargs: Other config parameters, mostly for consistency.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        cfg = {
            "overwrite": self.overwrite,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns
            ),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            to_disk(path, serializers, {})
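The to_disk method above writes either a single JSONL file or a directory, depending on the path suffix. A minimal usage sketch, assuming the EntityRuler API shown elsewhere on this page; the label and pattern are made up.

import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp, patterns=[{"label": "ORG", "pattern": "Acme"}])
ruler.to_disk("patterns.jsonl")  # ".jsonl" suffix: only the patterns are written
ruler.to_disk("ruler")           # any other path: a directory with patterns.jsonl and cfg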
Example #6
def convert_mishnah_and_tosefta_to_mentions(tractate_prefix, in_file, out_file1, out_file2, vtitle, title_map=None):
    import json
    title_map = title_map or {}
    mentions = []
    crude_mentions = []
    issues = 0
    with open(in_file, "r") as fin:
        c = csv.DictReader(fin)
        for row in c:
            row["Tractate"] = title_map.get(row["Tractate"], row["Tractate"])
            tref = f'{row["Tractate"]} {row["Chapter"]}:{row["Number"]}'
            if not row['Tractate'].startswith('Pirkei Avot'):
                tref = tractate_prefix + tref
            oref = Ref(tref)
            context = row["Context"]
            crude_mentions += [{
                "Book": oref.index.title,
                "Segment": oref.normal(),
                "Bonayich ID": row["rabbi_id"],
                "Context": context
            }]
    print("Issues", issues)

    spacy_formatted, rabbi_mentions = convert_to_spacy_format(crude_mentions, vtitle=vtitle, norm_regex="[,\-:;\u0591-\u05bd\u05bf-\u05c5\u05c7]+", repl='', daf_skips=0, rashi_skips=0, overall=0)
    srsly.write_jsonl(out_file1, rabbi_mentions)
    convert_to_mentions_file(out_file1, out_file2, only_bonayich_rabbis=False)
    with open(f'{out_file2}', 'r') as fin:
        j = json.load(fin)
    with open(f'{DATA_LOC}/../sefaria/{out_file2}', 'w') as fout:
        json.dump(j, fout, indent=2, ensure_ascii=False)
Example #7
def to_patterns(dataset,
                spacy_model,
                label,
                output_file="-",
                case_sensitive=False,
                dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to token-based
    match patterns that can be used with spaCy's EntityRuler or recipes like
    ner.match. If no output file is specified, the patterns are written to
    stdout. The examples are tokenized so that multi-token terms are represented
    correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        raise ValueError(f"Can't find dataset '{dataset}'")
    examples = DB.get_dataset(dataset)
    terms = [eg["text"] for eg in examples if eg["answer"] == "accept"]
    if case_sensitive:
        patterns = [[{"text": t.text} for t in nlp.make_doc(term)]
                    for term in terms]
    else:
        patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)]
                    for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns
Example #8
def to_patterns(dataset=None, label=None, output_file=None):
    """
    Convert a list of seed phrases to a list of match patterns that can be used
    with ner.match. If no output file is specified, each pattern is printed
    so the recipe's output can be piped forward to ner.match.

    This is pretty much an exact copy of terms.to-patterns.
    The pattern for each example is just split on whitespace so instead of:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]}


    which won't match anything you'll get:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    if label is None:
        prints(
            "--label is a required argument",
            "This is the label that will be assigned to all patterns "
            "created from terms collected in this dataset. ",
            exits=1,
            error=True,
        )

    DB = connect()

    def get_pattern(term, label):
        return {
            "label": label,
            "pattern": [{
                "lower": t.lower()
            } for t in term["text"].split()]
        }

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    if dataset is None:
        log("RECIPE: Reading input terms from sys.stdin")
        terms = (srsly.json_loads(line) for line in sys.stdin)
    else:
        if dataset not in DB:
            prints("Can't find dataset '{}'".format(dataset),
                   exits=1,
                   error=True)
        terms = DB.get_dataset(dataset)
        log("RECIPE: Reading {} input phrases from dataset {}".format(
            len(terms), dataset))
    if output_file:
        patterns = [
            get_pattern(term, label) for term in terms
            if term["answer"] == "accept"
        ]
        log("RECIPE: Generated {} patterns".format(len(patterns)))
        srsly.write_jsonl(output_file, patterns)
        prints("Exported {} patterns".format(len(patterns)), output_file)
    else:
        log("RECIPE: Outputting patterns")
        for term in terms:
            if term["answer"] == "accept":
                print(srsly.json_dumps(get_pattern(term, label)))
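For reference, a small sketch of what this recipe's output looks like on disk; the phrases, label, and file name here are invented, and the logic mirrors get_pattern above.

import srsly

terms = [{"text": "new balance", "answer": "accept"},
         {"text": "nike air", "answer": "accept"}]
patterns = [{"label": "SHOE_BRAND",
             "pattern": [{"lower": t.lower()} for t in term["text"].split()]}
            for term in terms if term["answer"] == "accept"]
srsly.write_jsonl("shoe_patterns.jsonl", patterns)
# each line of shoe_patterns.jsonl is one pattern, e.g.
# {"label":"SHOE_BRAND","pattern":[{"lower":"new"},{"lower":"balance"}]}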
Example #9
def jsonl_writer(type_, id_, iter):
    path = os.path.join(
        output_dir,
        generate_git_export_file_name("jsonl", customer_id, source_id, id_,
                                      type_),
    )
    srsly.write_jsonl(path, iter)
Example #10
def make_raw_data(jsonl_loc):
    categories = ['Tanakh', 'Mishnah']
    books = ['Midrash Tanchuma', 'Pirkei DeRabbi Eliezer', 'Sifra', 'Sifrei Bamidbar', 'Sifrei Devarim',
             'Mishneh Torah, Foundations of the Torah', 'Mishneh Torah, Human Dispositions',
             'Mishneh Torah, Reading the Shema', 'Mishneh Torah, Sabbath', 'Avot D\'Rabbi Natan',
             'Guide for the Perplexed', 'Nineteen Letters', 'Collected Responsa in Wartime',
             'Contemporary Halakhic Problems, Vol I', 'Contemporary Halakhic Problems, Vol II',
             'Contemporary Halakhic Problems, Vol III', 'Contemporary Halakhic Problems, Vol IV', 'Depths of Yonah',
             'Likutei Moharan', 'Kedushat Levi', 'Messilat Yesharim', 'Orchot Tzadikim', 'Shemirat HaLashon']
    for cat in categories:
        books += library.get_indexes_in_category(cat)
    data = []
    for b in tqdm(books):
        i = library.get_index(b)
        default_en = None
        for v in i.versionSet():
            if v.language == 'en':
                default_en = v
                break
        if default_en is None:
            continue

        def action(data, temp_text, tref, heTref, self):
            data += [normalize_text('en', temp_text)]
        default_en.walk_thru_contents(partial(action, data))
    srsly.write_jsonl(jsonl_loc, data)
Example #11
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (str / Path): The JSONL file to save.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        cfg = {
            "overwrite": self.overwrite,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns
            ),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            to_disk(path, serializers, {})
Example #12
def convert(
    input_file,
    output_dir="-",
    file_type="jsonl",
    n_sents=1,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe them forward to a JSONL file:
    $ spacy convert some_file.conllu > some_file.jsonl
    """
    msg = Printer()
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
    input_data = input_path.open("r", encoding="utf-8").read()
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
Example #13
def convert_training_to_displacy(jsonl_loc):
    out = []
    for text, tags in srsly.read_jsonl(jsonl_loc):
        out += [{
            'text': text,
            'ents': sorted([{'start': s, 'end': e, 'label': l} for s, e, l in tags['entities']], key=lambda x: x['start'])
        }]
    srsly.write_jsonl(jsonl_loc + '.displacy', out)
Example #14
def combine():
    total_he = 0
    total_combined = 0
    he_data = srsly.read_jsonl(f"{DATA_LOC}/he_mentions.jsonl")
    en_data = srsly.read_jsonl(f"{DATA_LOC}/en_mentions.jsonl")
    he_ref_map = defaultdict(list)
    en_ref_map = defaultdict(list)
    for he_row in he_data:
        he_ref_map[he_row["Ref"]] += [he_row]
    for en_row in en_data:
        en_ref_map[en_row["Ref"]] += [en_row]
    combined_data = []
    missing_data = []
    for tref, he_rows in he_ref_map.items():
        en_rows = en_ref_map[he_rows[0]["Ref"]]
        he_ids = {int(he_row["Bonayich ID"]) for he_row in he_rows}
        new_row = {
            "Book": he_rows[0]["Book"],
            "Ref": he_rows[0]["Ref"],
            "He Mentions": [{
                "Start": he_row["Start"],
                "End": he_row["End"],
                "Bonayich ID": int(he_row["Bonayich ID"]),
                "Mention": he_row["Mention"]
            } for he_row in he_rows],
            "En Mentions": [{
                "Start": en_row["Start"],
                "End": en_row["End"],
                "Bonayich ID": int(en_row["Bonayich ID"])
                if en_row["Bonayich ID"] is not None else None,
                "Mention": en_row["Mention"]
            } for en_row in en_rows],
        }
        new_row["En Mentions Filtered"] = list(
            filter(lambda x: x["Bonayich ID"] in he_ids,
                   new_row["En Mentions"]))
        en_filtered_ids = {
            int(he_row["Bonayich ID"])
            for he_row in new_row["En Mentions Filtered"]
        }
        new_row["He Mentions Filtered"] = list(
            filter(lambda x: x["Bonayich ID"] in en_filtered_ids,
                   new_row["He Mentions"]))
        total_he += len(new_row["He Mentions"])
        total_combined += len(new_row["En Mentions Filtered"])
        if len(new_row["He Mentions"]) > len(new_row["En Mentions Filtered"]):
            missing_data += [new_row]
        combined_data += [new_row]
    srsly.write_jsonl(f"{DATA_LOC}/combined_mentions.jsonl", combined_data)
    with open(f"{DATA_LOC}/missing_mentions.jsonl", "w") as fout:
        json.dump(missing_data, fout, ensure_ascii=False, indent=2)
    print(total_he, total_combined)
Example #15
def make_prodigy_input_by_refs(ref_list, lang, vtitle):
    walker = ProdigyInputWalker([])
    input_list = []
    for tref in ref_list:
        oref = Ref(tref)
        text = walker.normalizer.normalize(oref.text(lang, vtitle=vtitle).text)
        temp_input_list = walker.get_input(text, tref, lang)
        input_list += temp_input_list
    srsly.write_jsonl('data/test_input.jsonl', input_list)
Example #16
def create_data(cfg: Config) -> Tuple[InputData, InputData]:
    data = list(srsly.read_jsonl(Path(cfg.path).expanduser()))
    if cfg.ndata > 0:
        data = random.sample(data, k=cfg.ndata)
    else:
        cfg.ndata = len(data)
    train, val = train_test_split(data, test_size=cfg.val_size)
    srsly.write_jsonl(Path.cwd() / f"train-data.jsonl", train)
    srsly.write_jsonl(Path.cwd() / f"val-data.jsonl", val)
    return train, val
Example #17
def main():
    text = input()
    MAX_LENGTH = 100
    lines = []
    while text:
        length = random.randint(1, min(MAX_LENGTH, len(text)))
        cur, text = text[:length], text[length:]
        ners = gen_ner_span(length)
        lines.append([cur, {"entities": ners}])
    srsly.write_jsonl("-", lines)
Example #18
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite
Example #19
    def to_disk(self, path: Path) -> None:
        """Save store to disk
        
        Args:
            path (Path): Path to save store to
        """
        path = ensure_path(path)
        examples = []
        for example_hash, example in self._map.items():
            examples.append({"example_hash": example_hash, "example": example.dict()})

        srsly.write_jsonl(path, examples)
Example #20
    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to save.
        **kwargs: Other config parameters, mostly for consistency.

        DOCS: https://spacy.io/api/entityruler
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        srsly.write_jsonl(path, self.patterns)
Example #21
    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to save.
        **kwargs: Other config parameters, mostly for consistency.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        srsly.write_jsonl(path, self.patterns)
Example #22
    def tag_all(self, start=0, end=None, category='Bavli'):
        talmud = library.get_indexes_in_category(category, full_records=True)
        training = []
        mentions = []
        for mes in tqdm(talmud[start:end], desc='Books'):
            temp_training, temp_mentions = self.tag_index(mes)
            training += temp_training
            mentions += temp_mentions
        srsly.write_jsonl(
            '/home/nss/sefaria/datasets/ner/michael-sperling/en_training.jsonl',
            training)
        srsly.write_jsonl(
            '/home/nss/sefaria/datasets/ner/michael-sperling/en_mentions.jsonl',
            mentions)
Example #23
    def _build_patterns(self, skills: list, create: bool = False):
        """Build all matcher patterns"""
        patterns_path = self.data_path / "skill_patterns.jsonl"
        if not patterns_path.exists() or create:
            """Build up lists of spacy token patterns for matcher"""
            patterns = []
            split_tokens = [".", "/", "-"]

            for skill_id, skill_info in skills.items():
                aliases = skill_info['aliases']
                sources = skill_info['sources']
                skill_names = set()
                for al in aliases:
                    skill_names.add(al)
                for source in sources:
                    if "displayName" in source:
                        skill_names.add(source["displayName"])

                for name in skill_names:
                    if name.upper() == name:
                        skill_name = name
                    else:
                        skill_name = name.lower().strip()

                    if skill_name not in STOP_WORDS:
                        pattern = self._skill_pattern(skill_name)

                        if pattern:
                            label = f"SKILL|{skill_id}"
                            patterns.append({
                                "label": label,
                                "pattern": pattern
                            })

                            for t in split_tokens:
                                if t in skill_name:
                                    patterns.append({
                                        "label": label,
                                        "pattern": self._skill_pattern(skill_name, t),
                                    })

            srsly.write_jsonl(patterns_path, patterns)
            return patterns
        else:
            patterns = srsly.read_jsonl(patterns_path)
            return patterns
Example #24
def convert(
    lang: str = "en",
    input_path: Path = Path("../assets/docs_doctypes_all.xlsx"),
    sheet_name: str = "Sheet1",
    output_path: Path = Path("../assets/docs_doctypes_all.jsonl"),
    append: bool = False,
):
    # Read excel document
    df = pandas.read_excel(input_path, sheet_name=sheet_name)
    # Convert the excel data to JSON
    as_json = df.to_json(orient="records")
    json_input = json.loads(as_json)
    # print(type(json_input))
    srsly.write_jsonl(path=output_path,
                      lines=json_input,
                      append=append,
                      append_new_line=True)
Example #25
def to_patterns(dataset,
                spacy_model,
                label,
                output_file="-",
                case_sensitive=False,
                dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to token-based
    match patterns that can be used with spaCy's EntityRuler or recipes like
    ner.match. If no output file is specified, the patterns are written to
    stdout. The examples are tokenized so that multi-token terms are represented
    correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}

    For tokenization, you can either pass in the name of a spaCy model (e.g. if
    you're using a model with custom tokenization), or "blank:" plus the
    language code you want to use, e.g. blank:en or blank:de. Make sure to use
    the same language / tokenizer you're planning to use at runtime – otherwise
    your patterns may not match.
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    if spacy_model.startswith("blank:"):
        nlp = spacy.blank(spacy_model.replace("blank:", ""))
    else:
        nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)
    examples = DB.get_dataset(dataset)
    terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"])
    if case_sensitive:
        patterns = [[{"text": t.text} for t in nlp.make_doc(term)]
                    for term in terms]
    else:
        terms = set([word.lower() for word in terms])
        patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)]
                    for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns
Example #26
def test_issue_3526_3(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite
Example #27
    def to_disk(self: SpaczzRuler, path: Union[str, Path],
                **kwargs: Any) -> None:
        """Save the spaczz ruler patterns to a directory.

        The patterns will be saved as newline-delimited JSON (JSONL).

        Args:
            path: The JSONL file to save.
            **kwargs: Other config parameters, mostly for consistency.

        Example:
            >>> import os
            >>> import tempfile
            >>> import spacy
            >>> from spaczz.pipeline import SpaczzRuler
            >>> nlp = spacy.blank("en")
            >>> ruler = SpaczzRuler(nlp)
            >>> ruler.add_patterns([{"label": "AUTHOR", "pattern": "Kerouac",
                "type": "fuzzy"}])
            >>> with tempfile.TemporaryDirectory() as tmpdir:
            >>>     ruler.to_disk(f"{tmpdir}/ruler")
            >>>     isdir = os.path.isdir(f"{tmpdir}/ruler")
            >>> isdir
            True
        """
        path = ensure_path(path)
        cfg = {
            "spaczz_overwrite": self.overwrite,
            "spaczz_defaults": self.defaults,
            "spaczz_ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "spaczz_patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns
            ),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            write_to_disk(path, serializers, {})
Example #28
    def save(self):
        os.makedirs(self.data_dir, exist_ok=True)
        self.tokenizer.save_pretrained(self.data_dir)

        with open(self.classes_path, "w") as out_fp:
            json.dump(self.label_to_id, out_fp)

        with open(self.dataset_sizes_path, "w") as out_fp:
            json.dump(
                {section: len(texts)
                 for section, texts in self.texts.items()}, out_fp)

        for section, texts in self.texts.items():
            if section == "train":
                # sort documents by the number of sentences for faster training
                texts = sorted(texts,
                               key=lambda x: len(x["sentences"]),
                               reverse=True)
            srsly.write_jsonl(os.path.join(self.data_dir, section + ".jsonl"),
                              texts)
Example #29
def make_prodigy_input(title_list, vtitle_list, lang_list, prev_tagged_refs):
    walker = ProdigyInputWalker(prev_tagged_refs)
    for title, vtitle, lang in tqdm(zip(title_list, vtitle_list, lang_list),
                                    total=len(title_list)):
        if vtitle is None:
            version = VersionSet({
                "title": title,
                "language": lang
            },
                                 sort=[("priority", -1)],
                                 limit=1).array()[0]
        else:
            version = Version().load({
                "title": title,
                "versionTitle": vtitle,
                "language": lang
            })
        version.walk_thru_contents(walker.action)
    walker.make_final_input(400)
    srsly.write_jsonl('data/test_input.jsonl', walker.prodigyInput)
Example #30
def _main(cfg):
    cfg = parse(cfg)
    if cfg.seed:
        set_seed(cfg.seed)
    org_cwd = hydra.utils.get_original_cwd()
    logger.info(cfg.pretty())
    nlp = cast(TorchLanguage, create_model(cfg.model))
    train_data = list(
        srsly.read_jsonl(os.path.join(org_cwd, cfg.train.data.train)))
    cfg.train.data.ndata = len(train_data)
    val_data = list(srsly.read_jsonl(os.path.join(org_cwd,
                                                  cfg.train.data.val)))
    logger.info("output dir: {}".format(os.getcwd()))
    if torch.cuda.is_available():
        logger.info("CUDA enabled")
        nlp.to(torch.device("cuda"))
    savedir = Path.cwd() / "models"
    srsly.write_jsonl(Path.cwd() / f"train-data.jsonl", train_data)
    srsly.write_jsonl(Path.cwd() / f"val-data.jsonl", val_data)
    savedir.mkdir(exist_ok=True)
    train(cfg.train, nlp, train_data, val_data, savedir)
Example #31
def make_evaluation_files(evaluation_data, ner_model, output_folder, start=0, lang='he'):
    tp,fp,fn,tn = 0,0,0,0
    data_tuples = [(eg.text, eg) for eg in evaluation_data]
    output_json = []
    # see https://spacy.io/api/language#pipe
    for iexample, (doc, example) in enumerate(tqdm(ner_model.pipe(data_tuples, as_tuples=True))):
        if iexample < start: continue
        # correct_ents
        ents_x2y = example.get_aligned_spans_x2y(example.reference.ents)
        correct_ents = {(e.start_char, e.end_char, e.label_) for e in ents_x2y}
        # predicted_ents
        ents_x2y = example.get_aligned_spans_x2y(doc.ents)
        predicted_ents = {(e.start_char, e.end_char, e.label_) for e in ents_x2y}
        # false positives
        temp_fp = [ent for ent in predicted_ents if ent not in correct_ents]
        fp += len(temp_fp)
        # true positives
        temp_tp = [ent for ent in predicted_ents if ent in correct_ents]
        tp += len(temp_tp)
        # false negatives
        temp_fn = [ent for ent in correct_ents if ent not in predicted_ents]
        fn += len(temp_fn)
        # true negatives
        temp_tn = [ent for ent in correct_ents if ent in predicted_ents]
        tn += len(temp_tn)
        output_json += [{
            "text": doc.text,
            "tp": [list(ent) for ent in temp_tp],
            "fp": [list(ent) for ent in temp_fp],
            "fn": [list(ent) for ent in temp_fn],
            "ref": example.predicted.user_data['Ref'],
            "_id": example.predicted.user_data['_id'],
        }]
    
    srsly.write_jsonl(f"{output_folder}/doc_evaluation.jsonl", output_json)
    make_evaluation_html(output_json, output_folder, 'doc_evaluation.html', lang)
    print('PRECISION', 100*round(tp/(tp+fp), 4))
    print('RECALL   ', 100*round(tp/(tp+fn), 4))
    print('F1       ', 100*round(tp/(tp + 0.5 * (fp + fn)),4))
    return tp, fp, tn, fn