Example #1
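The snippets on this page are shown without their import blocks. A plausible set of imports for them, assuming Prodigy's v1.x module layout (the exact paths are an assumption and may differ between versions; Example #6 additionally uses Streamlit):

import csv
import random
from typing import List, Optional

import spacy
from spacy.kb import KnowledgeBase

import streamlit as st

from prodigy.components.db import connect
from prodigy.components.filters import filter_duplicates
from prodigy.components.loaders import JSONL, TXT
from prodigy.components.preprocess import add_tokens, split_sentences
from prodigy.components.sorters import prefer_uncertain
from prodigy.models.matcher import PatternMatcher
from prodigy.models.ner import EntityRecognizer, merge_spans
from prodigy.util import combine_models, log, set_hashes, split_evals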
def model_stats(dataset, spacy_model, label=None, isPrf=False):
    """
    Evaluate model accuracy of model based on dataset with no training
    inspired from https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193/2
    found on https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193
    got basic model evaluation by looking at the batch-train recipe
    """

    log("RECIPE: Starting recipe ner.stats", locals())
    DB = connect()
    nlp = spacy.load(spacy_model)

    if isPrf:
        examples = gold_to_spacy(dataset, spacy_model)
        score = evaluate_prf(nlp, examples)
        print("Precision {:0.4f}\tRecall {:0.4f}\tF-score {:0.4f}".format(
            score['ents_p'], score['ents_r'], score['ents_f']))

    else:
        # ripped this from ner.batch-train recipe
        model = EntityRecognizer(nlp, label=label)
        evaldoc = merge_spans(DB.get_dataset(dataset))
        evals = list(split_sentences(model.orig_nlp, evaldoc))

        scores = model.evaluate(evals)

        print(
            "Accuracy {:0.4f}\tRight {:0.0f}\tWrong {:0.0f}\tUnknown {:0.0f}\tEntities {:0.0f}"
            .format(scores['acc'], scores['right'], scores['wrong'],
                    scores['unk'], scores['ents']))
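gold_to_spacy and evaluate_prf are helpers defined elsewhere in the same project. As a rough sketch of what evaluate_prf might look like on top of spaCy v2's scorer (the exact logic is an assumption, not the original code):

from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate_prf(nlp, examples):
    # examples: (text, {"entities": [(start, end, label), ...]}) tuples,
    # as produced by a gold_to_spacy-style converter
    scorer = Scorer()
    for text, annotations in examples:
        gold = GoldParse(nlp.make_doc(text), entities=annotations["entities"])
        scorer.score(nlp(text), gold)
    return scorer.scores  # includes 'ents_p', 'ents_r', 'ents_f'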
Example #2
def ner_silver_to_gold(
    silver_dataset: str,
    gold_dataset: str,
    spacy_model: str,
    label: Optional[List[str]] = None,
):
    """
    Take an existing "silver" dataset with binary accept/reject annotations,
    merge the annotations to find the best possible analysis given the
    constraints defined in the annotations, and manually edit it to create
    a perfect and complete "gold" dataset.
    """
    # Connect to the database using the settings from prodigy.json, check
    # that the silver dataset exists and load it
    DB = connect()
    if silver_dataset not in DB:
        raise ValueError("Can't find dataset '{}'.".format(silver_dataset))
    silver_data = DB.get_dataset(silver_dataset)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)
    if label is None:
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        ner = nlp.get_pipe("ner")
        label = sorted(ner.labels)

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    # Merge all annotations and find the best possible analyses
    stream = model.make_best(silver_data)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": gold_dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
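As written, the function is only half of a Prodigy recipe: to start it from the CLI it still needs the @prodigy.recipe decorator. A sketch of the registration, assuming Prodigy's standard plac-style argument annotations (the recipe name and descriptions are made up):

import prodigy
from prodigy.util import split_string

@prodigy.recipe(
    "ner.silver-to-gold",
    silver_dataset=("Existing dataset with binary annotations", "positional", None, str),
    gold_dataset=("Dataset to save gold annotations to", "positional", None, str),
    spacy_model=("Loadable spaCy model", "positional", None, str),
    label=("Comma-separated label(s)", "option", "l", split_string),
)
def ner_silver_to_gold(silver_dataset, gold_dataset, spacy_model, label=None):
    ...

It could then be run with something like `prodigy ner.silver-to-gold my_silver my_gold en_core_web_sm -F recipe.py`.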
Example #3
def ner_silver_to_gold(silver_dataset, gold_dataset, spacy_model, label=None):
    """
    Take an existing "silver" dataset with binary accept/reject annotations,
    merge the annotations to find the best possible analysis given the
    constraints defined in the annotations, and manually edit it to create
    a perfect and complete "gold" dataset.
    """
    # Connect to the database using the settings from prodigy.json, check
    # that the silver dataset exists and load it
    DB = connect()
    if silver_dataset not in DB:
        raise ValueError("Can't find dataset '{}'.".format(silver_dataset))
    silver_data = DB.get_dataset(silver_dataset)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)
    if not label:
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        ner = nlp.get_pipe('ner')
        moves = ner.move_names
        label = [
            move.split('-')[1] for move in moves
            if move[0] in ('B', 'I', 'L', 'U')
        ]
        label = sorted(set(label))

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    # Merge all annotations and find the best possible analyses
    stream = model.make_best(silver_data)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': gold_dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'labels': label  # Selectable label options
        }
    }
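The move-name parsing above is easy to check in isolation; with a hand-written list of moves it reduces the BILUO transition names to plain labels:

moves = ["B-PERSON", "I-PERSON", "L-PERSON", "U-PERSON", "B-ORG", "U-ORG", "O"]
labels = sorted({m.split("-")[1] for m in moves if m[0] in ("B", "I", "L", "U")})
print(labels)  # ['ORG', 'PERSON']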
Example #4
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    # Load the NLP and KB objects from file
    nlp = spacy.load(nlp_dir)
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(kb_loc)
    model = EntityRecognizer(nlp)

    # Read the pre-defined CSV file into a dictionary mapping QIDs to full names and descriptions
    id_dict = dict()
    with entity_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            id_dict[row[0]] = (row[1], row[2])

    # Initialize the Prodigy stream by running the NER model
    stream = TXT(source)
    stream = [set_hashes(eg) for eg in stream]
    stream = (eg for score, eg in model(stream))

    # For each NER mention, add the candidates from the KB to the annotation task
    stream = _add_options(stream, kb, id_dict)
    stream = filter_duplicates(stream, by_input=True, by_task=False)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {
            "choice_auto_accept": True
        },
    }
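_add_options is a project-specific helper. A plausible implementation, assuming entity_loc points at a CSV with QID,name,description rows and using spaCy v2's kb.get_candidates(); the NIL option ids and option texts are assumptions:

def _add_options(stream, kb, id_dict):
    # Hypothetical helper: attach KB candidates as multiple-choice options
    for task in stream:
        # assumes one NER span per task, as produced by the model above
        span = task["spans"][0]
        candidates = kb.get_candidates(span["text"])
        if not candidates:
            continue
        options = []
        for c in candidates:
            name, desc = id_dict.get(c.entity_, ("", ""))
            options.append({"id": c.entity_, "text": f"{c.entity_}: {name} ({desc})"})
        # fallback options for mentions that can't be resolved to the KB
        options.append({"id": "NIL_otherLink", "text": "Link not in options"})
        options.append({"id": "NIL_ambiguous", "text": "Need more context"})
        task["options"] = options
        yield task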
Example #5
def ner_teach(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    unsegmented: bool = False,
):
    """
    Collect the best possible training data for a named entity recognition
    model with the model in the loop. Based on your annotations, Prodigy will
    decide which questions to ask next.
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    if patterns is None:
        # No patterns are used, so just use the NER model to suggest examples
        # and only use the model's update method as the update callback
        predict = model
        update = model.update
    else:
        # Initialize the pattern matcher and load in the JSONL patterns
        matcher = PatternMatcher(nlp).from_disk(patterns)
        # Combine the NER model and the matcher and interleave their
        # suggestions and update both at the same time
        predict, update = combine_models(model, matcher)

    if not unsegmented:
        # Use spaCy to split text into sentences
        stream = split_sentences(nlp, stream)

    # Use the prefer_uncertain sorter to focus on suggestions that the model
    # is most uncertain about (i.e. with a score closest to 0.5). The model
    # yields (score, example) tuples and the sorter yields just the example
    stream = prefer_uncertain(predict(stream))

    return {
        "view_id": "ner",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": update,  # Update callback, called with batch of answers
        "exclude": exclude,  # List of dataset names to exclude
        "config": {
            "lang": nlp.lang
        },  # Additional config settings, mostly for app UI
    }
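The sorter is the only non-obvious moving part here. A toy illustration of what prefer_uncertain consumes and emits (it favours scores near 0.5 via a moving-average heuristic, so the ordering is indicative rather than guaranteed):

from prodigy.components.sorters import prefer_uncertain

def fake_predictions():
    yield (0.93, {"text": "confident accept"})
    yield (0.51, {"text": "genuinely uncertain"})
    yield (0.07, {"text": "confident reject"})

for eg in prefer_uncertain(fake_predictions()):
    print(eg["text"])  # the uncertain example tends to surface first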
Example #6
# Fragment of a Streamlit-based training recipe; the opening lines are
# missing from the source.
    else:
        nlp.add_pipe(ner)
    ner.begin_training([])
else:
    ner = nlp.get_pipe("ner")
for label in all_labels:
    ner.add_label(label)
random.shuffle(examples)
train_examples, evals, eval_split = split_evals(
    merged_examples, eval_split)
st.success(
    f"✅ Using **{len(train_examples)}** training examples "
    f"and **{len(evals)}** evaluation examples with "
    f"**{len(all_labels)}** label(s)")
annot_model = EntityRecognizer(nlp,
                               label=all_labels,
                               no_missing=no_missing)
batch_size = guess_batch_size(len(train_examples))
baseline = annot_model.evaluate(evals)
st.info(
    f"ℹ️ **Baseline**\n**{baseline['right']:.0f}** right "
    f"entities, **{baseline['wrong']:.0f}** wrong entities, "
    f"**{baseline['unk']:.0f}** unknown entities, "
    f"**{baseline['ents']:.0f}** total predicted, "
    f"**{baseline['acc']:.2f}** accuracy")
progress = st.progress(0)
results = []
result_table = st.empty()
best_acc = 0.0
for i in range(n_iter):
    random.shuffle(train_examples)
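guess_batch_size and split_evals aren't shown in this fragment. A purely hypothetical stand-in for the former, just to make the fragment's intent concrete (the thresholds are invented):

def guess_batch_size(n_examples):
    # Hypothetical: scale the batch size with the number of training examples
    if n_examples < 1000:
        return 4
    if n_examples < 10000:
        return 16
    return 32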
Example #7
def ner_teach(dataset,
              spacy_model,
              source=None,
              label=None,
              patterns=None,
              exclude=None,
              unsegmented=False):
    """
    Collect the best possible training data for a named entity recognition
    model with the model in the loop. Based on your annotations, Prodigy will
    decide which questions to ask next.
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    if patterns is None:
        # No patterns are used, so just use the NER model to suggest examples
        # and only use the model's update method as the update callback
        predict = model
        update = model.update
    else:
        # Initialize the pattern matcher and load in the JSONL patterns
        matcher = PatternMatcher(nlp).from_disk(patterns)
        # Combine the NER model and the matcher and interleave their
        # suggestions and update both at the same time
        predict, update = combine_models(model, matcher)

    if not unsegmented:
        # Use spaCy to split text into sentences
        stream = split_sentences(nlp, stream)

    def auto_skip(stream):
        # Score the stream and hand the (score, example) tuples straight
        # to the prefer_uncertain sorter
        predictions = predict(stream)
        return prefer_uncertain(predictions)

    # Use the prefer_uncertain sorter to focus on suggestions that the model
    # is most uncertain about (i.e. with a score closest to 0.5). The model
    # yields (score, example) tuples and the sorter yields just the example
    stream = auto_skip(stream)

    with open('src/prodigy/ner_teach.js') as ner_teach_js_file:
        ner_teach_js = ner_teach_js_file.read()

    return {
        'view_id': 'ner',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'update': update,  # Update callback, called with batch of answers
        'exclude': exclude,  # List of dataset names to exclude
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'label': ', '.join(label) if label is not None else 'all',
            'javascript': ner_teach_js
        }
    }
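One caveat with the hard-coded 'src/prodigy/ner_teach.js' path: it only resolves when Prodigy is launched from the project root. A sketch of a location-independent alternative, assuming the JS file sits next to the recipe module:

from pathlib import Path

# Resolve ner_teach.js relative to this recipe file instead of the
# current working directory (assumes the file lives alongside the recipe)
ner_teach_js = (Path(__file__).parent / "ner_teach.js").read_text(encoding="utf8")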