def model_stats(dataset, spacy_model, label=None, isPrf=False):
    """
    Evaluate model accuracy of model based on dataset with no training

    inspired from https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193/2
    found on https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193
    got basic model evaluation by looking at the batch-train recipe

    dataset: name of the Prodigy dataset holding the gold annotations.
    spacy_model: loadable spaCy model name or path.
    label: entity label(s) forwarded to EntityRecognizer (accuracy mode only).
    isPrf: if True, print precision/recall/F-score; otherwise print
        Prodigy-style accuracy statistics.
    """
    log("RECIPE: Starting recipe ner.stats", locals())
    DB = connect()
    nlp = spacy.load(spacy_model)
    # BUG FIX: the original assigned `isPrf = 'True'` right here, which
    # unconditionally overwrote the parameter and made the accuracy branch
    # below unreachable. The caller's choice is now respected.
    if isPrf:
        examples = gold_to_spacy(dataset, spacy_model)
        score = evaluate_prf(nlp, examples)
        print("Precision {:0.4f}\tRecall {:0.4f}\tF-score {:0.4f}".format(
            score['ents_p'], score['ents_r'], score['ents_f']))
    else:
        # ripped this from ner.batch-train recipe
        model = EntityRecognizer(nlp, label=label)
        evaldoc = merge_spans(DB.get_dataset(dataset))
        evals = list(split_sentences(model.orig_nlp, evaldoc))
        scores = model.evaluate(evals)
        print(
            "Accuracy {:0.4f}\tRight {:0.0f}\tWrong {:0.0f}\tUnknown {:0.0f}\tEntities {:0.0f}"
            .format(scores['acc'], scores['right'], scores['wrong'],
                    scores['unk'], scores['ents']))
def ner_silver_to_gold(
    silver_dataset: str,
    gold_dataset: str,
    spacy_model: str,
    label: Optional[List[str]] = None,
):
    """
    Upgrade a binary "silver" dataset to a "gold" one: merge the
    accept/reject decisions into the best consistent analysis per example,
    then open the manual NER interface so the result can be corrected into
    a perfect and complete annotation set.
    """
    # Database settings come from prodigy.json; fail fast when the
    # requested silver dataset is missing.
    db = connect()
    if silver_dataset not in db:
        raise ValueError("Can't find dataset '{}'.".format(silver_dataset))
    annotations = db.get_dataset(silver_dataset)

    nlp = spacy.load(spacy_model)
    if label is None:
        # No labels supplied: take whatever the model's NER component
        # exposes (derived from its available moves, e.g. B-PERSON,
        # I-PERSON, L-PERSON, U-PERSON).
        label = sorted(nlp.get_pipe("ner").labels)

    # Beam-search entity model; make_best() reconciles the binary
    # annotations into the best possible analysis for each example.
    recognizer = EntityRecognizer(nlp, label=label)
    stream = recognizer.make_best(annotations)

    # Attach a "tokens" property (and handle pre-selected spans) so the
    # manual UI can snap highlights to token boundaries quickly.
    stream = add_tokens(nlp, stream)

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": gold_dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
def ner_silver_to_gold(silver_dataset, gold_dataset, spacy_model, label=None):
    """
    Take an existing "silver" dataset with binary accept/reject annotations,
    merge the annotations to find the best possible analysis given the
    constraints defined in the annotations, and manually edit it to create
    a perfect and complete "gold" dataset.

    silver_dataset: name of the existing binary-annotated dataset.
    gold_dataset: name of the dataset the corrected annotations go to.
    spacy_model: loadable spaCy model name or path.
    label: entity labels to use; when falsy, labels are derived from the
        model's NER move names.
    """
    # Connect to the database using the settings from prodigy.json, check
    # that the silver dataset exists and load it
    DB = connect()
    if silver_dataset not in DB:
        raise ValueError("Can't find dataset '{}'.".format(silver_dataset))
    silver_data = DB.get_dataset(silver_dataset)
    # Load the spaCy model
    nlp = spacy.load(spacy_model)
    if not label:
        # BUG FIX: the default was the mutable `label=[]`, which is shared
        # across calls and a classic Python pitfall; `None` is the safe
        # sentinel and `if not label` behaves identically for both.
        # Get the labels from the model by looking at the available moves,
        # e.g. B-PERSON, I-PERSON, L-PERSON, U-PERSON
        ner = nlp.get_pipe('ner')
        label = sorted({
            move.split('-')[1]
            for move in ner.move_names
            if move[0] in ('B', 'I', 'L', 'U')
        })
    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)
    # Merge all annotations and find the best possible analyses
    stream = model.make_best(silver_data)
    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)
    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': gold_dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'labels': label  # Selectable label options
        }
    }
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    """
    Manually link NER mentions to knowledge-base entries: run the model over
    the source text and present the KB candidates for each mention as
    selectable choice options.
    """
    # Load the pipeline and its knowledge base from disk.
    nlp = spacy.load(nlp_dir)
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(kb_loc)
    model = EntityRecognizer(nlp)

    # Map each QID in the pre-defined CSV to its (full name, description).
    with entity_loc.open("r", encoding="utf8") as csvfile:
        id_dict = {
            row[0]: (row[1], row[2])
            for row in csv.reader(csvfile, delimiter=",")
        }

    # Seed the stream from the raw text, hash every task, then keep only
    # the example half of the (score, example) pairs the model produces.
    stream = TXT(source)
    stream = [set_hashes(task) for task in stream]
    stream = (example for _score, example in model(stream))

    # Attach the KB candidates as options and drop duplicate inputs.
    stream = _add_options(stream, kb, id_dict)
    stream = filter_duplicates(stream, by_input=True, by_task=False)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {"choice_auto_accept": True},
    }
def ner_teach(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    unsegmented: bool = False,
):
    """
    Model-in-the-loop NER annotation: the entity model scores candidate
    analyses, Prodigy asks about the ones it is least certain of, and each
    batch of answers updates the model so the next questions improve.
    """
    # One task dict per line of the JSONL input, yielded lazily.
    stream = JSONL(source)
    nlp = spacy.load(spacy_model)

    # Beam-search entity model producing (score, example) tuples.
    model = EntityRecognizer(nlp, label=label)
    if patterns is not None:
        # Pattern mode: interleave the matcher's suggestions with the
        # model's, and route updates to both at once.
        matcher = PatternMatcher(nlp).from_disk(patterns)
        predict, update = combine_models(model, matcher)
    else:
        # Model-only mode: predictions and the update callback both come
        # straight from the entity model.
        predict = model
        update = model.update

    if not unsegmented:
        # Ask about one sentence at a time.
        stream = split_sentences(nlp, stream)

    # Surface the suggestions the model is least sure about (scores nearest
    # 0.5); the sorter strips the score and yields bare examples.
    stream = prefer_uncertain(predict(stream))

    return {
        "view_id": "ner",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": update,  # Update callback, called with batch of answers
        "exclude": exclude,  # List of dataset names to exclude
        "config": {"lang": nlp.lang},  # Additional config settings, mostly for app UI
    }
else: nlp.add_pipe(ner) ner.begin_training([]) else: ner = nlp.get_pipe("ner") for label in all_labels: ner.add_label(label) random.shuffle(examples) train_examples, evals, eval_split = split_evals( merged_examples, eval_split) st.success( f"✅ Using **{len(train_examples)}** training examples " f"and **{len(evals)}** evaluation examples with " f"**{len(all_labels)}** label(s)") annot_model = EntityRecognizer(nlp, label=all_labels, no_missing=no_missing) batch_size = guess_batch_size(len(train_examples)) baseline = annot_model.evaluate(evals) st.info( f"ℹ️ **Baseline**\n**{baseline['right']:.0f}** right " f"entities, **{baseline['wrong']:.0f}** wrong entities, " f"**{baseline['unk']:.0f}** unkown entities, " f"**{baseline['ents']:.0f}** total predicted, " f"**{baseline['acc']:.2f}** accuracy") progress = st.progress(0) results = [] result_table = st.empty() best_acc = 0.0 for i in range(n_iter): random.shuffle(train_examples)
def ner_teach(dataset, spacy_model, source=None, label=None, patterns=None,
              exclude=None, unsegmented=False):
    """
    Collect the best possible training data for a named entity recognition
    model with the model in the loop. Based on your annotations, Prodigy
    will decide which questions to ask next.

    dataset: name of the dataset annotations are saved to.
    spacy_model: loadable spaCy model name or path.
    source: path to a JSONL file with the input examples.
    label: entity label(s) to annotate; None means all labels.
    patterns: optional JSONL patterns file for the PatternMatcher.
    exclude: dataset names whose examples should be skipped.
    unsegmented: if True, do not split texts into sentences.
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)
    # Load the spaCy model
    nlp = spacy.load(spacy_model)
    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)
    if patterns is None:
        # No patterns are used, so just use the NER model to suggest examples
        # and only use the model's update method as the update callback
        predict = model
        update = model.update
    else:
        # Initialize the pattern matcher and load in the JSONL patterns
        matcher = PatternMatcher(nlp).from_disk(patterns)
        # Combine the NER model and the matcher and interleave their
        # suggestions and update both at the same time
        predict, update = combine_models(model, matcher)
    if not unsegmented:
        # Use spaCy to split text into sentences
        stream = split_sentences(nlp, stream)
    # Use the prefer_uncertain sorter to focus on suggestions that the model
    # is most uncertain about (i.e. with a score closest to 0.5). The model
    # yields (score, example) tuples and the sorter yields just the example.
    # BUG FIX: the old auto_skip() wrapper called print() on the raw
    # generator returned by predict(stream) — debug leftover that only
    # dumped a generator repr to stdout — so the wrapper is removed and the
    # sorter is applied directly.
    stream = prefer_uncertain(predict(stream))
    # Custom JavaScript injected into the annotation UI.
    with open('src/prodigy/ner_teach.js') as ner_teach_js_file:
        ner_teach_js = ner_teach_js_file.read()
    return {
        'view_id': 'ner',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'update': update,  # Update callback, called with batch of answers
        'exclude': exclude,  # List of dataset names to exclude
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'label': ', '.join(label) if label is not None else 'all',
            'javascript': ner_teach_js
        }
    }