def make_tasks(nlp, stream, labels):
    """Add a 'spans' key to each example, with predicted entities."""
    # Process the stream using spaCy's nlp.pipe, which yields Doc objects.
    # If as_tuples=True is set, you can pass in (text, context) tuples.
    texts = ((eg["text"], eg) for eg in stream)
    for doc, eg in nlp.pipe(texts, as_tuples=True):
        task = copy.deepcopy(eg)
        spans = []
        for ent in doc.ents:
            # Skip the predicted entity if its label is not in the selected labels
            if labels and ent.label_ not in labels:
                continue
            # Create a span dict for the predicted entity
            spans.append(
                {
                    "token_start": ent.start,
                    "token_end": ent.end - 1,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "text": ent.text,
                    "label": ent.label_,
                }
            )
        task["spans"] = spans
        # Rehash the newly created task so that hashes reflect the added data
        task = set_hashes(task)
        yield task
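# Usage sketch for make_tasks above -- minimal wiring, assuming Prodigy's
# JSONL loader and a pretrained spaCy pipeline are available; the file name
# and label set are placeholders:
import copy

import spacy
from prodigy.components.loaders import JSONL
from prodigy.util import set_hashes

nlp = spacy.load("en_core_web_sm")        # any pipeline with an NER component
stream = JSONL("./news_headlines.jsonl")  # dicts with a "text" key
for task in make_tasks(nlp, stream, labels=["PERSON", "ORG"]):
    print(task["spans"])  # pre-highlighted entity spans for annotation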
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    # Load the NLP and KB objects from file
    nlp = spacy.load(nlp_dir)
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(kb_loc)
    model = EntityRecognizer(nlp)

    # Read the pre-defined CSV file into a dictionary mapping QIDs to
    # (full name, description) tuples
    id_dict = dict()
    with entity_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            id_dict[row[0]] = (row[1], row[2])

    # Initialize the Prodigy stream by running the NER model
    stream = TXT(source)
    stream = [set_hashes(eg) for eg in stream]
    stream = (eg for score, eg in model(stream))

    # For each NER mention, add the candidates from the KB to the annotation task
    stream = _add_options(stream, kb, id_dict)
    stream = filter_duplicates(stream, by_input=True, by_task=False)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {"choice_auto_accept": True},
    }
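# To serve the recipe above, it would typically be registered with Prodigy's
# recipe decorator and started from the CLI -- a sketch with hypothetical
# paths (the recipe name and CLI invocation are assumptions, not part of the
# original source):
import prodigy

recipe = prodigy.recipe("entity_linker.manual")(entity_linker_manual)
# Then, from the shell:
#   prodigy entity_linker.manual my_dataset ./input.txt ./my_nlp ./my_kb \
#       ./entities.csv -F this_file.py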
def ner_translate(
    in_sets: List[str],
    out_set: str,
    model_name_or_path: str,
    source_lang: str,
    target_lang: str,
    dry: bool = False,
) -> None:
    translator = TransformersMarianTranslator(
        model_name_or_path, source_lang=source_lang, target_lang=target_lang
    )
    DB = connect()
    for set_id in in_sets:
        if set_id not in DB:
            msg.fail(f"Can't find dataset '{set_id}' in database", exits=1)
    if out_set in DB and len(DB.get_dataset(out_set)):
        msg.fail(
            f"Output dataset '{out_set}' already exists and includes examples.",
            "This can lead to unexpected results. Please use a new dataset.",
            exits=1,
        )
    if out_set not in DB:
        if not dry:
            DB.add_dataset(out_set)
            msg.good(f"Created dataset '{out_set}'")

    matched_examples_t = []
    mismatched_examples_t = []
    for set_id in in_sets:
        msg.text(f"RECIPE: Translating and merging examples from '{set_id}'")
        raw_examples = DB.get_dataset(set_id)
        examples = [Example(**e) for e in raw_examples]
        examples_t = translate_ner_batch(
            examples, translate_f=translator.pipe, target_lang=target_lang
        )
        for e, e_t in zip(examples, examples_t):
            if len(e.spans) != len(e_t.spans):
                mismatched_examples_t.append(e_t)
            else:
                matched_examples_t.append(e_t)
        msg.text(f"RECIPE: Translated {len(matched_examples_t)} examples from '{set_id}'")
        msg.text(
            f"RECIPE: Found {len(mismatched_examples_t)} examples with "
            f"mismatched spans after translation from '{set_id}'"
        )

    # Hash each example individually (set_hashes operates on a single task dict)
    matched_examples_t = [set_hashes(e.dict()) for e in matched_examples_t]
    if not dry:
        DB.add_examples(matched_examples_t, datasets=[out_set])
        msg.good(
            f"Translated and merged {len(matched_examples_t)} examples "
            f"from {len(in_sets)} datasets",
            f"Created translated and merged dataset '{out_set}'",
        )
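# Hypothetical invocation of ner_translate (dataset names are placeholders;
# "Helsinki-NLP/opus-mt-en-de" follows the Marian model naming scheme on the
# Hugging Face hub):
ner_translate(
    in_sets=["ner_news_en"],
    out_set="ner_news_de",
    model_name_or_path="Helsinki-NLP/opus-mt-en-de",
    source_lang="en",
    target_lang="de",
    dry=True,  # report counts without writing to the database
)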
def pipe(source=None, api=None, loader=None, from_dataset=False, exclude=None):
    """
    Load examples from an input source and print them as newline-delimited
    JSON. This makes it easy to filter the stream with command-line utilities
    such as `grep`. It's also often useful to inspect the stream by piping it
    to `less`.
    """
    DB = connect()
    if from_dataset:
        stream = DB.get_dataset(source)
    else:
        stream = get_stream(source, api, loader)
    stream = (set_hashes(eg) for eg in stream)
    if exclude:
        log("RECIPE: Excluding tasks from datasets: {}".format(", ".join(exclude)))
        exclude_hashes = DB.get_input_hashes(*exclude)
        stream = filter_inputs(stream, exclude_hashes)
    try:
        for eg in stream:
            print(ujson.dumps(eg, escape_forward_slashes=False))
    except KeyboardInterrupt:
        pass
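# Usage sketch: dump a dataset as newline-delimited JSON, skipping inputs
# already present in another dataset (names are placeholders). The printed
# output can then be filtered with `grep` or paged with `less`:
pipe("my_dataset", from_dataset=True, exclude=["already_annotated"])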
def to_prodigy(
    examples: List[Example],
    prodigy_dataset: str,
    overwrite_dataset: bool = False,
    add_hash: bool = True,
) -> None:
    """Save a list of examples to Prodigy.

    Args:
        examples (List[Example]): Input examples
        prodigy_dataset (str): Name of the Prodigy dataset to save to
        overwrite_dataset (bool, optional): If True, overwrite all data in an
            existing Prodigy dataset of the same name
        add_hash (bool, optional): If True, add input and task hashes to each
            example before saving

    Raises:
        ValueError: If trying to save examples to an existing dataset without
            explicitly setting overwrite_dataset to True
    """
    from prodigy.core import connect
    from prodigy.util import set_hashes

    db = connect()
    if db.get_dataset(prodigy_dataset):
        if overwrite_dataset:
            db.drop_dataset(prodigy_dataset)
            db.add_dataset(prodigy_dataset)
        else:
            raise ValueError(f"Prodigy dataset {prodigy_dataset} already exists.")
    else:
        db.add_dataset(prodigy_dataset)

    examples = [e.dict() for e in examples]
    prodigy_examples = []
    for e in examples:
        # Respect the add_hash flag (it was accepted but unused before)
        prodigy_examples.append(set_hashes(e) if add_hash else e)
    db.add_examples(prodigy_examples, [prodigy_dataset])
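# Usage sketch for to_prodigy (assumes Example is a pydantic-style model
# exposing .dict(); the field values shown are placeholders):
examples = [Example(text="Apple hired Tim Cook.", spans=[])]
to_prodigy(examples, "imported_annotations", overwrite_dataset=True)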
def db_in(set_id, in_file, loader=None, answer="accept", overwrite=False, dry=False):
    """
    Import annotations to the database. Supports all formats loadable by
    Prodigy.
    """
    DB = connect()
    if not in_file.exists() or not in_file.is_file():
        prints("Not a valid input file.", in_file, exits=1, error=True)
    if set_id not in DB:
        prints(
            "Can't find '{}' in database {}.".format(set_id, DB.db_name),
            "Maybe you misspelled the name or forgot to add the dataset "
            "using the `dataset` command?",
            exits=1,
            error=True,
        )
    loader = get_loader(loader, file_path=in_file)
    annotations = loader(in_file)
    annotations = [set_hashes(eg) for eg in annotations]
    added_answers = 0
    for task in annotations:
        if "answer" not in task or overwrite:
            task["answer"] = answer
            added_answers += 1
    session_id = get_timestamp_session_id()
    if not dry:
        DB.add_dataset(session_id, session=True)
        DB.add_examples(annotations, datasets=[set_id, session_id])
    prints(
        "Imported {} annotations for '{}' to database {}".format(
            len(annotations), set_id, DB.db_name
        ),
        "Added '{}' answer to {} annotations".format(answer, added_answers),
        "Session ID: {}".format(session_id),
    )
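# Direct invocation sketch for db_in; in_file must be a pathlib.Path to match
# the .exists()/.is_file() checks above (path and dataset name are
# placeholders):
from pathlib import Path

db_in("my_dataset", Path("./annotations.jsonl"), answer="accept", dry=True)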
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=100,
    batch_size=5,
    case_sensitive=False,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases.
    For each seed term, the best matching sense according to the sense2vec
    vectors will be used. If no similar terms are found above the given
    threshold, the threshold is lowered by 0.1 and similar terms are requested
    again.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1)
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        seen.add(best_word if case_sensitive else best_word.lower())
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {"score": 1.0, "sense": best_sense},
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept_keys = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_words = [
            eg["word"] if case_sensitive else eg["word"].lower() for eg in prev
        ]
        accept_keys += prev_accept_keys
        seen.update(set(prev_words))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}")

    def update(answers):
        """Update accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        present examples to the user with a similarity above the threshold
        parameter."""
        nonlocal threshold
        while True:
            log(
                f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys"
            )
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            n_skipped = 0
            n_duplicate = 0
            for key, score in most_similar:
                if score > threshold:
                    word, sense = s2v.split_key(key)
                    if (case_sensitive and word in seen) or (
                        not case_sensitive and word.lower() in seen
                    ):
                        n_duplicate += 1
                        continue
                    seen.add(word if case_sensitive else word.lower())
                    # Make sure the score is a regular float, otherwise the
                    # server may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score), "sense": sense}
                    yield {"text": key, "word": word, "sense": sense, "meta": meta}
                else:
                    n_skipped += 1
            if n_skipped:
                log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}")
            if n_skipped == len(most_similar) - n_duplicate:
                # No most similar phrases were found that are above the
                # threshold, so lower the threshold if it's not already 0, or
                # return an empty list so Prodigy shows "no tasks available"
                new_threshold = threshold - 0.1
                if new_threshold <= 0.0:
                    log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                    return []
                log(
                    f"RECIPE: Lowering threshold from {threshold:.2} "
                    f"to {new_threshold:.2}"
                )
                threshold = new_threshold

    stream = get_stream()
    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {"batch_size": batch_size, "html_template": html_template},
    }
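# Usage sketch for the recipe above, called directly rather than via the
# Prodigy CLI (vector path and seeds are placeholders):
from itertools import islice

components = teach(
    "tech_terms",
    "./s2v_reddit_2015_md",
    seeds=["machine learning", "neural network"],
    threshold=0.85,
)
# Pull a few suggestions off the stream, as the Prodigy server would:
for task in islice(components["stream"], 3):
    print(task["text"], task["meta"]["score"])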
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=20,
    batch_size=5,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases.
    For each seed term, the best matching sense according to the sense2vec
    vectors will be used.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = (
        "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
        "<strong style='opacity: 0.75'>{{sense}}</strong>"
    )
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            raise ValueError(f"Can't find seed term '{seed}' in vectors")
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {"score": 1.0},
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        accept_keys += prev_accept
        seen.update(set(accept_keys))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}")

    def update(answers):
        """Update accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        present examples to the user with a similarity above the threshold
        parameter."""
        while True:
            log(
                f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys"
            )
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            for key, score in most_similar:
                if key not in seen and score > threshold:
                    seen.add(key)
                    word, sense = s2v.split_key(key)
                    # Make sure the score is a regular float, otherwise the
                    # server may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score)}
                    yield {"text": key, "word": word, "sense": sense, "meta": meta}

    stream = get_stream()
    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {"batch_size": batch_size, "html_template": html_template},
    }
def teach(dataset, vectors_path, seeds, threshold=0.85, top_n=200, batch_size=5,
          resume=False):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec.
    """
    SENSES = [
        "auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
        "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT",
        "WORK_OF_ART", "LANGUAGE",
    ]
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    LEMMATIZER = English().vocab.morphology.lemmatizer
    S2V = sense2vec.load(vectors_path)
    log("RECIPE: Finished loading sense2vec", locals())
    # Seems to be a bug in sense2vec which gets < n similar senses, not <= n
    batch_size = min(batch_size, top_n * len(seeds))
    top_n = top_n + 1
    DB = connect()
    seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds]
    DB.add_examples(seed_tasks, datasets=[dataset])
    accept_phrases = seeds
    reject_phrases = []
    seen = set(accept_phrases)
    sensed = set()

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"]
        accept_phrases += prev_accept
        reject_phrases += prev_reject
        seen.update(set(accept_phrases))
        seen.update(set(reject_phrases))
        log("RECIPE: Resuming from {} previous examples in dataset {}".format(
            len(prev), dataset))

    def format_for_s2v(word, sense):
        return word.replace(" ", "_") + "|" + sense

    def get_best(word, sense):
        if sense != "auto":  # if a sense is specified, find the respective entry
            if format_for_s2v(word, sense) in S2V:
                return (word, sense)
            return (None, None)
        freqs = []
        casings = [word, word.upper(), word.title()] if word.islower() else [word]
        for text in casings:  # try the different casing options
            for tag in SENSES:
                query = format_for_s2v(text, tag)
                if query in S2V:
                    freqs.append((S2V[query][0], (text, tag)))
        return max(freqs)[1] if freqs else (None, None)

    def get_similar(word, sense, n=100):
        query = format_for_s2v(word, sense)
        if query not in S2V:
            return []
        freq, query_vector = S2V[query]
        words, scores = S2V.most_similar(query_vector, n)
        words = [word.rsplit("|", 1) for word in words]
        # Don't know why we'd be getting unsensed entries, but filter them out
        words = [entry for entry in words if len(entry) == 2]
        words = [(word.replace("_", " "), sense) for word, sense in words]
        return zip(words, scores)

    def find_similar(word: str, sense: str = "auto", n_results: int = top_n):
        """Find similar terms for a given term and optional sense."""
        best_word, best_sense = get_best(word, sense)
        results = []
        if not word or not best_word:
            return results
        seen = set([best_word, min(LEMMATIZER(best_word, best_sense))])
        similar = get_similar(best_word, best_sense, n_results)
        for (word_entry, sense_entry), score in similar:
            head = min(LEMMATIZER(word_entry, sense_entry))
            if head not in seen and score > threshold:
                freq, _ = S2V[format_for_s2v(word_entry, sense_entry)]
                results.append((score, word_entry))
                seen.add(head)
            if len(results) >= n_results:
                break
        return results

    def update(answers):
        """Update accept_phrases so that the stream can find new phrases."""
        for answer in answers:
            if answer["answer"] == "accept":
                accept_phrases.append(answer["text"])
            elif answer["answer"] == "reject":
                reject_phrases.append(answer["text"])

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        present examples to the user with a similarity above the threshold
        parameter."""
        while True:
            seen.update(set([rp.lower() for rp in reject_phrases]))
            for p in accept_phrases:
                if p.lower() not in sensed:
                    sensed.add(p.lower())
                    for score, phrase in find_similar(p):
                        if phrase.lower() not in seen:
                            seen.add(phrase.lower())
                            yield {"text": phrase, "meta": {"score": score}}

    stream = get_stream()
    return {
        "view_id": "text",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {"batch_size": batch_size},
    }
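# Quick check of the "word|sense" key format assembled by format_for_s2v in
# the recipe above (recomputed standalone, since the helper is nested):
word, sense = "machine learning", "NOUN"
key = word.replace(" ", "_") + "|" + sense
print(key)  # machine_learning|NOUN -- how entries are keyed in the s2v table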
def phrases_teach(dataset, seeds, threshold=0.85, batch_size=5, resume=False):
    """
    Bootstrap a terminology list with word vectors and seed terms. Prodigy
    will suggest similar terms based on the word vectors, and update the
    target vector accordingly.
    """
    DB = connect()
    seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds]
    DB.add_examples(seed_tasks, datasets=[dataset])
    accept_phrases = seeds
    reject_phrases = []
    seen = set(accept_phrases)
    sensed = set()

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"]
        accept_phrases += prev_accept
        reject_phrases += prev_reject
        seen.update(set(accept_phrases))
        seen.update(set(reject_phrases))

    def sense2vec(phrase, threshold):
        """Call the sense2vec API to get similar "senses" (phrases)."""
        res = requests.post(API_URL, {"sense": "auto", "word": phrase})
        results = res.json()["results"]
        output = []
        for r in results:
            if r["score"] > threshold or len(output) <= 10:
                output.append((r["score"], r["text"]))
        return output

    def update(answers):
        """Update accept_phrases so that the stream can find new phrases."""
        for answer in answers:
            if answer["answer"] == "accept":
                accept_phrases.append(answer["text"])
            elif answer["answer"] == "reject":
                reject_phrases.append(answer["text"])

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        present examples to the user with a similarity above the threshold
        parameter."""
        while True:
            seen.update(set([rp.lower() for rp in reject_phrases]))
            for p in accept_phrases:
                if p.lower() not in sensed:
                    sensed.add(p.lower())
                    for score, phrase in sense2vec(p, threshold):
                        if phrase.lower() not in seen:
                            seen.add(phrase.lower())
                            yield score, {"text": phrase, "meta": {"score": score}}

    stream = Probability(get_stream())
    return {
        "view_id": "text",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {"batch_size": batch_size},
    }
def terms_teach(dataset, vectors, seeds):
    """
    Bootstrap a terminology list with word vectors and seed terms. Prodigy
    will suggest similar terms based on the word vectors, and update the
    target vector accordingly.
    """
    # Connect to the database using the settings from prodigy.json and add the
    # seed terms to the dataset
    DB = connect()
    if dataset and dataset in DB:
        seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds]
        DB.add_examples(seed_tasks, datasets=[dataset])

    # Load the spaCy model with vectors
    nlp = spacy.load(vectors)

    # Create two Doc objects for the accepted and rejected terms
    accept_doc = Doc(nlp.vocab, words=seeds)
    reject_doc = Doc(nlp.vocab, words=[])
    score = 0

    def predict(term):
        """Score a term given the current accept_doc and reject_doc."""
        if len(accept_doc) == 0 and len(reject_doc) == 0:
            return 0.5
        # Use spaCy's .similarity() method to compare the term to the
        # accepted and rejected Docs
        accept_score = max(term.similarity(accept_doc), 0.0)
        reject_score = max(term.similarity(reject_doc), 0.0)
        score = accept_score / (accept_score + reject_score + 0.2)
        return max(score, 0.0)

    def update(answers):
        # Called whenever Prodigy receives new annotations
        nonlocal accept_doc, reject_doc, score
        accept_words = [t.text for t in accept_doc]
        reject_words = [t.text for t in reject_doc]
        for answer in answers:
            # Increase or decrease the score depending on the answer, and
            # update the lists of accepted and rejected terms
            if answer["answer"] == "accept":
                score += 1
                accept_words.append(answer["text"])
            elif answer["answer"] == "reject":
                score -= 1
                reject_words.append(answer["text"])
        # Update the target documents in place
        accept_doc = Doc(nlp.vocab, words=accept_words)
        reject_doc = Doc(nlp.vocab, words=reject_words)

    def score_stream(stream):
        # Get all lexemes in the vocab and score them
        lexemes = [lex for lex in stream if lex.is_alpha and lex.is_lower]
        while True:
            seen = set(w.orth for w in accept_doc)
            seen.update(set(w.orth for w in reject_doc))
            lexemes = [w for w in lexemes if w.orth not in seen]
            by_score = [(predict(lex), lex) for lex in lexemes]
            by_score.sort(reverse=True)
            for _, term in by_score:
                score = predict(term)
                # Return (score, example) tuples for the scored terms
                yield score, {"text": term.text, "meta": {"score": score}}

    # Sort the scored vocab by probability and return examples
    stream = Probability(score_stream(nlp.vocab))
    return {
        "view_id": "text",    # Annotation interface to use
        "dataset": dataset,   # Name of dataset to save annotations
        "stream": stream,     # Incoming stream of examples
        "update": update,     # Update callback, called with answers
    }
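# Worked example of the scoring rule in predict() above (numbers are
# illustrative):
accept_score, reject_score = 0.6, 0.1
score = accept_score / (accept_score + reject_score + 0.2)
print(round(score, 2))  # 0.67 -- the +0.2 smoothing term keeps scores below
# 1.0 and dampens terms whose similarity to both documents is low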