Example #1
def make_tasks(nlp, stream, labels):
    """Add a 'spans' key to each example, with predicted entities."""
    # Process the stream using spaCy's nlp.pipe, which yields doc objects.
    # If as_tuples=True is set, you can pass in (text, context) tuples.
    texts = ((eg["text"], eg) for eg in stream)
    for doc, eg in nlp.pipe(texts, as_tuples=True):
        task = copy.deepcopy(eg)
        spans = []
        for ent in doc.ents:
            # Continue if predicted entity is not selected in labels
            if labels and ent.label_ not in labels:
                continue
            # Create span dict for the predicted entity
            spans.append({
                "token_start": ent.start,
                "token_end": ent.end - 1,
                "start": ent.start_char,
                "end": ent.end_char,
                "text": ent.text,
                "label": ent.label_,
            })
        task["spans"] = spans
        # Rehash the newly created task so that hashes reflect added data
        task = set_hashes(task)
        yield task
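
A minimal sketch of wiring make_tasks into a custom Prodigy recipe. The recipe name, model, source path and label set below are hypothetical; it assumes a spaCy pipeline with an NER component and Prodigy's JSONL loader:

import copy

import prodigy
import spacy
from prodigy.components.loaders import JSONL
from prodigy.util import set_hashes


@prodigy.recipe("ner.make-tasks-demo")
def make_tasks_demo(dataset: str, source: str):
    nlp = spacy.load("en_core_web_sm")
    stream = JSONL(source)  # newline-delimited JSON with a "text" key per line
    stream = make_tasks(nlp, stream, labels=["PERSON", "ORG"])
    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "ner_manual",
        "config": {"labels": ["PERSON", "ORG"]},
    }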
Example #2
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    # Load the NLP and KB objects from file
    nlp = spacy.load(nlp_dir)
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(kb_loc)
    model = EntityRecognizer(nlp)

    # Read the pre-defined CSV file into dictionaries mapping QIDs to the full names and descriptions
    id_dict = dict()
    with entity_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            id_dict[row[0]] = (row[1], row[2])

    # Initialize the Prodigy stream by running the NER model
    stream = TXT(source)
    stream = [set_hashes(eg) for eg in stream]
    stream = (eg for score, eg in model(stream))

    # For each NER mention, add the candidates from the KB to the annotation task
    stream = _add_options(stream, kb, id_dict)
    stream = filter_duplicates(stream, by_input=True, by_task=False)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {
            "choice_auto_accept": True
        },
    }
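
The _add_options helper used above isn't shown. A hypothetical sketch of what it might look like, assuming the spaCy v2 KnowledgeBase API (kb.get_candidates) and the id_dict built from the CSV file:

def _add_options(stream, kb, id_dict):
    """Attach the KB candidates for each NER mention as multiple-choice options."""
    for task in stream:
        options = []
        for span in task.get("spans", []):
            # Look up candidate entities for the mention text
            for cand in kb.get_candidates(span["text"]):
                name, desc = id_dict[cand.entity_]
                options.append({"id": cand.entity_, "text": f"{name}: {desc}"})
        if options:
            task["options"] = options
            yield task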
Example #3
def ner_translate(
    in_sets: List[str],
    out_set: str,
    model_name_or_path: str,
    source_lang: str,
    target_lang: str,
    dry: bool = False,
) -> None:
    translator = TransformersMarianTranslator(
        model_name_or_path, source_lang=source_lang, target_lang=target_lang
    )

    DB = connect()
    for set_id in in_sets:
        if set_id not in DB:
            msg.fail(f"Can't find dataset '{set_id}' in database", exits=1)
    if out_set in DB and len(DB.get_dataset(out_set)):
        msg.fail(
            f"Output dataset '{out_set}' already exists and includes examples",
            f"This can lead to unexpected results. Please use a new dataset.",
            exits=1,
        )
    if out_set not in DB:
        if not dry:
            DB.add_dataset(out_set)
        msg.good(f"Created dataset '{out_set}'")

    matched_examples_t = []
    mismatched_examples_t = []

    for set_id in in_sets:
        msg.text(f"RECIPE: Translating and merging examples from '{set_id}'")
        raw_examples = DB.get_dataset(set_id)
        examples = [Example(**e) for e in raw_examples]
        examples_t = translate_ner_batch(
            examples, translate_f=translator.pipe, target_lang=target_lang
        )
        for e, e_t in zip(examples, examples_t):
            if len(e.spans) != len(e_t.spans):
                mismatched_examples_t.append(e_t)
            else:
                matched_examples_t.append(e_t)

        msg.text(f"RECIPE: Translated {len(matched_examples_t)} examples from '{set_id}'")
        msg.text(
            f"RECIPE: Found {len(mismatched_examples_t)} examples with mismatched spans after translation from '{set_id}'"
        )

    # Rehash each translated example so the hashes reflect the new text
    matched_examples_t = [set_hashes(eg.dict()) for eg in matched_examples_t]

    if not dry:
        DB.add_examples(matched_examples_t, datasets=[out_set])
    msg.good(
        f"Translated and merged {len(matched_examples_t)} examples from {len(in_sets)} datasets",
        f"Created translated and merged dataset '{out_set}'",
    )
Example #4
def pipe(source=None, api=None, loader=None, from_dataset=False, exclude=None):
    """
    Load examples from an input source, and print them as newline-delimited
    JSON. This makes it easy to filter the stream with command-line utilities
    such as `grep`. It's also often useful to inspect the stream, by piping to
    `less`.
    """
    DB = connect()
    if from_dataset:
        stream = DB.get_dataset(source)
    else:
        stream = get_stream(source, api, loader)
        stream = (set_hashes(eg) for eg in stream)
    if exclude:
        log("RECIPE: Excluding tasks from datasets: {}".format(
            ', '.join(exclude)))
        exclude_hashes = DB.get_input_hashes(*exclude)
        stream = filter_inputs(stream, exclude_hashes)
    try:
        for eg in stream:
            print(ujson.dumps(eg, escape_forward_slashes=False))
    except KeyboardInterrupt:
        pass
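
Each printed line is one annotation task as JSON, which is what makes filtering with grep or paging with less convenient. A hypothetical output line might look like:

{"text": "Uber's revenue was up in the fourth quarter", "_input_hash": 128374, "_task_hash": -993821}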
Example #5
def to_prodigy(
    examples: List[Example],
    prodigy_dataset: str,
    overwrite_dataset: bool = False,
    add_hash: bool = True,
) -> None:
    """Save a list of examples to Prodigy

    Args:
        examples (List[Example]): Input examples
        prodigy_dataset (str): Name of Prodigy dataset to save to
        overwrite_dataset (bool, optional): If True will overwrite all data in Prodigy
        add_hash (bool, optional): If True (default), set Prodigy's hashes on each example

    Raises:
        ValueError: If trying to save examples to an existing dataset without explicitly
            setting overwrite_dataset to True
    """
    from prodigy.core import connect
    from prodigy.util import set_hashes

    db = connect()

    if db.get_dataset(prodigy_dataset):
        if overwrite_dataset:
            db.drop_dataset(prodigy_dataset)
            db.add_dataset(prodigy_dataset)
        else:
            raise ValueError(f"Prodigy dataset {prodigy_dataset} already exists.")
    else:
        db.add_dataset(prodigy_dataset)

    examples = [e.dict() for e in examples]
    prodigy_examples = []
    for e in examples:
        prodigy_examples.append(set_hashes(e) if add_hash else e)

    db.add_examples(prodigy_examples, [prodigy_dataset])
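
A hypothetical usage sketch, assuming Example is the same Pydantic model used in the other snippets here (constructed with a text and an empty span list):

examples = [Example(text="Apple is opening its first big office in San Francisco.", spans=[])]
to_prodigy(examples, "imported_ner_data", overwrite_dataset=True)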
Example #6
def db_in(set_id,
          in_file,
          loader=None,
          answer='accept',
          overwrite=False,
          dry=False):
    """
    Import annotations to the database. Supports all formats loadable by
    Prodigy.
    """
    DB = connect()
    if not in_file.exists() or not in_file.is_file():
        prints("Not a valid input file.", in_file, exits=1, error=True)
    if set_id not in DB:
        prints("Can't find '{}' in database {}.".format(set_id, DB.db_name),
               "Maybe you misspelled the name or forgot to add the dataset "
               "using the `dataset` command?",
               exits=1,
               error=True)
    loader = get_loader(loader, file_path=in_file)
    annotations = loader(in_file)
    annotations = [set_hashes(eg) for eg in annotations]
    added_answers = 0
    for task in annotations:
        if 'answer' not in task or overwrite:
            task['answer'] = answer
            added_answers += 1
    session_id = get_timestamp_session_id()
    if not dry:
        DB.add_dataset(session_id, session=True)
        DB.add_examples(annotations, datasets=[set_id, session_id])
    prints(
        "Imported {} annotations for '{}' to database {}".format(
            len(annotations), set_id, DB.db_name),
        "Added '{}' answer to {} annotations".format(answer, added_answers),
        "Session ID: {}".format(session_id))
Example #7
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=100,
    batch_size=5,
    case_sensitive=False,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.

    If no similar terms are found above the given threshold, the threshold is
    lowered by 0.1 and similar terms are requested again.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1)
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        seen.add(best_word if case_sensitive else best_word.lower())
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0,
                "sense": best_sense
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept_keys = [
            eg["text"] for eg in prev if eg["answer"] == "accept"
        ]
        prev_words = [
            eg["word"] if case_sensitive else eg["word"].lower() for eg in prev
        ]
        accept_keys += prev_accept_keys
        seen.update(set(prev_words))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        nonlocal threshold
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            n_skipped = 0
            n_duplicate = 0
            for key, score in most_similar:
                if score > threshold:
                    word, sense = s2v.split_key(key)
                    if (case_sensitive
                            and word in seen) or (not case_sensitive
                                                  and word.lower() in seen):
                        n_duplicate += 1
                        continue
                    seen.add(word if case_sensitive else word.lower())
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score), "sense": sense}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }
                else:
                    n_skipped += 1
            if n_skipped:
                log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}"
                    )
            if n_skipped == len(most_similar) - n_duplicate:
                # No most similar phrases were found that are above the
                # threshold, so lower the threshold if it's not already 0 or
                # return empty list so Prodigy shows "no tasks available"
                new_threshold = threshold - 0.1
                if new_threshold <= 0.0:
                    log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                    return []
                log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}"
                    )
                threshold = new_threshold

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
Example #8
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=20,
    batch_size=5,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span><strong style='opacity: 0.75'>{{sense}}</strong>"
    accept_keys = []
    seen = set(accept_keys)
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            raise ValueError(f"Can't find seed term '{seed}' in vectors")
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        accept_keys += prev_accept
        seen.update(set(accept_keys))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            for key, score in most_similar:
                if key not in seen and score > threshold:
                    seen.add(key)
                    word, sense = s2v.split_key(key)
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score)}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
Example #9
def teach(dataset,
          vectors_path,
          seeds,
          threshold=0.85,
          top_n=200,
          batch_size=5,
          resume=False):
    """
    Bootstrap a terminology list sense2vec. Prodigy
    will suggest similar terms based on the the most similar
    phrases from sense2vec
    """
    SENSES = [
        "auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
        "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT",
        "WORK_OF_ART", "LANGUAGE"
    ]

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    LEMMATIZER = English().vocab.morphology.lemmatizer
    S2V = sense2vec.load(vectors_path)
    log("RECIPE: Finished loading sense2vec", locals())

    # Seems to be a bug in sense2vec which gets < n similar senses not <= n
    batch_size = min(batch_size, top_n * len(seeds))
    top_n = top_n + 1

    DB = connect()
    seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds]
    DB.add_examples(seed_tasks, datasets=[dataset])

    accept_phrases = seeds
    reject_phrases = []

    seen = set(accept_phrases)
    sensed = set()

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"]
        accept_phrases += prev_accept
        reject_phrases += prev_reject

        seen.update(set(accept_phrases))
        seen.update(set(reject_phrases))
        log("RECIPE: Resuming from {} previous examples in dataset {}".format(
            len(prev), dataset))

    def format_for_s2v(word, sense):
        return word.replace(" ", "_") + "|" + sense

    def get_best(word, sense):
        if sense != "auto":  # if sense is specified, find respective entry
            if format_for_s2v(word, sense) in S2V:
                return (word, sense)
            return (None, None)
        freqs = []
        casings = [word, word.upper(), word.title()
                   ] if word.islower() else [word]
        for text in casings:  # try options
            for tag in SENSES:
                query = format_for_s2v(text, tag)
                if query in S2V:
                    freqs.append((S2V[query][0], (text, tag)))
        return max(freqs)[1] if freqs else (None, None)

    def get_similar(word, sense, n=100):
        query = format_for_s2v(word, sense)
        if query not in S2V:
            return []
        freq, query_vector = S2V[query]
        words, scores = S2V.most_similar(query_vector, n)
        words = [word.rsplit("|", 1) for word in words]
        # Don't know why we'd be getting unsensed entries, but fix.
        words = [entry for entry in words if len(entry) == 2]
        words = [(word.replace("_", " "), sense) for word, sense in words]
        return zip(words, scores)

    def find_similar(word: str, sense: str = "auto", n_results: int = top_n):
        """Find similar terms for a given term and optional sense."""
        best_word, best_sense = get_best(word, sense)
        results = []
        if not word or not best_word:
            return results
        seen = set([best_word, min(LEMMATIZER(best_word, best_sense))])
        similar = get_similar(best_word, best_sense, n_results)
        for (word_entry, sense_entry), score in similar:
            head = min(LEMMATIZER(word_entry, sense_entry))
            if head not in seen and score > threshold:
                freq, _ = S2V[format_for_s2v(word_entry, sense_entry)]
                results.append((score, word_entry))
                seen.add(head)
            if len(results) >= n_results:
                break
        return results

    def update(answers):
        """Updates accept_phrases so that the stream can find new phrases"""
        for answer in answers:
            if answer['answer'] == 'accept':
                accept_phrases.append(answer['text'])
            elif answer['answer'] == 'reject':
                reject_phrases.append(answer['text'])

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and presenting
        examples to the user with a similarity above the threshold parameter"""
        while True:
            seen.update(set([rp.lower() for rp in reject_phrases]))
            for p in accept_phrases:
                if p.lower() not in sensed:
                    sensed.add(p.lower())
                    for score, phrase in find_similar(p):
                        if phrase.lower() not in seen:
                            seen.add(phrase.lower())
                            yield {"text": phrase, 'meta': {'score': score}}

    stream = get_stream()

    return {
        'view_id': 'text',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'config': {
            'batch_size': batch_size
        }
    }
Example #10
def phrases_teach(dataset, seeds, threshold=0.85, batch_size=5, resume=False):
    """
    Bootstrap a terminology list with word vectors and seeds terms. Prodigy
    will suggest similar terms based on the word vectors, and update the
    target vector accordingly.
    """

    DB = connect()
    seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds]
    DB.add_examples(seed_tasks, datasets=[dataset])

    accept_phrases = seeds
    reject_phrases = []

    seen = set(accept_phrases)
    sensed = set()

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"]
        accept_phrases += prev_accept
        reject_phrases += prev_reject

        seen.update(set(accept_phrases))
        seen.update(set(reject_phrases))

    def sense2vec(phrase, threshold):
        """Call sense2vec API to get similar "senses" (phrases)"""
        res = requests.post(API_URL, {
            "sense": "auto",
            "word": phrase
        })
        results = res.json()["results"]
        output = []
        for r in results:
            if r["score"] > threshold or len(output) <= 10:
                output.append((r["score"], r["text"]))

        return output

    def update(answers):
        """Updates accept_phrases so that the stream can find new phrases"""
        for answer in answers:
            if answer['answer'] == 'accept':
                accept_phrases.append(answer['text'])
            elif answer['answer'] == 'reject':
                reject_phrases.append(answer['text'])
    
    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and presenting
        examples to the user with a similarity above the threshold parameter"""
        while True:
            seen.update(set([rp.lower() for rp in reject_phrases]))
            for p in accept_phrases:
                if p.lower() not in sensed:
                    sensed.add(p.lower())
                    for score, phrase in sense2vec(p, threshold):
                        if phrase.lower() not in seen:
                            seen.add(phrase.lower())
                            yield score, {"text": phrase, 'meta': {'score': score}}

    stream = Probability(get_stream())

    return {
        'view_id': 'text',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'config': {
            "batch_size": batch_size
        }
    }


def terms_teach(dataset, vectors, seeds):
    """
    Bootstrap a terminology list with word vectors and seed terms. Prodigy
    will suggest similar terms based on the word vectors, and update the
    target vector accordingly.
    """
    # Connect to the database using the settings from prodigy.json and add the
    # seed terms to the dataset
    DB = connect()
    if dataset and dataset in DB:
        seed_tasks = [set_hashes({'text': s, 'answer': 'accept'}) for s in seeds]
        DB.add_examples(seed_tasks, datasets=[dataset])

    # Load the spaCy model with vectors
    nlp = spacy.load(vectors)

    # Create two Doc objects for the accepted and rejected terms
    accept_doc = Doc(nlp.vocab, words=seeds)
    reject_doc = Doc(nlp.vocab, words=[])
    score = 0

    def predict(term):
        """Score a term given the current accept_doc and reject_doc."""
        if len(accept_doc) == 0 and len(reject_doc) == 0:
            return 0.5
        # Use spaCy's .similarity() method to compare the term to the
        # accepted and rejected Doc
        accept_score = max(term.similarity(accept_doc), 0.0)
        reject_score = max(term.similarity(reject_doc), 0.0)
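        # The extra 0.2 in the denominator acts as smoothing: it keeps the
        # score below 1.0 and pushes low-similarity terms towards 0, e.g.
        # accept_score 0.8 and reject_score 0.2 give 0.8 / (0.8 + 0.2 + 0.2) = 0.67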
        score = accept_score / (accept_score + reject_score + 0.2)
        return max(score, 0.0)

    def update(answers):
        # Called whenever Prodigy receives new annotations
        nonlocal accept_doc, reject_doc, score
        accept_words = [t.text for t in accept_doc]
        reject_words = [t.text for t in reject_doc]
        for answer in answers:
            # Increase or decrease score depending on answer and update
            # list of accepted and rejected terms
            if answer['answer'] == 'accept':
                score += 1
                accept_words.append(answer['text'])
            elif answer['answer'] == 'reject':
                score -= 1
                reject_words.append(answer['text'])
        # Update the target documents in place
        accept_doc = Doc(nlp.vocab, words=accept_words)
        reject_doc = Doc(nlp.vocab, words=reject_words)

    def score_stream(stream):
        # Get all lexemes in the vocab and score them
        lexemes = [lex for lex in stream if lex.is_alpha and lex.is_lower]
        while True:
            seen = set(w.orth for w in accept_doc)
            seen.update(set(w.orth for w in reject_doc))
            lexemes = [w for w in lexemes if w.orth not in seen]
            by_score = [(predict(lex), lex) for lex in lexemes]
            by_score.sort(reverse=True)
            for _, term in by_score:
                score = predict(term)
                # Return (score, example) tuples for the scored terms
                yield score, {'text': term.text, 'meta': {'score': score}}

    # Sort the scored vocab by probability and return examples
    stream = Probability(score_stream(nlp.vocab))

    return {
        'view_id': 'text',          # Annotation interface to use
        'dataset': dataset,         # Name of dataset to save annotations
        'stream': stream,           # Incoming stream of examples
        'update': update,           # Update callback, called with answers
    }