Example #1
def ner_manual(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Mark spans manually by token. Requires only a tokenizer and no entity
    recognizer, and doesn't do any active learning.
    """
    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
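On its own, the function above is just a factory returning components; to use it, it has to be registered as a Prodigy recipe and started from the command line. A minimal sketch, assuming the file is saved as recipe.py; the recipe name ner.manual-custom is a placeholder, not something defined in the example:

import prodigy
from prodigy.util import split_string

@prodigy.recipe(
    "ner.manual-custom",  # placeholder name, assumed for this sketch
    dataset=("Dataset to save annotations to", "positional", None, str),
    spacy_model=("Loadable spaCy model for tokenization", "positional", None, str),
    source=("Path to the JSONL source file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    exclude=("Comma-separated datasets to exclude", "option", "e", split_string),
)
def ner_manual(dataset, spacy_model, source, label=None, exclude=None):
    ...  # body as in Example #1

# started from the shell, e.g.:
# prodigy ner.manual-custom my_dataset en_core_web_sm ./news.jsonl --label PERSON,ORG -F recipe.py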
Example #2
def image_caption_text_align(dataset: str, sourcefile: str):
    """Stream in images and corresponding text.
    """
    nlp = spacy.load("de_core_news_sm")

    stream = JSONL(sourcefile)
    stream = fetch_images(stream)
    stream = add_tokens(nlp, stream)

    blocks = [
        {"view_id": "image", "spans": []},
        {
            "view_id": "text_input",
            "field_id": "caption",
            "field_rows": 4,
            "field_autofocus": True,
        },
        {"view_id": "ner_manual"},
    ]
    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "blocks",
        "config": {"blocks": blocks,
                   "lang": nlp.lang,
                   "labels": ["current image"]
                   }
    }
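This recipe expects each input line to carry both the image and the text: fetch_images reads the "image" key (a URL or local path) and replaces it with the fetched image data, while add_tokens and the ner_manual block operate on the "text" key, and the typed caption is stored under "caption". A made-up input line for illustration:

{"image": "https://example.com/figure1.png", "text": "Two mitochondria in a dividing cell."}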
Example #3
def ner_make_gold(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Create gold-standard data by correcting a model's predictions manually.
    """
    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    # Add the entities predicted by the model to the tasks in the stream
    stream = make_tasks(nlp, stream, label)

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
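make_tasks is imported from the recipe's helper module and not shown here. A minimal sketch of what such a helper typically does, assuming the standard Prodigy span format; copy and set_hashes are the only extra imports:

import copy
from prodigy import set_hashes

def make_tasks(nlp, stream, labels):
    """Add a "spans" key to each example, holding the model's predicted entities."""
    texts = ((eg["text"], eg) for eg in stream)
    for doc, eg in nlp.pipe(texts, as_tuples=True):
        task = copy.deepcopy(eg)
        spans = []
        for ent in doc.ents:
            # Skip entity types that aren't among the selectable labels
            if labels and ent.label_ not in labels:
                continue
            spans.append({
                "token_start": ent.start,
                "token_end": ent.end - 1,
                "start": ent.start_char,
                "end": ent.end_char,
                "text": ent.text,
                "label": ent.label_,
            })
        task["spans"] = spans
        # Re-hash so Prodigy treats the enriched task as distinct from the raw input
        task = set_hashes(task)
        yield task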
Example #4
def ner_make_gold(dataset, spacy_model, source, label=None, exclude=None):
    """
    Create gold-standard data by correcting a model's predictions manually.
    """
    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    # Add the entities predicted by the model to the tasks in the stream
    stream = make_tasks(nlp, stream, label)

    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'exclude': exclude,  # List of dataset names to exclude
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'label': ', '.join(label) if label is not None else 'all',
            'labels': label  # Selectable label options
        }
    }
Example #5
def ner_manual(dataset, spacy_model, source, label=None, exclude=None):
    """
    Mark spans manually by token. Requires only a tokenizer and no entity
    recognizer, and doesn't do any active learning.
    """
    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'exclude': exclude,  # List of dataset names to exclude
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'label': ', '.join(label) if label is not None else 'all',
            'labels': label  # Selectable label options
        }
    }
Example #6
def ref_tagging_recipe(
    dataset,
    input_collection,
    output_collection,
    model_dir,
    labels,
    view_id="text",
    db_host="localhost",
    db_port=27017,
    dir='rtl',
    lang='he',
    train_on_input=1,
):
    my_db = MongoProdigyDBManager(output_collection, db_host, db_port)
    labels = labels.split(',')
    nlp, model_exists = load_model(model_dir, labels, lang)
    if not model_exists and train_on_input == 1:
        temp_stream = getattr(my_db.db, input_collection).find({}, {"_id": 0})
        train_model(nlp, temp_stream, model_dir)
    all_data = list(getattr(my_db.db, input_collection).find({}, {"_id": 0}))  # TODO loading all data into ram to avoid issues of cursor timing out
    stream = filter_existing_refs(all_data, my_db)
    # stream = split_sentences(nlp, all_data, min_length=200)
    stream = add_model_predictions(nlp, stream)
    stream = add_tokens(nlp, stream, skip=True)
    if view_id == "ner":
        stream = split_spans(stream)


    def update(annotations):
        prev_annotations = my_db.db.examples.find({}, {"_id": 0}).limit(1000).sort([("_id", -1)])
        all_annotations = list(prev_annotations) + list(annotations)
        losses = train_model(nlp, all_annotations, model_dir)
        return losses.get('ner', None)

    def progress(ctrl, update_return_value):
        return update_return_value
        #return ctrl.session_annotated / getattr(my_db.db, input_collection).count_documents({})

    return {
        "db": my_db,
        "dataset": dataset,
        "view_id": view_id,
        "stream": stream,
        "progress": progress,
        # "update": update,
        "config": {
            "labels": labels,
            "global_css": f"""
                [data-prodigy-view-id='{view_id}'] .prodigy-content {{
                    direction: {dir};
                    text-align: {'right' if dir == 'rtl' else 'left'};
                }}
            """,
            "javascript": """
            function scrollToFirstAnnotation() {
                var scrollableEl = document.getElementsByClassName('prodigy-annotator')[0];
                var markEl = document.getElementsByTagName('mark')[0];
                scrollableEl.scrollTop = markEl.offsetTop;
            }
            document.addEventListener('prodigymount', function(event) {
                scrollToFirstAnnotation();
            })
            document.addEventListener('prodigyanswer', function(event) {
                scrollToFirstAnnotation();
            })
            """
        }
    }
Example #7
def ask_questions(stream):
    for eg in stream.examples:
        eg['time_loaded'] = datetime.now().isoformat()
        eg['mongo_collection'] = api  # record where it came from
        # the Mongo _id is not JSON-serializable
        eg['_id'] = str(eg['_id'])
        # add tokens. add_tokens expects a list...
        ts = add_tokens(nlp, [eg])
        # ...and returns a generator
        eg = next(ts)
        yield eg
Example #8
def custom_recipe(dataset, jsonl_file):
    # Load a spaCy model for tokenization; en_core_web_sm is a placeholder here,
    # any model that provides a tokenizer works (the original snippet left nlp undefined)
    nlp = spacy.load("en_core_web_sm")

    stream = JSONL(jsonl_file)
    stream = get_stream(stream)
    stream = add_tokens(nlp, stream)
    blocks = [{"view_id": "html"}, {"view_id": "ner_manual"}]
    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "blocks",
        "config": {
            "labels": ["LABEL1", "LABEL2", "LABEL3"],
            "blocks": blocks
        }
    }
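Because the "html" block has no html_template, each task has to bring its own "html" key in addition to the "text" that the ner_manual block tokenizes and highlights. A made-up line from jsonl_file might look like:

{"text": "Invoice issued to ACME Corp on 2021-03-01.", "html": "<strong>Scanned invoice, page 1 of 2</strong>"}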
Example #9
def ner_silver_to_gold(
    silver_dataset: str,
    gold_dataset: str,
    spacy_model: str,
    label: Optional[List[str]] = None,
):
    """
    Take an existing "silver" dataset with binary accept/reject annotations,
    merge the annotations to find the best possible analysis given the
    constraints defined in the annotations, and manually edit it to create
    a perfect and complete "gold" dataset.
    """
    # Connect to the database using the settings from prodigy.json, check
    # that the silver dataset exists and load it
    DB = connect()
    if silver_dataset not in DB:
        raise ValueError("Can't find dataset '{}'.".format(silver_dataset))
    silver_data = DB.get_dataset(silver_dataset)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)
    if label is None:
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        ner = nlp.get_pipe("ner")
        label = sorted(ner.labels)

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    # Merge all annotations and find the best possible analyses
    stream = model.make_best(silver_data)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": gold_dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
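Assuming the recipe above is registered under a name like ner.silver-to-gold and saved in ner_silver_to_gold.py (both names are assumptions; the decorator is not shown), it would be started along these lines:

prodigy ner.silver-to-gold ner_news_silver ner_news_gold en_core_web_sm -F ner_silver_to_gold.py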
Example #10
def ner_silver_to_gold(silver_dataset, gold_dataset, spacy_model, label=[]):
    """
    Take an existing "silver" dataset with binary accept/reject annotations,
    merge the annotations to find the best possible analysis given the
    constraints defined in the annotations, and manually edit it to create
    a perfect and complete "gold" dataset.
    """
    # Connect to the database using the settings from prodigy.json, check
    # that the silver dataset exists and load it
    DB = connect()
    if silver_dataset not in DB:
        raise ValueError("Can't find dataset '{}'.".format(silver_dataset))
    silver_data = DB.get_dataset(silver_dataset)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)
    if not label:
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        ner = nlp.get_pipe('ner')
        moves = ner.move_names
        label = [
            move.split('-')[1] for move in moves
            if move[0] in ('B', 'I', 'L', 'U')
        ]
        label = sorted(set(label))

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    # Merge all annotations and find the best possible analyses
    stream = model.make_best(silver_data)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': gold_dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'labels': label  # Selectable label options
        }
    }
Example #11
def qa(dataset, spacy_model, source, label="answer_span"):
    # load the source dataset, made of samples containing question and text pairs
    stream = JSONL(source)
    # load a spaCy model
    nlp = spacy.load(spacy_model)
    # and use it to tokenize the text
    stream = add_tokens(nlp, stream)

    return {
        "view_id": "ner_manual",
        "dataset": dataset,
        "stream": stream,
        "config": {
            "lang": nlp.lang,
            "label": label,
            "labels": label
        },
    }
Example #12
def facts_annotation(language: str):
  # define labels for annotation
  labels = ["Supports judgment", "Opposes judgment", "Lower court"]

  if language != "test":
    # Load the spaCy model for tokenization.
    nlp = spacy.load("{}_core_news_sm".format(language))
    stream = JSONL("./datasets/annotation_input_set_{}.jsonl".format(language))
  else:
    nlp = spacy.load("de_core_news_sm")
    stream = JSONL("./datasets/annotation_input_set_de_ex.jsonl")

  dataset = "annotations_{}".format(language)
  port = ports[language]


  # Tokenize the incoming examples and add a "tokens" property to each
  # example. Also handles pre-defined selected spans. Tokenization allows
  # faster highlighting, because the selection can "snap" to token boundaries.
  # If `use_chars` is True, tokens are split into individual characters, which enables
  # character based selection as opposed to default token based selection.
  stream = add_tokens(nlp, stream, use_chars=None)
  return {
    "dataset": dataset,  # Name of dataset to save annotations
    "view_id": "blocks",
    "stream": stream,
    "config": {
      "port": port,
      "blocks": [
        {"view_id": "html",
         "html_template": "<p style='float:left'>{{file_number}}</p>"},
        {"view_id": "html",
         "html_template": "<h1 style='float:left'>{{header}} – Judgment: {{judgment}}</h1>"},
        {"view_id": "html",
         "html_template": "<h2 style='float:left'>Facts</h2><a style='float:right' href='{{link}}' target='_blank'>Go to the court ruling</a>"},
        {"view_id": "spans_manual", "lang": nlp.lang, "labels": labels},
        {"view_id": "text_input", "field_label": "Annotator comment on this ruling", "field_placeholder": "Type here...", "field_rows": 5},
      ],
    },
  }
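The html_template blocks above read {{file_number}}, {{header}}, {{judgment}} and {{link}} directly from each task, so the input JSONL needs those keys next to the "text" holding the facts section. A made-up line for illustration:

{"text": "Sachverhalt: A. ...", "file_number": "XX_123/2020", "header": "Urteil vom 1. Januar 2020", "judgment": "approval", "link": "https://example.org/rulings/XX_123_2020"}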
Example #13
def COVIDKeywordsAnnotation(
    dataset_name: Optional[str] = 'entries',
    dataset_file: Optional[str] = None,
    spacy_model: Optional[str] = 'en_core_web_sm',
    dataset_exclude: Optional[List[str]] = None,
):
    """
    keywords annotation recipe

    :param dataset_name:
    :param dataset_file:
    :param spacy_model:
    :param dataset_exclude:
    :return:
    """

    # TEXT_STREAM_PIPELINE is the global variable that holds all text processors;
    # that way, other functions outside this recipe can reuse the same processing
    # pipeline for the same task
    global TEXT_STREAM_PIPELINE
    # MONGO_COL_NAME is the global variable recording which Mongo collection data
    # is loaded from when loading a paper by DOI
    global MONGO_COL_NAME

    # change the global variable MONGO_COL_NAME for later use when loading a paper by DOI
    MONGO_COL_NAME = dataset_name

    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)

    # get a text stream, which is a generator of [{'text': '', ...}]
    if dataset_file is None:
        if dataset_name in db.collection_names():
            stream = db_endless_sampling(dataset_name)
        else:
            raise ValueError(
                'Loading from database because dataset_file is not specified! '
                'However, collection {} does not exist!'.format(dataset_name))
    else:
        # Load the stream from a JSONL file and return a generator that yields a
        # dictionary for each example in the data.
        stream = JSONL(dataset_file)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    # stream = add_tokens(nlp, stream)
    TEXT_STREAM_PIPELINE.append(lambda x: add_tokens(nlp, x))

    # add keywords extraction to pipeline
    kw_extractor_1 = keywords_extraction.KeywordsExtractorRaKUn(
        name='RaKUn_0',
        distance_threshold=2,
        pair_diff_length=2,
        bigram_count_threhold=2,
        num_tokens=[1, 2, 3],
        max_similar=10,
        max_occurrence=3,
        score_threshold=None,
        use_longest_phrase=True,
        ignore_shorter_keywords=False,
    )
    # use_longest_phrase = True,
    TEXT_STREAM_PIPELINE.append(lambda x: stream_add_keywords_ML(
        x,
        kw_extractors=[
            kw_extractor_1,
        ],
        add_keywords_in_db=True,
    ))

    with open('keywords_annotation.html') as txt:
        template_text = txt.read()
    with open('keywords_annotation.js') as txt:
        script_text = txt.read()
    with open('custom_style.css') as txt:
        css_text = txt.read()

    # activate tasks
    TASK_DESCs = {
        'ner': 'highlight named entities',
        'textcat': 'select text categories',
        'summary': 'add text summary',
        'note': 'add text notes',
    }
    AVAILABLE_TASKS = set(TASK_DESCs.keys())
    all_task_blocks = []

    # add title blocks
    all_task_blocks.extend(get_paper_title_blocks())

    # add task desc blocks
    all_task_blocks.extend(
        get_task_desc_blocks([
            'mark whatever you think are keywords',
        ]))

    # add keywords ner blocks
    all_task_blocks.extend(get_ner_blocks(labels=['KEYWORD']))
    all_task_blocks.extend([
        {
            'view_id': 'html',
            'html_template': template_text,
        },
    ])

    # apply stream pipeline on text stream
    for stream_fun in TEXT_STREAM_PIPELINE:
        stream = stream_fun(stream)

    return {
        "view_id": "blocks",  # Annotation interface to use
        "dataset": dataset_name,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": dataset_exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            'blocks': all_task_blocks,
            "lang": nlp.lang,
            'javascript': script_text,     # custom js
            'global_css': None,     # custom css
            'instant_submit': True,
        },
    }
Example #14
def COVIDBase(
    task_type: Optional[List[str]] = ['NER', 'TextCat', 'Summary', 'Note'],
    dataset_name: Optional[str] = 'entries',
    dataset_file: Optional[str] = None,
    ner_label: Optional[List[str]] = None,
    textcat_title: Optional[str] = None,
    textcat_label: Optional[List[str]] = None,
    disable_multiple_choice: bool = False,
    spacy_model: Optional[str] = 'en_core_web_sm',
    dataset_exclude: Optional[List[str]] = None,
):
    """
    The base recipe provides most common annotation tasks for COVID study.
    Besides, you can build more sophisticated recipe based on this and
    save some time building wheels.
    Currently, there are four default tasks you can choose among.
    You can specify `task-type` to select.
    The default value is `-task-type ner, textcat, summary, note`.
    1. NER (Named-Entity Recognition): mark the words or phrase in text with different labels.
        You can define the labels with `ner-label. E.g. `-ner-label vaccine,disease`
    2. TextCat (Text categorization): choose the categories the paragraph falls in.
        Multiple choice is enabled by default.
        You can define the labels with `textcat_label`. E.g. `-textcat-lebel mechanism,diagnostics`.
        The default value is eleven classes defined by our Expert, Kevin.
    3. Summary: Summary the paragraph with more consice sentences.
    4. Note: Add any note you like, such as summary, important points, critical parameters, etc.

    :param task_type:
    :param dataset_name:
    :param dataset_file:
    :param ner_label:
    :param textcat_title:
    :param textcat_label:
    :param disable_multiple_choice:
    :param spacy_model:
    :param dataset_exclude:
    :return:
    """

    # TEXT_STREAM_PIPELINE is the global variable that holds all text processors;
    # that way, other functions outside this recipe can reuse the same processing
    # pipeline for the same task
    global TEXT_STREAM_PIPELINE
    # MONGO_COL_NAME is the global variable recording which Mongo collection data
    # is loaded from when loading a paper by DOI
    global MONGO_COL_NAME

    # change the global variable MONGO_COL_NAME for later use when loading a paper by DOI
    MONGO_COL_NAME = dataset_name

    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)

    # get a text stream, which is a generator of [{'text': '', ...}]
    if dataset_file is None:
        if dataset_name in db.collection_names():
            stream = db_endless_sampling(dataset_name)
        else:
            raise ValueError(
                'Loading from database because dataset_file is not specified! '
                'However, collection {} does not exist!'.format(dataset_name))
    else:
        # Load the stream from a JSONL file and return a generator that yields a
        # dictionary for each example in the data.
        stream = JSONL(dataset_file)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    # stream = add_tokens(nlp, stream)
    TEXT_STREAM_PIPELINE.append(lambda x: add_tokens(nlp, x))

    # custom html, javascript, css
    # If you want to design your own html, javascript, and css,
    # load them from files as follows and pass them to the corresponding interface.
    # In terms of html, you can create a block with `view_id` of `html`
    # and pass the `template_text` to `html_template`.
    # Then, add this block to all_task_blocks.
    # E.g:
    # all_task_blocks.append({
    #     'view_id': 'html',
    #     'html_template': template_text,
    # })
    # In terms of javascript, pass `script_text` to `javascript` when returning.
    # If you are familiar with React, you can modify bundle.js as a more flexible way.
    # E.g.:
    # return {
    #     ...
    #     "config": {  # Additional config settings, mostly for app UI
    #         ...
    #         'javascript': script_text,  # custom js
    #         ...
    #     },
    #     ...
    # }
    # In terms of css, it is a little bit tricky. There are two ways.
    # (1) add the css in index.html.
    # E.g.:
    # <head>
    #     ...
    #     <style>
    #         .prodigy-content {
    #             text-align: justify !important;
    #         }
    #     </style>
    #     ...
    # </head>
    # (2) Pass css_text to `global_css` when returning. However, this is slower than method (1).
    # E.g.:
    # return {
    #     ...
    #     "config": {  # Additional config settings, mostly for app UI
    #         ...
    #         'global_css': css_text,  # custom css
    #         ...
    #     },
    #     ...
    # }
    # with open('keywords_annotation.html') as txt:
    #     template_text = txt.read()
    # with open('keywords_annotation.js') as txt:
    #     script_text = txt.read()
    # with open('keywords_annotation.css') as txt:
    #     css_text = txt.read()

    # activate tasks
    TASK_DESCs = {
        'ner': 'highlight named entities',
        'textcat': 'select text categories',
        'summary': 'add text summary',
        'note': 'add text notes',
    }
    AVAILABLE_TASKS = set(TASK_DESCs.keys())
    task_type = [x.lower() for x in task_type]
    all_task_blocks = []
    text_showed = False
    if len(set(task_type) - AVAILABLE_TASKS) > 0:
        raise ValueError(
            'task_type {} not enabled. Available task types: {}'.format(
                task_type, AVAILABLE_TASKS))

    # add title blocks
    all_task_blocks.extend(get_paper_title_blocks())

    # add task desc blocks
    all_task_blocks.extend(
        get_task_desc_blocks([TASK_DESCs[t] for t in task_type]))

    if 'ner' in task_type:
        # add ner blocks
        all_task_blocks.extend(get_ner_blocks(labels=ner_label))
        text_showed = True
        textcat_title = None

    if 'textcat' in task_type:
        # add text categorization blocks
        TEXT_STREAM_PIPELINE.append(
            lambda x: stream_add_options(x, labels=textcat_label))
        all_task_blocks.extend(
            get_textcat_blocks(title=textcat_title, w_text=(not text_showed)))
        text_showed = True

    if 'summary' in task_type:
        # add summary blocks
        all_task_blocks.extend(get_summary_blocks(w_text=(not text_showed)))
        text_showed = True

    if 'note' in task_type:
        # add note blocks
        all_task_blocks.extend(get_note_blocks(w_text=(not text_showed)))
        text_showed = True

    # apply stream pipeline on text stream
    for stream_fun in TEXT_STREAM_PIPELINE:
        stream = stream_fun(stream)

    return {
        "view_id": "blocks",  # Annotation interface to use
        "dataset": dataset_name,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": dataset_exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            'blocks': all_task_blocks,
            "lang": nlp.lang,
            'javascript': None,  # custom js
            'global_css': None,  # custom css
            'instant_submit': True,
            'choice_style': 'single' if disable_multiple_choice else 'multiple',
        },
    }