Example #1
def test_gold_biluo_U(en_vocab):
    words = ["I", "flew", "to", "London", "."]
    spaces = [True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to London"), "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "U-LOC", "O"]
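The test above exercises the single-token case: an entity that lines up exactly with one token gets a U- (unit) tag. Below is a minimal standalone sketch of the same call, assuming spaCy v2.x, where the helper lives in spacy.gold (in spaCy v3 it was renamed to spacy.training.offsets_to_biluo_tags):

import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")  # a tokenizer-only pipeline is enough
doc = nlp("I flew to London.")
# (start_char, end_char, label) offsets for "London"
tags = biluo_tags_from_offsets(doc, [(10, 16, "LOC")])
print(tags)  # ['O', 'O', 'O', 'U-LOC', 'O']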
Example #2
def test_gold_biluo_misalign(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley."]
    spaces = [True, True, True, True, True, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "-", "-", "-"]
Example #3
def output_gold(nlp, testing_data):
    out = []

    for raw_text, entity_offsets in testing_data:
        doc = nlp.tokenizer(raw_text)
        gold = biluo_tags_from_offsets(doc, entity_offsets)
        out.append((doc, gold))
    return out
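A hedged usage sketch for output_gold, reusing the imports shown in the surrounding examples; the (text, offsets) pair below is purely illustrative:

nlp = spacy.blank("en")
testing_data = [("I flew to London", [(10, 16, "LOC")])]
doc, gold = output_gold(nlp, testing_data)[0]
print(gold)  # ['O', 'O', 'O', 'U-LOC']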
Example #4
def test_gold_biluo_BIL(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"),
                 "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
Example #5
def test_gold_biluo_misalign(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley."]
    spaces = [True, True, True, True, True, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    with pytest.warns(UserWarning):
        tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "-", "-", "-"]
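When an entity boundary falls inside a token, the affected tokens come back as "-"; Example #5 differs from Example #2 only in expecting the UserWarning that newer spaCy 2.x versions emit for such misaligned entities. A small sketch of one common way to cope with this, assuming spaCy v2.x; several of the later examples map "-" to "O" in the same way:

import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("I flew to SanFrancisco")            # "SanFrancisco" stays a single token
tags = biluo_tags_from_offsets(doc, [(10, 13, "LOC")])  # offsets cover only "San"
print(tags)                                    # ['O', 'O', 'O', '-']
cleaned = ["O" if t == "-" else t for t in tags]
print(cleaned)                                 # ['O', 'O', 'O', 'O']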
Example #6
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    text = "I flew to Silicon Valley via London."
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
    assert biluo_tags_converted == biluo_tags
    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
    assert offsets_converted == offsets
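The tag sequence can also be turned back into Span objects and attached to the Doc. A related sketch, assuming spaCy v2.x, using spans_from_biluo_tags the way Examples #19 and #23 do:

import spacy
from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags

nlp = spacy.blank("en")
doc = nlp("I flew to Silicon Valley via London")
tags = biluo_tags_from_offsets(doc, [(10, 24, "LOC"), (29, 35, "GPE")])
doc.ents = spans_from_biluo_tags(doc, tags)
print([(ent.text, ent.label_) for ent in doc.ents])
# [('Silicon Valley', 'LOC'), ('London', 'GPE')]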
Example #7
def test_gold_biluo_overlap(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"),
                 "LOC"),
                (len("I flew to "), len("I flew to San Francisco"), "LOC")]
    with pytest.raises(ValueError):
        tags = biluo_tags_from_offsets(doc, entities)
Example #9
    def prepare_sent(self, sent):
        sent = json.loads(sent)
        text, annotations = sent

        doc = self.nlp(text)
        ents = annotations['entities']
        repl = annotations['replacements']
        decls = filter_declarations(ents, get_declarations(text))

        tokens = [t.text for t in doc]
        ents_tags = biluo_tags_from_offsets(doc, ents)
        repl_tags = biluo_tags_from_offsets(doc, repl)
        decls = declarations_to_tags(doc, decls)

        fix_incorrect_tags(ents_tags)
        fix_incorrect_tags(repl_tags)

        assert len(tokens) == len(ents_tags) == len(repl_tags)

        repl_tags = [try_int(t.split("-")[-1]) for t in repl_tags]

        return tokens, ents_tags, repl_tags, decls
Example #10
def convert():
    '''
    Gathers the data and formats it using BILUO, then writes it in CoNLL format to a file.
    '''
    start = time.time()
    print('Loading spaCy...')
    nlp = spacy.load('en')
    end = time.time()
    print(end - start)
    print('Loading examples...')
    start = time.time()
    data = []
    examples = all_examples()
    i = 0
    last = 0
    count = len(examples)
    end = time.time()
    print(end - start)
    start = time.time()
    print('Converting', count, 'examples...')
    print('0% converted...')
    for example in examples:
        doc = nlp(example[0])
        data.append([[t.text for t in doc],
                     biluo_tags_from_offsets(doc, example[1]['entities'])])
        i += 1
        percent = int(i / count * 100)
        if percent != last:
            last = percent
            print(str(percent) + '% converted...')
    end = time.time()
    print(end - start)

    i = 0
    last = 0
    count = len(data)
    print('Saving Examples to CONLL...')
    print('0% written...')
    with open('models/train_data.conll', 'w') as f:
        for doc in [data]:
            for sentence, sent_entities in doc:
                f.write('-DOCSTART- -X- O O\n')
                i += 1
                percent = int(i / count * 100)
                if percent != last:
                    last = percent
                    print(percent, '% written...')
                for token, BIO_tag in zip(sentence, sent_entities):
                    f.write('{} -X- _ {}\n'.format(token, BIO_tag))
                f.write('\n')
    print('Export to CONLL Format Completed.')
Example #11
def process_pair(pair, dataset_dir, label_dict):

    """
    Inputs:
    pair: (___.txt, ___.spans) tuple containing the filenames for each example.
    dataset_dir: str: which dataset directory the files live in.

    Outputs:
    formatted_lines: list of strings containing the processed and formatted tokens and
    their corresponding labels.
    """

    pair_paths = os.path.join(dataset_dir, pair[0]), os.path.join(dataset_dir, pair[1])
    txt, spans = open_file(pair_paths[0]), open_file(pair_paths[1], form="lines")

    # Extract the tag type, index, end index (index + length), and entity
    span_lists = [l.split() for l in spans]
    span_tups = [(int(i[2]), int(i[2]) + int(i[3]), i[1]) for i in span_lists]

    # Convert the text to a spacy Doc (for compatibility with `biluo_tags_from_offsets`)
    nlp = spacy.load("xx_ent_wiki_sm")
    doc = nlp(txt, disable=["ner"])

    # Create the token-label pairs using `biluo_tags_from_offsets`
    tokens_biluo = list(zip(doc.doc, biluo_tags_from_offsets(doc, span_tups)))

    # Remove label prefixes and standardize label names (see LABEL_DICT at top)
    # `tokens_biluo` is a list of tuples, and tuples are immutable, so we need
    # to use a workaround
    tokens_biluo_temp = []
    for tup in tokens_biluo:
        if tup[1] != "O" and tup[1][2:] != "":
            new_lab = label_dict[tup[1][2:]]  # [0:2] tag prefix; [2:] tag body
            tokens_biluo_temp.append((tup[0], new_lab))
        else:
            tokens_biluo_temp.append((tup[0], tup[1]))

    # spaCy's tokenization preserves whitespace-only tokens, and these cause
    # problems for the BERT model, so we replace them with standard newlines
    tokens_biluo = [
        tup if str(tup[0]).strip() != "" else "\n" for tup in tokens_biluo_temp
    ]

    # Format lines for writing out
    formatted_lines = ["\t".join(str(s) for s in tup) + "\n" for tup in tokens_biluo]
    for i, line in enumerate(formatted_lines):
        if line == ".\tO\n":  # Insert newlines after periods
            formatted_lines.insert(i + 1, "\n\n")

    return formatted_lines
Example #12
def convert_bilou_with_missing_action(doc, offsets: list) -> list:
    """
    Convert tokens of unknown type to the missing value for NER,
    so that no loss will be applied to these tokens.
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param doc: text tokenized by Spacy
    :param offsets: original offsets
    :return: list of BILOU types
    """
    result1 = biluo_tags_from_offsets(doc, offsets)
    return [
        no_action_bilou if unknown_type_name in action_bilou else action_bilou
        for action_bilou in result1
    ]
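A self-contained sketch of the idea described in the docstring above, assuming spaCy v2.x; the concrete values of unknown_type_name and no_action_bilou live elsewhere in the original project, so the values below are assumptions for illustration only:

import spacy
from spacy.gold import biluo_tags_from_offsets

unknown_type_name = "UNKNOWN"   # assumed: label used for entities of unknown type
no_action_bilou = None          # assumed: marker meaning "missing label, skip the loss"

nlp = spacy.blank("en")
doc = nlp("I flew to London yesterday")
offsets = [(10, 16, "LOC"), (17, 26, "UNKNOWN")]
tags = biluo_tags_from_offsets(doc, offsets)
# ['O', 'O', 'O', 'U-LOC', 'U-UNKNOWN']
masked = [no_action_bilou if unknown_type_name in t else t for t in tags]
print(masked)  # ['O', 'O', 'O', 'U-LOC', None]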
Example #13
    def spacy_tok_ner(sent):
        doc = nlp(sent)
        j = doc.to_json()

        ranges = [(a["start"], a["end"]) for a in j["tokens"]]
        ents = j["ents"]

        tokens = []
        for range in ranges:
            tokens.append(sent[range[0]:range[1]])  # noqa

        entlocs = [(a["start"], a["end"], a["label"]) for a in ents]
        labels = biluo_tags_from_offsets(doc, entlocs)

        return tokens, labels
Example #14
def docs_from_offsets(nlp, gold):
    """Create a sequence of Docs from a sequence of text, entity-offsets pairs."""
    docs = []
    for text, entities in gold:
        doc = nlp(text)
        entities = entities['entities']
        tags = biluo_tags_from_offsets(doc, entities)
        if entities:
            for start, end, label in entities:
                span = doc.char_span(start, end, label=label)
                if span:
                    doc.ents = list(doc.ents) + [span]
        if doc.ents:  # remove to return documents without entities too
            docs.append((doc, tags))
    return docs
Example #15
def process_pair_ST(prefix):

    """
    Similar to process_pair for the factRuEval data, but with tweaks for
    the Shared Task 2019 data.
    """

    raw_path = f"../data/ru/shared_task_2019/raw/{prefix}.txt"
    ann_path = f"../data/ru/shared_task_2019/annotated/{prefix}.out"

    raw, objs = prep_st_data(raw_path, ann_path)
    ents = find_exact_matches(raw, objs)

    # Convert the text to a spacy Doc (for compatibility with `biluo_tags_from_offsets`)
    nlp = spacy.load("xx_ent_wiki_sm")
    doc = nlp(raw, disable=["ner"])

    # Create the token-label pairs using `biluo_tags_from_offsets`
    tokens_biluo = list(zip(doc.doc, biluo_tags_from_offsets(doc, ents)))

    # Remove prefixes ("B-", "I-", etc.) from labels
    # `tokens_biluo` is a list of tuples, and tuples are immutable, so we need
    # to use a workaround
    tokens_biluo_temp = []
    for tup in tokens_biluo:
        if tup[1] != "O":
            new_lab = tup[1][2:]
            tokens_biluo_temp.append((tup[0], new_lab))
        else:
            tokens_biluo_temp.append((tup[0], tup[1]))

    # spaCy's tokenization preserves whitespace-only tokens, and these cause
    # problems for the BERT model, so we replace them with standard newlines
    tokens_biluo = [
        tup if str(tup[0]).strip() != "" else "\n" for tup in tokens_biluo_temp
    ]

    # Format lines for writing out:
    # Insert newlines to separate each sentence
    # Remove any leftover space artifacts from spacy processing
    formatted_lines = ["\t".join(str(s) for s in tup) + "\n" for tup in tokens_biluo]
    for i, line in enumerate(formatted_lines):
        if line == ".\tO\n":
            formatted_lines.insert(i + 1, "\n\n")
        elif line[0].isspace() and line != "\n\n":
            formatted_lines.remove(line)

    return formatted_lines
Example #16
    def entities_to_biluo(self, doc, entities):
        """
        Converts entity span tuples into a suitable BILUO format for metrics.

        :param doc: spaCy doc of original text
        :param entities: Tuples to be converted

        :returns: List of new BILUO tags
        """
        spacy_biluo = biluo_tags_from_offsets(doc, entities)
        medacy_biluo = []
        for tag in spacy_biluo:
            if tag != 'O':
                tag = tag[2:]
            medacy_biluo.append(tag)
        return medacy_biluo
Example #17
def check_ner():
    tagger = SequenceTagger.load('ner')
    sentence = Sentence('I love Berlin!')
    tagger.predict(sentence)
    print(sentence.to_tagged_string())

    TRAIN_DATA = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("I like London and Berlin.", {"entities": [(7, 13, "LOCSEX"), (18, 24, "LOCSEX")]}),
    ]

    nlp = spacy.load('en_core_web_sm')
    docs = []
    for text, annot in TRAIN_DATA:
        doc = nlp(text)
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        print("TAGS->>>>>>>>>>>..", tags)
Example #18
def displacement_annotations_to_iob(sentence, annotations, nlp):
    doc = nlp.make_doc(sentence)
    tags = biluo_tags_from_offsets(doc, annotations)

    words = []
    slots = []
    for word, tag in zip(doc, tags):
        tag = re.sub(r'^U', "B", tag)
        tag = re.sub(r'^L', "I", tag)
        # this occurs when multiple spaces exist
        word = word.text.strip()
        # tokenization produces some whitespace-only words like " "; remove them
        if word:
            words.append(word)
            slots.append(tag)

    return words, slots
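A hedged usage sketch for displacement_annotations_to_iob, assuming the module-level imports (re, biluo_tags_from_offsets) shown elsewhere and a spaCy v2.x pipeline whose make_doc tokenizes the text:

import spacy

nlp = spacy.blank("en")
words, slots = displacement_annotations_to_iob("I flew to London", [(10, 16, "LOC")], nlp)
print(words)  # ['I', 'flew', 'to', 'London']
print(slots)  # ['O', 'O', 'O', 'B-LOC']  (U-LOC rewritten to B-LOC)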
Example #19
def main(textfile, output, dummymodel, labellist):
    # Need a dummy model to create an nlp object in order to transform a txt file to JSON
    nlp = spacy.load(dummymodel)
    sr_transfrom = load_SRs_file(textfile)

    sr_transfrom_string = eval(spacy_format(sr_transfrom, labellist))
    docs = []
    for text, annot in sr_transfrom_string:
        doc = nlp(text)
        doc.is_parsed = True
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        entities = spans_from_biluo_tags(doc, tags)
        doc.ents = entities
        docs.append(doc)
    # Create the json file in the same directory as textfile

    mkdir_p(os.path.split(output)[0])
    srsly.write_json(output, [spacy.gold.docs_to_json(docs)])
Example #20
    def _get_bilou_lines_for_entity(
        self, text: str, annotations: List[Dict[str, Any]], entity: str
    ) -> List[str]:
        """ The list of BILOU lines for entity

        Parameters
        ----------
        text : str
            The text for which BILOU lines need to be returned
        annotations : List[Dict[str, Any]]
            The list of annotations where every annotation is a dictionary
        entity : str
            A particular entity for which the BILOU lines are returned

        Returns
        -------
        List[str]
            The list of BILOU tagged lines, where every line is a ``word, tag, tag, tag`` where
            the tag is decided by the entity.

        """
        entities = []
        for annotation in annotations:
            start = annotation["start"]
            end = annotation["end"]
            tag = annotation["tag"]
            entities.append((start, end, tag))

        doc = self.nlp(text)
        tags = biluo_tags_from_offsets(doc, entities)
        tags = map(
            lambda tag: f"O-{entity}" if tag.startswith("O") or tag == "-" else tag,
            tags,
        )
        tags = list(tags)

        bilou_lines = []

        for token, tag in zip(doc, tags):
            if not token.is_space:
                bilou_line = f"{token.text}{self._conll_col_sep}{self._conll_col_sep.join([tag] * 3)}"
                bilou_lines.append(bilou_line)

        return bilou_lines
Example #21
    def spaceeval_to_conll(self, spaceeval_xml_file: str, nlp: str):
        """
        Convert ISO-Space formatted file to CoNLL format.

        :param spaceeval_xml_file str: ISO-Space formatted XML file.
        :param nlp spacy.lang.en.English: English spaCy language model.
        """
        root = ElementTree.parse(spaceeval_xml_file).getroot()

        text: str = root.find('TEXT').text
        tags: List = list(root.find('TAGS'))

        offset = 0
        sent_tokens = []
        sent_ents = []
        for sent in text.split('. '):
            sent = sent + '. '

            # split sentences by newlines
            sent_nlp = nlp(sent)
            tokens = [str(token) for token in sent_nlp]
            spatial_entities = self.extract_labels(tags, sent, offset)

            ent_biluo = biluo_tags_from_offsets(sent_nlp, spatial_entities)

            # AllenNLP can't handle unknown tags, so just use "O"
            ent_biluo = ['O' if x == '-' else x for x in ent_biluo]

            sent_tokens.extend(tokens)
            sent_tokens.append('')
            sent_ents.extend(ent_biluo)
            sent_ents.append('\n')
            offset += len(sent)

        file_conll = list(zip(sent_tokens, sent_ents))

        for pair in file_conll:
            if '\n' in pair[0] or '\u2002' in pair[0] or ' ' in pair[0]:
                file_conll.remove(pair)
        return file_conll
Example #22
    def token_annotations(self, doc, tag_blind=False, entity_tag=ENTITY_TAG):
        parsed = self.tokenize(doc.text, disable=("tagger", "parser", "ner"))
        entities = [(int(ann.start), int(ann.end), ann.tag)
                    for ann in doc.annotations]
        biluo_tags = biluo_tags_from_offsets(parsed, entities)

        tags = []
        for tag in biluo_tags:
            if tag == "O":
                tags.append('O')
            elif tag == '-':
                # Returned by spacy if token boundaries mismatch entity boundaries.
                # These errors are ignored.
                #
                # https://spacy.io/api/goldparse#biluo_tags_from_offsets
                tags.append('O')
            elif tag_blind:
                tags.append(entity_tag)
            else:
                tags.append(tag[2:])

        return tags
Example #23
def ls_to_spacy_json(ls_completions):
    nlp = spacy.load('en_core_web_sm')

    # Load the Label Studio completions
    with ZipFile(ls_completions, 'r') as zip:
        result_file = zip.read('result.json')
        label_studio_json = json.loads(result_file)

    gold_docs = []
    entity_cnt = 0
    for task in label_studio_json:
        completions = task['completions']

        # don't include skipped tasks or tasks with multiple completions
        if len(completions) == 1:
            completion = completions[0]
            if 'was_cancelled' in completion:
                continue

            raw_text = task['data']['reddit']
            annotated_entities = []
            for result in completion['result']:
                ent = result['value']
                start_char_offset = ent['start']
                end_char_offset = ent['end']
                ent_label = ent['labels'][0]
                entity = (start_char_offset, end_char_offset, ent_label)
                annotated_entities.append(entity)

            doc = nlp(raw_text)
            tags = biluo_tags_from_offsets(doc, annotated_entities)
            entities = spans_from_biluo_tags(doc, tags)
            doc.ents = entities
            gold_docs.append(doc)
            entity_cnt += len(annotated_entities)

    print("{} entities in {} docs.".format(str(entity_cnt), len(gold_docs)))
    return gold_docs
Example #24
def _doc_to_bio(parsed_doc: spacy.tokens.Doc, annotations: List[Annotation]):
    entities = [(int(ann.start), int(ann.end), ann.tag) for ann in annotations]
    biluo_tags = biluo_tags_from_offsets(parsed_doc, entities)

    biluo_to_bio = {
        'B-': 'B-',
        'I-': 'I-',
        'L-': 'I-',
        'U-': 'B-',
    }

    tags = []
    for tag in biluo_tags:
        if tag == "O":
            tags.append('O')
        elif tag == '-':
            # Returned by spacy if token boundaries mismatch entity boundaries.
            # https://spacy.io/api/goldparse#biluo_tags_from_offsets
            tags.append('O')
        else:
            tags.append(biluo_to_bio[tag[0:2]] + tag[2:])

    return tags
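A self-contained sketch of the BILUO-to-BIO mapping used above, assuming spaCy v2.x; the Annotation class is specific to the original project and is not needed for the conversion itself:

import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("I flew to San Francisco Valley")
biluo = biluo_tags_from_offsets(doc, [(10, 30, "LOC")])
# ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']
biluo_to_bio = {"B": "B", "I": "I", "L": "I", "U": "B"}
bio = ["O" if t in ("O", "-") else biluo_to_bio[t[0]] + t[1:] for t in biluo]
print(bio)  # ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC']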
Example #25
def convertDataToLstm(DATA, SLOTS_INFO, IDS, train):
    prefix_re = re.compile(r'''^[[("']''')
    suffix_re = re.compile(r'''[])"']$''')
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')

    # simple_url_re = re.compile(r'''[a-zA-Z0-9]/+''')

    def create_tokenizer(nlp):
        return Tokenizer(nlp.vocab,
                         rules={},
                         prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer
                         )

    nlp.tokenizer = create_tokenizer(nlp)

    docs = []
    for j, (text, annot) in enumerate(DATA):
        doc_things = []
        tokens = []
        doc = nlp(text)
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        tags = getNewTags(tags)
        for i, tag in enumerate(tags):
            if (tag == "-"):
                for slot in SLOTS_INFO[j]:
                    if slot["slotValue"] in tokens[i]:
                        tags[i] = slot["slotName"]
                        break
        for i, token in enumerate(doc):
            doc_things.append((token.text, token.pos_, tags[i]))

        docs.append(doc_things)

    print(docs)
Example #26
def char_offset_to_token_offset_df(data_df):
    counter = 0
    for row in data_df.iterrows():
        index = row[0]
        paragraph = row[1][1]
        span = row[1][2]
        start = row[1][3]
        end = row[1][4]
        # span = paragraph[start:end]

        doc = nlp(paragraph)

        entities = [(start, end, "ANSWER")]

        tags = biluo_tags_from_offsets(doc, entities)

        try:
            if "U-ANSWER" in tags:
                start_tok_idx = tags.index('U-ANSWER')
                end_tok_idx = start_tok_idx
            elif "B-ANSWER" in tags:
                start_tok_idx = tags.index('B-ANSWER')
                end_tok_idx = tags.index('L-ANSWER')
            else:
                continue
            data_df.iloc[
                index, data_df.columns.get_loc('start_token')] = start_tok_idx
            data_df.iloc[index,
                         data_df.columns.get_loc('end_token')] = end_tok_idx
            counter += 1

            result_span = doc[start_tok_idx:end_tok_idx + 1]
            assert span == str(result_span)
        except Exception:  # e.g. AssertionError when the recovered span does not match
            continue
    return data_df
Example #27
def main(fname, label, model, debug=False):
    level = logging.DEBUG if debug else logging.WARNING
    logging.basicConfig(level=level, format='%(message)s')

    print("Loading model '%s' ... " % model)
    nlp = spacy.load(model)

    _words = [
        'horse',
    ]
    _label = label

    # open input file
    PWD = os.path.dirname(__file__)
    _fname = os.path.join(PWD, fname)
    print('reading from {} ...'.format(_fname))
    lines = []
    with open(_fname) as f_in:
        for line in f_in:
            # skip irrelevant lines
            if len(line) < 10:
                continue
            lines.append(line)

    # shuffle
    random.shuffle(lines)

    # dev/train split
    dev_length = len(lines) // 4
    split_list = [
        (lines[:dev_length], 'dev'),
        (lines[dev_length:], 'train'),
    ]

    # create output file (json-input-format)
    for lines, split_name in split_list:
        fname_out = '{}.{}.json'.format(_fname, split_name)
        print('generating spacy json-input-format: {} ...'.format(fname_out))
        with open(fname_out, 'w') as f_out:
            # start json-input-format
            f_out.write(u'[\n')

            # convert input - line by line
            id = 0  # incremental doc-id
            for line in lines:

                # line cleanup
                sentence = line.strip('\r\n')

                # process sentence
                id += 1
                doc = nlp(sentence)

                # prepare BILUO tags
                entities = []
                for t in doc:
                    offset = t.idx
                    length = len(t.orth_)
                    if (t.orth_ in _words or t.lemma_ in _words):
                        entities.append((offset, offset + length, _label))
                    elif t.ent_type:
                        entities.append((offset, offset + length, t.ent_type_))
                biluo_tags = biluo_tags_from_offsets(doc, entities)

                # write json-input-format

                # open doc
                if (id > 1):
                    f_out.write(u'\t,{\n')
                else:
                    f_out.write(u'\t{\n')

                # ID of the document within the corpus
                f_out.write(u'\t\t"id": {},\n'.format(id))

                # list of paragraphs in the corpus
                f_out.write(u'\t\t"paragraphs": [{\n')

                # raw text of the paragraph
                f_out.write(u'\t\t\t"raw": "{}",\n'.format(escape(sentence)))

                # list of sentences in the paragraph
                f_out.write(u'\t\t\t"sentences": [{\n')

                # list of tokens in the sentence
                f_out.write(u'\t\t\t\t"tokens": [\n')

                for t in doc:
                    # start token
                    if (t.i > 0):
                        f_out.write(u'\t\t\t\t\t,{ ')
                    else:
                        f_out.write(u'\t\t\t\t\t { ')
                    # index of the token in the document
                    f_out.write(u'"id": {}, '.format(t.i))
                    # dependency label
                    f_out.write(u'"dep": "{}", '.format(t.dep_))
                    # offset of token head relative to token index
                    f_out.write(u'"head": {}, '.format(t.head.i - t.i))
                    # part-of-speech tag
                    f_out.write(u'"tag": "{}", '.format(t.tag_))
                    # verbatim text of the token
                    f_out.write(u'"orth": "{}", '.format(escape(t.orth_)))
                    # BILUO label, e.g. "O" or "U-ORG"
                    f_out.write(u'"ner": "{}" '.format(biluo_tags[t.i]))
                    # end token
                    f_out.write(u'}\n')  # without trailing ','

                # end tokens (sentence)
                f_out.write(u'\t\t\t\t]\n')

                # end sentences
                f_out.write(u'\t\t\t}]\n')

                # end paragraphs
                f_out.write(u'\t\t}]\n')

                # end doc
                f_out.write(u'\t}\n')

            # end json-input-format
            f_out.write(u']\n')

    print('Done.')
Example #28
def brat2spacy(tokenizer, ann, text):
    doc = tokenizer(text)
    words = [i.text for i in doc]
    entity_ids = defaultdict(tuple)
    relation_ids = defaultdict(tuple)
    entities = []
    for line in ann.strip().split('\n'):
        annotation = line.strip().rsplit('\t')
        id_ = annotation[0]
        if id_ == '*':
            ann_type = id_[0]
        else:
            ann_type = annotation_ids[id_[0]]
        if ann_type == 'entity':
            if len(annotation[1:]) == 2:
                span, surface_form = annotation[1:]
                entity_type, start, end = span.split(' ')
                entity_ids[id_] = (int(start), int(end))
                entities.append((int(start), int(end), entity_type))
        if ann_type == 'relation':
            if len(annotation[1:]) == 1:
                rel_type, head, dep = annotation[1].split(' ')
                relation_ids[id_] = (rel_type, head, dep)
    entities.sort(key=lambda x: x[0])
    tags = biluo_tags_from_offsets(doc, entities)
    if relation_ids:
        # mapping from brat ids to doc's id
        brat_doc_ids_map = {}
        for entity in entity_ids:
            span = doc.char_span(*entity_ids[entity])
            if span.end - span.start == 1:
                brat_doc_ids_map[entity] = span.start
            else:
                # raise Warning("Tokenization mismatch, more than 1 spaCy token in ann token span")
                brat_doc_ids_map[entity] = span.start
        ids = range(len(doc))
        heads = defaultdict(int)
        deps = defaultdict(int)
        for rel_id, rel in relation_ids.items():
            dep, token, head = rel
            token, head = brat_doc_ids_map[token.split(
                ':')[1]], brat_doc_ids_map[head.split(':')[1]]
            heads[head] = token
            deps[head] = dep
        heads = [
            i[1] if i[1] > 0 else i[0] for i in [(i, heads[i]) for i in ids]
        ]
        deps = [
            i[1] if i[1] != 0 else 'ROOT' for i in [(i, deps[i]) for i in ids]
        ]
        assert len(words) == len(heads) == len(deps) == len(tags)
        return GoldParse(doc,
                         words=words,
                         heads=heads,
                         tags=tags,
                         deps=deps,
                         entities=entities), text
    else:
        assert len(words) == len(tags)
        return GoldParse(doc,
                         words=words,
                         tags=tags,
                         entities=offsets_from_biluo_tags(doc, tags)), text
Example #29
                    ne = NE_njkp_to_spacy[ne]
                    nes += [(len(text)-1-len(orth), len(text)-1, ne)]

                token['ctag'] = ctag
                token['orth'] = orth
                token['head'] = 0  # @TODO
                token['dep'] = 'NA'  # @TODO
                token['id'] = token_idx
                token['ner'] = ne
                token_idx += 1
                sentence_json += [token]
            sentences += [sentence_json]

        doc = nlp(text)
        entities = nes
        biluo_tags = biluo_tags_from_offsets(doc, entities)

        sentences = set_biluo_tags(sentences, biluo_tags)
        paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences]
        paragraph_json['raw'] = pg_text
        paragraphs += [paragraph_json]

    doc_json['id'] = doc_id
    doc_json['paragraphs'] = paragraphs

    doc_id += 1
    corpus += [doc_json]

with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f:
    json.dump(corpus, f)
Example #30
    def create_ner_dataset(
        data,
        tokenizer: Union[BertTokenizer, AlbertTokenizer],
        save_directory=None,
        max_sequence_length=512,
        conll_format=False,
    ) -> Tuple[Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor],
               torch.LongTensor, List[str]]:
        """
            Given a list of tuples pairing a document with its span-level annotations, saves BERT inputs and labels to disk.
            This method is designed as a pre-processing step to be utilized with a pytorch Dataset and Dataloader.

            :param data:  a list of tuples relating a document to its set of annotations.
            :param tokenizer: the transformers tokenizer to utilize.
            :param conll_format: set true if data is a tuple containing parallel arrays of tokens and labels and list of entities
            :return: the processed tensors and label metadata (also saved to disk if save_directory is given)
            """
        # TODO ensure sequences are not split on token boundaries.
        if conll_format:
            assert len(
                data
            ) == 3, "Should contain list of tokens, tags and list of bilou entities"
            token_sequences = []
            label_sequences = []
            token_sequence = data[0]
            label_sequence = data[1]
            token_sequences.append(token_sequence)
            label_sequences.append(label_sequence)
            biluo_ordered_labels = sorted([
                entity_label for entity_label in data[2] if entity_label != 'O'
            ] + ['O', 'BERT_TOKEN'])
            tags_from_annotations = biluo_ordered_labels

        else:  #custom spacy format
            assert len(data) > 1
            assert 'entities' in data[0][1]
            assert 'entity_labels' in data[0][1]
            token_sequences = []
            label_sequences = []

            entity_labels = set()
            tags_from_annotations = set()

            for doc, annotations in data:
                for label in annotations['entity_labels']:
                    entity_labels.add(label)
                offsets = [
                    offset for annotation in annotations['entities'].values()
                    for offset in annotation
                ]
                tags = biluo_tags_from_offsets(doc, offsets)
                for tag in tags:
                    tags_from_annotations.add(tag)

                token_sequences.append([x for x in doc])
                label_sequences.append(tags)

            biluo_ordered_labels = sorted([
                f"{prefix}-{entity_label}" for prefix in ['B', 'I', 'L', 'U']
                for entity_label in entity_labels if entity_label != 'O'
            ] + ['O', 'BERT_TOKEN'])
            tags_from_annotations = sorted(
                list(tags_from_annotations) + ['BERT_TOKEN'])

        # convert each string label to a unique id with respect to the biluo_labels of the tokenization
        encoded_label_sequences = [[
            biluo_ordered_labels.index(label) for label in seq
        ] for seq in label_sequences]

        class_counts = [0] * len(biluo_ordered_labels)

        for seq in encoded_label_sequences:
            for id in seq:
                class_counts[id] += 1

        class_counts = torch.FloatTensor(class_counts)
        loss_weights = torch.abs(
            1 - (class_counts /
                 len([x for seq in encoded_label_sequences for x in seq])))
        # Assert that all labels appear in the annotations. This could occur if annotation processing could not align
        # all annotations into the defined spacy tokenization.
        if biluo_ordered_labels != tags_from_annotations:
            warnings.warn(
                "Processed dataset does not contain instances from all labels when converted to BILOU scheme."
            )

        # Now generate bert input tensors
        all_bert_sequence_alignments, all_bert_subword_sequences, all_bert_label_sequences, original_tokenization_labels = [], [], [], []

        for sequence, labels in zip(token_sequences, encoded_label_sequences):

            # alignment from the bert tokenization to spaCy tokenization
            assert len(sequence) == len(labels)

            # maps each original token to its subwords
            token_idx_to_subwords = []
            for token in sequence:
                token_idx_to_subwords.append(
                    [subword for subword in tokenizer.tokenize(str(token))])

            #token_idx_to_subwords = [seq for seq in token_idx_to_subwords if seq]
            bert_subwords = ['[CLS]', '[SEP]']
            bert_subword_labels = [
                biluo_ordered_labels.index('BERT_TOKEN'),
                biluo_ordered_labels.index('BERT_TOKEN')
            ]
            bert_subword_to_original_tokenization_alignment = [-1, -1]
            original_tokens_processed = []

            # print(token_idx_to_subwords[:10])
            # print([str(token) for token in sequence][:10])
            # exit()
            idx = 0
            chunk_start = 0
            while idx < len(sequence):

                start_next_buffer = False
                token_in_buffer_size = len(bert_subwords) + len(
                    token_idx_to_subwords[idx]) <= max_sequence_length

                if token_in_buffer_size:
                    #build a sequence
                    bert_subwords[-1:-1] = [
                        subword for subword in token_idx_to_subwords[idx]
                    ]
                    bert_subword_labels[-1:-1] = [
                        labels[idx] for _ in token_idx_to_subwords[idx]
                    ]
                    bert_subword_to_original_tokenization_alignment[-1:-1] = [
                        idx - chunk_start for _ in token_idx_to_subwords[idx]
                    ]
                    original_tokens_processed.append(idx)
                    idx += 1

                # Ensure we aren't splitting on a label by greedily splitting on 'O' labels once the buffer gets very full (>500 subwords)
                if len(bert_subwords) > 500 and labels[
                        idx - 1] == biluo_ordered_labels.index('O'):
                    start_next_buffer = True

                if not token_in_buffer_size or start_next_buffer:
                    all_bert_subword_sequences.append(bert_subwords)
                    all_bert_label_sequences.append(bert_subword_labels)
                    all_bert_sequence_alignments.append(
                        bert_subword_to_original_tokenization_alignment)

                    original_tokenization_labels.append(
                        [labels[i] for i in original_tokens_processed])

                    #reset sequence builders
                    bert_subwords = ['[CLS]', '[SEP]']
                    bert_subword_labels = [
                        biluo_ordered_labels.index('BERT_TOKEN'),
                        biluo_ordered_labels.index('BERT_TOKEN')
                    ]
                    bert_subword_to_original_tokenization_alignment = [-1, -1]
                    original_tokens_processed = []
                    chunk_start = idx

            if bert_subwords != ['[CLS]', '[SEP]']:
                #Add the remaining
                all_bert_subword_sequences.append(bert_subwords)
                all_bert_label_sequences.append(bert_subword_labels)
                all_bert_sequence_alignments.append(
                    bert_subword_to_original_tokenization_alignment)
                original_tokenization_labels.append(
                    [labels[i] for i in original_tokens_processed])

        for seq in original_tokenization_labels:
            for label in seq:
                assert label != -1

        max_num_spacy_labels = max(
            [len(seq) for seq in original_tokenization_labels])

        bert_input_ids = torch.zeros(size=(len(all_bert_subword_sequences),
                                           max_sequence_length),
                                     dtype=torch.long)
        bert_attention_masks = torch.zeros_like(bert_input_ids)
        bert_sequence_lengths = torch.zeros(
            size=(len(all_bert_subword_sequences), 1))

        bert_labels = torch.zeros_like(bert_input_ids)
        bert_alignment = torch.zeros_like(bert_input_ids)
        gold_original_token_labels = torch.zeros(
            size=(len(all_bert_subword_sequences), max_num_spacy_labels),
            dtype=torch.long)

        for idx, (bert_subword_sequence, bert_label_sequence, alignment, original_tokenization_label) \
                in enumerate(zip(all_bert_subword_sequences, all_bert_label_sequences, all_bert_sequence_alignments, original_tokenization_labels)):
            if len(bert_subword_sequence) > 512:
                raise BaseException(
                    "Error sequence at index %i as it is to long (%i tokens)" %
                    (idx, len(bert_subword_sequence)))
            input_ids = tokenizer.convert_tokens_to_ids(bert_subword_sequence)
            attention_masks = [1] * len(input_ids)

            while len(
                    input_ids
            ) < max_sequence_length:  #pad bert aligned input until max length
                input_ids.append(0)
                attention_masks.append(0)
                bert_label_sequence.append(0)
                alignment.append(-1)
            while len(
                    original_tokenization_label
            ) < max_num_spacy_labels:  #pad spacy aligned input with -1
                original_tokenization_label.append(-1)

            bert_input_ids[idx] = torch.tensor(input_ids, dtype=torch.long)
            bert_attention_masks[idx] = torch.tensor(attention_masks,
                                                     dtype=torch.long)
            bert_alignment[idx] = torch.tensor(alignment, dtype=torch.long)
            bert_sequence_lengths[idx] = torch.tensor(sum(
                [1 for x in input_ids if x != 0]),
                                                      dtype=torch.long)
            gold_original_token_labels[idx] = torch.tensor(
                original_tokenization_label, dtype=torch.long)
            bert_labels[idx] = torch.tensor(bert_label_sequence,
                                            dtype=torch.long)

            for i in range(1, len(bert_labels[idx]) - 1):
                # print()
                # print(f"Bert Labels | {i} | {bert_labels[idx][i]}")
                # print(f"Correct Original Labels | {i} | {gold_original_token_labels[idx][bert_alignment[idx][i]]}")
                # print(f"Bert Labels: {bert_labels[idx]}")
                # print(f"Spacy Labels: {gold_original_token_labels[idx]}")
                # print(f"Bert Alignment: {bert_alignment[idx]}")
                try:
                    assert bert_labels[idx][i] == gold_original_token_labels[
                        idx][bert_alignment[idx][i]]
                except BaseException:
                    pass

        if save_directory:
            torch.save(bert_input_ids,
                       os.path.join(save_directory,
                                    f"bert_input.pt"))  #bert input ids
            torch.save(
                bert_attention_masks,
                os.path.join(save_directory,
                             f"bert_attention_mask.pt"))  #bert attention masks
            torch.save(bert_sequence_lengths,
                       os.path.join(save_directory, f"bert_sequence_length.pt")
                       )  #length of actual bert sequence
            torch.save(bert_labels,
                       os.path.join(save_directory, f"bert_labels.pt")
                       )  #correct labels relative to bert tokenization
            torch.save(gold_original_token_labels,
                       os.path.join(save_directory, f"spacy_labels.pt")
                       )  #correct labels relative to spacy tokenization
            torch.save(bert_alignment,
                       os.path.join(save_directory,
                                    f"subword_to_spacy_alignment.pt")
                       )  #alignment between bert and spacy sequences
            torch.save(biluo_ordered_labels,
                       os.path.join(save_directory,
                                    'entity_names.pl'))  #entity labels
            torch.save(loss_weights,
                       os.path.join(
                           save_directory,
                           'loss_weights.pt'))  #global entity class counts

        return (bert_input_ids, None, bert_attention_masks), bert_sequence_lengths, bert_labels, original_tokenization_labels, \
               bert_alignment, biluo_ordered_labels, loss_weights
Example #31
def train_spacy_model(train_data,
                      test_data,
                      model,
                      output_dir=None,
                      n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    nlp = model

    ent_types = []
    for _, e in train_data:
        ee = [ent[2] for ent in e['entities']]
        ent_types += ee

    for text, ent in train_data:
        doc = nlp(text)
        entities = ent['entities']
        tags = biluo_tags_from_offsets(doc, entities)

        # # if "-" in tags:
        # print(text)
        # print(entities, tags)
        # for t in doc:
        #     print(t, tags[t.i])
        # print("\n\n\n")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        # if model is None:
        nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print(f"{itn}:")
            print("\tLosses", losses)
            score = evaluate(nlp, test_data)
            if not os.path.isdir("models"):
                os.mkdir("models")
            nlp.to_disk(os.path.join("models", f"model_{itn}"))
            print("\t", score)
Example #32
            labels[split[0]] = [(int(split[1]), int(split[2]), split[3])]

nlp = spacy.load("en_core_web_sm")

ids = list(data.keys())

bio_tags = []
sentences = []

for id in ids:
    doc = nlp(data[id])
    offsets = []
    if id in labels.keys():
        offsets = labels[id]

    labs = biluo_tags_from_offsets(doc, offsets)

    for sent in doc.sents:
        s = []
        l = []
        contains_positive = False
        for word in sent:
            s.append(word.lower_)
            label = labs[word.i]
            if label == '-':
                l.append("O")
            else:
                l.append(labs[word.i])
            if labs[word.i] != 'O' and labs[word.i] != '-':
                contains_positive = True
        if len(s) > 150:
Example #33
    return Tokenizer(nlp.vocab,
                                prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                )

def inject_tokenizer(nlp):
    nlp.tokenizer = custom_tokenizer(nlp)
    return nlp

nlp = inject_tokenizer(spacy.blank("en"))

annotations_path = sys.argv[1]

with open(annotations_path) as annotations:
    for line in annotations:
        entry = json.loads(line.strip())
        # json schema
        # {
        #     text: stores code
        #     ents: stores type annotations in spacy NER format
        #     cats: function return type (can have several if there are nested function definitions)
        #     docstrings: stores docstrings, for main and nested functions
        #     replacements: used for another project
        # }
        doc = nlp(entry['text']) # store
        tags = biluo_tags_from_offsets(doc, entry['ents'])
        for t, tag in zip(doc, tags):
            print(t.text, tag, sep="\t\t")
        print()