def test_gold_biluo_U(en_vocab):
    words = ["I", "flew", "to", "London", "."]
    spaces = [True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to London"), "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "U-LOC", "O"]
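The tests in this listing assume spaCy v2's API. A minimal, self-contained setup sketch follows; the `en_vocab` fixture here is an assumption (spaCy's own test suite constructs it differently), but it provides the same kind of Vocab the tests build Docs from:

import pytest
import spacy
from spacy.tokens import Doc
from spacy.gold import biluo_tags_from_offsets


@pytest.fixture
def en_vocab():
    # A blank English pipeline supplies an English Vocab for constructing Docs
    return spacy.blank("en").vocab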
def test_gold_biluo_misalign(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley."]
    spaces = [True, True, True, True, True, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "-", "-", "-"]
def output_gold(nlp, testing_data):
    out = []
    for raw_text, entity_offsets in testing_data:
        doc = nlp.tokenizer(raw_text)
        gold = biluo_tags_from_offsets(doc, entity_offsets)
        out.append((doc, gold))
    return out
def test_gold_biluo_BIL(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
def test_gold_biluo_misalign(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley."]
    spaces = [True, True, True, True, True, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    with pytest.warns(UserWarning):
        tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "-", "-", "-"]
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    text = "I flew to Silicon Valley via London."
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
    assert biluo_tags_converted == biluo_tags
    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
    assert offsets_converted == offsets
def test_gold_biluo_overlap(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [
        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
        (len("I flew to "), len("I flew to San Francisco"), "LOC"),
    ]
    with pytest.raises(ValueError):
        tags = biluo_tags_from_offsets(doc, entities)
def prepare_sent(self, sent):
    sent = json.loads(sent)
    text, annotations = sent
    doc = self.nlp(text)
    ents = annotations['entities']
    repl = annotations['replacements']
    decls = filter_declarations(ents, get_declarations(text))
    tokens = [t.text for t in doc]
    ents_tags = biluo_tags_from_offsets(doc, ents)
    repl_tags = biluo_tags_from_offsets(doc, repl)
    decls = declarations_to_tags(doc, decls)
    fix_incorrect_tags(ents_tags)
    fix_incorrect_tags(repl_tags)
    assert len(tokens) == len(ents_tags) == len(repl_tags)
    repl_tags = [try_int(t.split("-")[-1]) for t in repl_tags]
    return tokens, ents_tags, repl_tags, decls
def convert():
    '''
    Gathers the data and formats it using BILUO, then writes it in CONLL
    format to a file.
    '''
    start = time.time()
    print('Loading spaCy...')
    nlp = spacy.load('en')
    end = time.time()
    print(end - start)
    print('Loading examples...')
    start = time.time()
    data = []
    examples = all_examples()
    i = 0
    last = 0
    count = len(examples)
    end = time.time()
    print(end - start)
    start = time.time()
    print('Converting', count, 'examples...')
    print('0% converted...')
    for example in examples:
        doc = nlp(example[0])
        data.append([[t.text for t in doc],
                     biluo_tags_from_offsets(doc, example[1]['entities'])])
        i += 1
        percent = int(i / count * 100)
        if percent != last:
            last = percent
            print(str(percent) + '% converted...')
    end = time.time()
    print(end - start)
    i = 0
    last = 0
    count = len(data)
    print('Saving Examples to CONLL...')
    print('0% written...')
    with open('models/train_data.conll', 'w') as f:
        for sentence, sent_entities in data:
            f.write('-DOCSTART- -X- O O\n')
            i += 1
            percent = int(i / count * 100)
            if percent != last:
                last = percent
                print(percent, '% written...')
            for token, biluo_tag in zip(sentence, sent_entities):
                f.write('{} -X- _ {}\n'.format(token, biluo_tag))
            f.write('\n')
    print('Export to CONLL Format Completed.')
def process_pair(pair, dataset_dir, label_dict):
    """
    Inputs:
        pair: (___.txt, ___.spans) tuple containing the filenames for each example.
        dataset_dir: str: which dataset directory the files live in.
        label_dict: dict mapping raw span tags to standardized label names.
    Outputs:
        formatted_lines: list of strings containing the processed and formatted
        tokens and their corresponding labels.
    """
    pair_paths = os.path.join(dataset_dir, pair[0]), os.path.join(dataset_dir, pair[1])
    txt, spans = open_file(pair_paths[0]), open_file(pair_paths[1], form="lines")
    # Extract the tag type, index, end index (index + length), and entity
    span_lists = [l.split() for l in spans]
    span_tups = [(int(i[2]), int(i[2]) + int(i[3]), i[1]) for i in span_lists]
    # Convert the text to a spaCy Doc (for compatibility with `biluo_tags_from_offsets`)
    nlp = spacy.load("xx_ent_wiki_sm")
    doc = nlp(txt, disable=["ner"])
    # Create the token-label pairs using `biluo_tags_from_offsets`
    tokens_biluo = list(zip(doc, biluo_tags_from_offsets(doc, span_tups)))
    # Remove label prefixes and standardize label names (see LABEL_DICT at top).
    # `tokens_biluo` is a list of tuples, and tuples are immutable, so we need
    # to use a workaround.
    tokens_biluo_temp = []
    for tup in tokens_biluo:
        if tup[1] != "O" and tup[1][2:] != "":
            new_lab = label_dict[tup[1][2:]]  # [0:2] tag prefix; [2:] tag body
            tokens_biluo_temp.append((tup[0], new_lab))
        else:
            tokens_biluo_temp.append((tup[0], tup[1]))
    # spaCy's tokenization is space-preserving, and whitespace tokens would
    # cause problems with the BERT model, so we replace them with newlines.
    tokens_biluo = [
        tup if str(tup[0]).strip() != "" else "\n" for tup in tokens_biluo_temp
    ]
    # Format lines for writing out
    formatted_lines = ["\t".join(str(s) for s in tup) + "\n" for tup in tokens_biluo]
    for i, line in enumerate(formatted_lines):
        if line == ".\tO\n":  # Insert newlines after periods
            formatted_lines.insert(i + 1, "\n\n")
    return formatted_lines
def convert_bilou_with_missing_action(doc, offsets: list) -> list:
    """
    Convert unknown-type tokens to the missing value for NER, so that no loss
    is applied to these tokens.
    https://spacy.io/api/goldparse#biluo_tags_from_offsets

    :param doc: text tokenized by spaCy
    :param offsets: original offsets
    :return: list of BILOU tags
    """
    result = biluo_tags_from_offsets(doc, offsets)
    return [
        no_action_bilou if unknown_type_name in action_bilou else action_bilou
        for action_bilou in result
    ]
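A hypothetical usage sketch for the helper above: `no_action_bilou` and `unknown_type_name` are module-level constants not shown in the snippet, so plausible values are assumed here.

import spacy
from spacy.gold import biluo_tags_from_offsets

# Assumed values for the module-level constants referenced above
no_action_bilou = "-"          # spaCy's "missing" tag: the NER loss ignores it
unknown_type_name = "UNKNOWN"  # label marking entities whose type is unknown

nlp = spacy.blank("en")
doc = nlp("Paris is nice")
print(convert_bilou_with_missing_action(doc, [(0, 5, "UNKNOWN")]))
# -> ['-', 'O', 'O']: the U-UNKNOWN tag is replaced by the missing value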
def spacy_tok_ner(sent):
    doc = nlp(sent)
    j = doc.to_json()
    ranges = [(a["start"], a["end"]) for a in j["tokens"]]
    ents = j["ents"]
    tokens = []
    for start, end in ranges:
        tokens.append(sent[start:end])
    entlocs = [(a["start"], a["end"], a["label"]) for a in ents]
    labels = biluo_tags_from_offsets(doc, entlocs)
    return tokens, labels
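A usage sketch, assuming the module-level `nlp` is a pretrained pipeline with NER (e.g. `en_core_web_sm`); the predicted label shown is illustrative:

import spacy

nlp = spacy.load("en_core_web_sm")
tokens, labels = spacy_tok_ner("I flew to London.")
print(list(zip(tokens, labels)))
# e.g. [('I', 'O'), ('flew', 'O'), ('to', 'O'), ('London', 'U-GPE'), ('.', 'O')]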
def docs_from_offsets(nlp, gold):
    """Create a sequence of Docs from a sequence of text, entity-offsets pairs."""
    docs = []
    for text, entities in gold:
        doc = nlp(text)
        entities = entities['entities']
        tags = biluo_tags_from_offsets(doc, entities)
        if entities:
            for start, end, label in entities:
                span = doc.char_span(start, end, label=label)
                if span:
                    doc.ents = list(doc.ents) + [span]
        if doc.ents:  # remove to return documents without entities too
            docs.append((doc, tags))
    return docs
def process_pair_ST(prefix):
    """
    Similar to process_pair for the factRuEval data, but with tweaks for the
    Shared Task 2019 data.
    """
    raw_path = f"../data/ru/shared_task_2019/raw/{prefix}.txt"
    ann_path = f"../data/ru/shared_task_2019/annotated/{prefix}.out"
    raw, objs = prep_st_data(raw_path, ann_path)
    ents = find_exact_matches(raw, objs)
    # Convert the text to a spaCy Doc (for compatibility with `biluo_tags_from_offsets`)
    nlp = spacy.load("xx_ent_wiki_sm")
    doc = nlp(raw, disable=["ner"])
    # Create the token-label pairs using `biluo_tags_from_offsets`
    tokens_biluo = list(zip(doc, biluo_tags_from_offsets(doc, ents)))
    # Remove prefixes ("B-", "I-", etc.) from labels.
    # `tokens_biluo` is a list of tuples, and tuples are immutable, so we need
    # to use a workaround.
    tokens_biluo_temp = []
    for tup in tokens_biluo:
        if tup[1] != "O":
            new_lab = tup[1][2:]
            tokens_biluo_temp.append((tup[0], new_lab))
        else:
            tokens_biluo_temp.append((tup[0], tup[1]))
    # spaCy's tokenization is space-preserving, and whitespace tokens would
    # cause problems with the BERT model, so we replace them with newlines.
    tokens_biluo = [
        tup if str(tup[0]).strip() != "" else "\n" for tup in tokens_biluo_temp
    ]
    # Format lines for writing out:
    # insert newlines to separate each sentence and remove any leftover
    # space artifacts from spaCy processing.
    formatted_lines = ["\t".join(str(s) for s in tup) + "\n" for tup in tokens_biluo]
    for i, line in enumerate(formatted_lines):
        if line == ".\tO\n":
            formatted_lines.insert(i + 1, "\n\n")
        elif line[0].isspace() and line != "\n\n":
            formatted_lines.remove(line)
    return formatted_lines
def entities_to_biluo(self, doc, entities):
    """
    Converts entity span tuples into a suitable BILUO format for metrics.

    :param doc: spaCy doc of original text
    :param entities: Tuples to be converted
    :returns: List of new BILUO tags
    """
    spacy_biluo = biluo_tags_from_offsets(doc, entities)
    medacy_biluo = []
    for tag in spacy_biluo:
        if tag != 'O':
            tag = tag[2:]
        medacy_biluo.append(tag)
    return medacy_biluo
def check_ner():
    tagger = SequenceTagger.load('ner')
    sentence = Sentence('I love Berlin!')
    tagger.predict(sentence)
    print(sentence.to_tagged_string())
    TRAIN_DATA = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("I like London and Berlin.", {"entities": [(7, 13, "LOCSEX"), (18, 24, "LOCSEX")]}),
    ]
    nlp = spacy.load('en_core_web_sm')
    for text, annot in TRAIN_DATA:
        doc = nlp(text)
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        print("TAGS:", tags)
def displacement_annotations_to_iob(sentence, annotations, nlp):
    doc = nlp.make_doc(sentence)
    tags = biluo_tags_from_offsets(doc, annotations)
    words = []
    slots = []
    for word, tag in zip(doc, tags):
        tag = re.sub(r'^U', "B", tag)
        tag = re.sub(r'^L', "I", tag)
        # When multiple spaces exist, tokenization makes some words like " ",
        # so strip and remove them
        word = word.text.strip()
        if word:
            words.append(word)
            slots.append(tag)
    return words, slots
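A usage sketch of the BILUO-to-IOB conversion above, with hand-computed character offsets; the imports mirror what the function needs:

import re
import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")
words, slots = displacement_annotations_to_iob(
    "book a flight to San Francisco", [(17, 30, "destination")], nlp)
print(words)  # ['book', 'a', 'flight', 'to', 'San', 'Francisco']
print(slots)  # ['O', 'O', 'O', 'O', 'B-destination', 'I-destination']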
def main(textfile, output, dummymodel, labellist):
    # Need a dummy model to create an nlp object, with the aim of
    # transforming a txt file to JSON
    nlp = spacy.load(dummymodel)
    sr_transform = load_SRs_file(textfile)
    sr_transform_string = eval(spacy_format(sr_transform, labellist))
    docs = []
    for text, annot in sr_transform_string:
        doc = nlp(text)
        doc.is_parsed = True
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        entities = spans_from_biluo_tags(doc, tags)
        doc.ents = entities
        docs.append(doc)
    # Create the JSON file in the same directory as textfile
    mkdir_p(os.path.split(output)[0])
    srsly.write_json(output, [spacy.gold.docs_to_json(docs)])
def _get_bilou_lines_for_entity(
    self, text: str, annotations: List[Dict[str, Any]], entity: str
) -> List[str]:
    """
    Return the list of BILOU lines for an entity.

    Parameters
    ----------
    text : str
        The text for which BILOU lines need to be returned
    annotations : List[Dict[str, Any]]
        The list of annotations where every annotation is a dictionary
    entity : str
        A particular entity for which the BILOU lines are returned

    Returns
    -------
    List[str]
        The list of BILOU tagged lines, where every line is a
        ``word, tag, tag, tag`` where the tag is decided by the entity.
    """
    entities = []
    for annotation in annotations:
        start = annotation["start"]
        end = annotation["end"]
        tag = annotation["tag"]
        entities.append((start, end, tag))
    doc = self.nlp(text)
    tags = biluo_tags_from_offsets(doc, entities)
    tags = map(
        lambda tag: f"O-{entity}" if tag.startswith("O") or tag == "-" else tag,
        tags,
    )
    tags = list(tags)
    bilou_lines = []
    for token, tag in zip(doc, tags):
        if not token.is_space:
            bilou_line = f"{token.text}{self._conll_col_sep}{self._conll_col_sep.join([tag] * 3)}"
            bilou_lines.append(bilou_line)
    return bilou_lines
def spaceeval_to_conll(self, spaceeval_xml_file: str, nlp):
    """
    Convert an ISO-Space formatted file to CoNLL format.

    :param spaceeval_xml_file str: ISO-Space formatted XML file.
    :param nlp spacy.lang.en.English: English spaCy language model.
    """
    root = ElementTree.parse(spaceeval_xml_file).getroot()
    text: str = root.find('TEXT').text
    tags: List = list(root.find('TAGS'))
    offset = 0
    sent_tokens = []
    sent_ents = []
    for sent in text.split('. '):
        sent = sent + '. '  # restore the delimiter dropped by the split
        sent_nlp = nlp(sent)
        tokens = [str(token) for token in sent_nlp]
        spatial_entities = self.extract_labels(tags, sent, offset)
        ent_biluo = biluo_tags_from_offsets(sent_nlp, spatial_entities)
        # allennlp can't handle unknown tags, so just use 'O'
        ent_biluo = ['O' if x == '-' else x for x in ent_biluo]
        sent_tokens.extend(tokens)
        sent_tokens.append('')
        sent_ents.extend(ent_biluo)
        sent_ents.append('\n')
        offset += len(sent)
    file_conll = list(zip(sent_tokens, sent_ents))
    for pair in file_conll:
        if '\n' in pair[0] or '\u2002' in pair[0] or ' ' in pair[0]:
            file_conll.remove(pair)
    return file_conll
def token_annotations(self, doc, tag_blind=False, entity_tag=ENTITY_TAG):
    parsed = self.tokenize(doc.text, disable=("tagger", "parser", "ner"))
    entities = [(int(ann.start), int(ann.end), ann.tag) for ann in doc.annotations]
    biluo_tags = biluo_tags_from_offsets(parsed, entities)
    tags = []
    for tag in biluo_tags:
        if tag == "O":
            tags.append('O')
        elif tag == '-':
            # Returned by spacy if token boundaries mismatch entity boundaries.
            # These errors are ignored.
            #
            # https://spacy.io/api/goldparse#biluo_tags_from_offsets
            tags.append('O')
        elif tag_blind:
            tags.append(entity_tag)
        else:
            tags.append(tag[2:])
    return tags
def ls_to_spacy_json(ls_completions):
    nlp = spacy.load('en_core_web_sm')
    # Load the Label Studio completions
    with ZipFile(ls_completions, 'r') as zf:
        result_file = zf.read('result.json')
    label_studio_json = json.loads(result_file)
    gold_docs = []
    entity_cnt = 0
    for task in label_studio_json:
        completions = task['completions']
        # don't include skipped tasks or tasks with multiple completions
        if len(completions) == 1:
            completion = completions[0]
            if 'was_cancelled' in completion:
                continue
            raw_text = task['data']['reddit']
            annotated_entities = []
            for result in completion['result']:
                ent = result['value']
                start_char_offset = ent['start']
                end_char_offset = ent['end']
                ent_label = ent['labels'][0]
                entity = (start_char_offset, end_char_offset, ent_label)
                annotated_entities.append(entity)
            doc = nlp(raw_text)
            tags = biluo_tags_from_offsets(doc, annotated_entities)
            entities = spans_from_biluo_tags(doc, tags)
            doc.ents = entities
            gold_docs.append(doc)
            entity_cnt += len(annotated_entities)
    print("{} entities in {} docs.".format(str(entity_cnt), len(gold_docs)))
    return gold_docs
def _doc_to_bio(parsed_doc: spacy.tokens.Doc, annotations: List[Annotation]):
    entities = [(int(ann.start), int(ann.end), ann.tag) for ann in annotations]
    biluo_tags = biluo_tags_from_offsets(parsed_doc, entities)
    biluo_to_bio = {
        'B-': 'B-',
        'I-': 'I-',
        'L-': 'I-',
        'U-': 'B-',
    }
    tags = []
    for tag in biluo_tags:
        if tag == "O":
            tags.append('O')
        elif tag == '-':
            # Returned by spacy if token boundaries mismatch entity boundaries.
            # https://spacy.io/api/goldparse#biluo_tags_from_offsets
            tags.append('O')
        else:
            tags.append(biluo_to_bio[tag[0:2]] + tag[2:])
    return tags
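A toy illustration of the BILUO-to-BIO mapping above. `Annotation` is this project's own type, so a namedtuple stand-in is used, and `biluo_tags_from_offsets` is assumed imported as in the other snippets:

from collections import namedtuple
import spacy

# Stand-in for the project's Annotation type
Annotation = namedtuple("Annotation", ["start", "end", "tag"])

nlp = spacy.blank("en")
doc = nlp("I flew to San Francisco Valley")
anns = [Annotation(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
print(_doc_to_bio(doc, anns))
# -> ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC'] (L- collapses to I-)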
def convertDataToLstm(DATA, SLOTS_INFO, IDS, train):
    prefix_re = re.compile(r'''^[[("']''')
    suffix_re = re.compile(r'''[])"']$''')
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')

    def create_tokenizer(nlp):
        return Tokenizer(nlp.vocab, rules={},
                         prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer)

    nlp.tokenizer = create_tokenizer(nlp)
    docs = []
    for j, (text, annot) in enumerate(DATA):
        doc_things = []
        doc = nlp(text)
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        tags = getNewTags(tags)
        for i, tag in enumerate(tags):
            if tag == "-":
                # fall back to the slot name when spaCy could not align the entity
                for slot in SLOTS_INFO[j]:
                    if slot["slotValue"] in doc[i].text:
                        tags[i] = slot["slotName"]
                        break
        for i, token in enumerate(doc):
            doc_things.append((token.text, token.pos_, tags[i]))
        docs.append(doc_things)
    print(docs)
def char_offset_to_token_offset_df(data_df):
    counter = 0
    for row in data_df.iterrows():
        index = row[0]
        paragraph = row[1][1]
        span = row[1][2]
        start = row[1][3]
        end = row[1][4]
        # span = paragraph[start:end]
        doc = nlp(paragraph)
        entities = [(start, end, "ANSWER")]
        tags = biluo_tags_from_offsets(doc, entities)
        try:
            if "U-ANSWER" in tags:
                start_tok_idx = tags.index('U-ANSWER')
                end_tok_idx = start_tok_idx
            elif "B-ANSWER" in tags:
                start_tok_idx = tags.index('B-ANSWER')
                end_tok_idx = tags.index('L-ANSWER')
            else:
                continue
            data_df.iloc[index, data_df.columns.get_loc('start_token')] = start_tok_idx
            data_df.iloc[index, data_df.columns.get_loc('end_token')] = end_tok_idx
            counter += 1
            result_span = doc[start_tok_idx:end_tok_idx + 1]
            assert span == str(result_span)
        except AssertionError:
            continue
    return data_df
def main(fname, label, model, debug=False):
    level = logging.DEBUG if debug else logging.WARNING
    logging.basicConfig(level=level, format='%(message)s')

    print("Loading model '%s' ... " % model)
    nlp = spacy.load(model)

    _words = [
        'horse',
    ]
    _label = label

    # open input file
    PWD = os.path.dirname(__file__)
    _fname = os.path.join(PWD, fname)
    print('reading from {} ...'.format(_fname))
    lines = []
    with open(_fname) as f_in:
        for line in f_in:
            # skip irrelevant lines
            if len(line) < 10:
                continue
            lines.append(line)

    # shuffle
    random.shuffle(lines)

    # dev/train split
    dev_length = len(lines) // 4
    split_list = [
        (lines[:dev_length], 'dev'),
        (lines[dev_length:], 'train'),
    ]

    # create output file (json-input-format)
    for lines, split_name in split_list:
        fname_out = '{}.{}.json'.format(_fname, split_name)
        print('generating spacy json-input-format: {} ...'.format(fname_out))
        with open(fname_out, 'w') as f_out:
            # start json-input-format
            f_out.write(u'[\n')
            # convert input - line by line
            id = 0  # incremental doc-id
            for line in lines:
                # line cleanup
                sentence = line.strip('\r\n')
                # process sentence
                id += 1
                doc = nlp(sentence)
                # prepare BILUO tags
                entities = []
                for t in doc:
                    offset = t.idx
                    length = len(t.orth_)
                    if (t.orth_ in _words or t.lemma_ in _words):
                        entities.append((offset, offset + length, _label))
                    elif t.ent_type:
                        entities.append((offset, offset + length, t.ent_type_))
                biluo_tags = biluo_tags_from_offsets(doc, entities)
                # write json-input-format
                # open doc
                if (id > 1):
                    f_out.write(u'\t,{\n')
                else:
                    f_out.write(u'\t{\n')
                # ID of the document within the corpus
                f_out.write(u'\t\t"id": {},\n'.format(id))
                # list of paragraphs in the corpus
                f_out.write(u'\t\t"paragraphs": [{\n')
                # raw text of the paragraph
                f_out.write(u'\t\t\t"raw": "{}",\n'.format(escape(sentence)))
                # list of sentences in the paragraph
                f_out.write(u'\t\t\t"sentences": [{\n')
                # list of tokens in the sentence
                f_out.write(u'\t\t\t\t"tokens": [\n')
                for t in doc:
                    # start token
                    if (t.i > 0):
                        f_out.write(u'\t\t\t\t\t,{ ')
                    else:
                        f_out.write(u'\t\t\t\t\t { ')
                    # index of the token in the document
                    f_out.write(u'"id": {}, '.format(t.i))
                    # dependency label
                    f_out.write(u'"dep": "{}", '.format(t.dep_))
                    # offset of token head relative to token index
                    f_out.write(u'"head": {}, '.format(t.head.i - t.i))
                    # part-of-speech tag
                    f_out.write(u'"tag": "{}", '.format(t.tag_))
                    # verbatim text of the token
                    f_out.write(u'"orth": "{}", '.format(escape(t.orth_)))
                    # BILUO label, e.g. "O" or "U-ORG"
                    f_out.write(u'"ner": "{}" '.format(biluo_tags[t.i]))
                    # end token (without trailing ',')
                    f_out.write(u'}\n')
                # end tokens (sentence)
                f_out.write(u'\t\t\t\t]\n')
                # end sentences
                f_out.write(u'\t\t\t}]\n')
                # end paragraphs
                f_out.write(u'\t\t}]\n')
                # end doc
                f_out.write(u'\t}\n')
            # end json-input-format
            f_out.write(u']\n')
    print('Done.')
def brat2spacy(tokenizer, ann, text):
    doc = tokenizer(text)
    words = [i.text for i in doc]
    entity_ids = defaultdict(tuple)
    relation_ids = defaultdict(tuple)
    entities = []
    for line in ann.strip().split('\n'):
        annotation = line.strip().rsplit('\t')
        id_ = annotation[0]
        if id_ == '*':
            ann_type = id_[0]
        else:
            ann_type = annotation_ids[id_[0]]
        if ann_type == 'entity':
            if len(annotation[1:]) == 2:
                span, surface_form = annotation[1:]
                entity_type, start, end = span.split(' ')
                entity_ids[id_] = (int(start), int(end))
                entities.append((int(start), int(end), entity_type))
        if ann_type == 'relation':
            if len(annotation[1:]) == 1:
                rel_type, head, dep = annotation[1].split(' ')
                relation_ids[id_] = (rel_type, head, dep)
    entities.sort(key=lambda x: x[0])
    tags = biluo_tags_from_offsets(doc, entities)
    if relation_ids:
        # mapping from brat ids to doc's ids
        brat_doc_ids_map = {}
        for entity in entity_ids:
            span = doc.char_span(*entity_ids[entity])
            if span.end - span.start == 1:
                brat_doc_ids_map[entity] = span.start
            else:
                # raise Warning("Tokenization mismatch, more than 1 spaCy token in ann token span")
                brat_doc_ids_map[entity] = span.start
        ids = range(len(doc))
        heads = defaultdict(int)
        deps = defaultdict(int)
        for rel_id, rel in relation_ids.items():
            dep, token, head = rel
            token, head = (brat_doc_ids_map[token.split(':')[1]],
                           brat_doc_ids_map[head.split(':')[1]])
            heads[head] = token
            deps[head] = dep
        heads = [i[1] if i[1] > 0 else i[0] for i in [(i, heads[i]) for i in ids]]
        deps = [i[1] if i[1] != 0 else 'ROOT' for i in [(i, deps[i]) for i in ids]]
        assert len(words) == len(heads) == len(deps) == len(tags)
        return GoldParse(doc, words=words, heads=heads, tags=tags, deps=deps,
                         entities=entities), text
    else:
        assert len(words) == len(tags)
        return GoldParse(doc, words=words, tags=tags,
                         entities=offsets_from_biluo_tags(doc, tags)), text
# (fragment of a larger conversion script; the enclosing loops over
# documents, paragraphs, sentences and tokens are not shown)
            ne = NE_njkp_to_spacy[ne]
            nes += [(len(text) - 1 - len(orth), len(text) - 1, ne)]
            token['ctag'] = ctag
            token['orth'] = orth
            token['head'] = 0  # @TODO
            token['dep'] = 'NA'  # @TODO
            token['id'] = token_idx
            token['ner'] = ne
            token_idx += 1
            sentence_json += [token]
        sentences += [sentence_json]
        doc = nlp(text)
        entities = nes
        biluo_tags = biluo_tags_from_offsets(doc, entities)
        sentences = set_biluo_tags(sentences, biluo_tags)
        paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences]
        paragraph_json['raw'] = pg_text
        paragraphs += [paragraph_json]
    doc_json['id'] = doc_id
    doc_json['paragraphs'] = paragraphs
    doc_id += 1
    corpus += [doc_json]

with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f:
    json.dump(corpus, f)
def create_ner_dataset(
    data,
    tokenizer: Union[BertTokenizer, AlbertTokenizer],
    save_directory=None,
    max_sequence_length=512,
    conll_format=False,
) -> Tuple[Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor],
           torch.LongTensor, List[str]]:
    """
    Given a list of tuples of documents with span-level annotations, saves BERT
    input and labels to disk. This method is designed as a pre-processing step
    to be utilized with a pytorch Dataset and Dataloader.

    :param data: a list of tuples relating a document to its set of annotations.
    :param tokenizer: the transformers tokenizer to utilize.
    :param conll_format: set True if data is a tuple containing parallel arrays
        of tokens and labels and a list of entities.
    :return: the location the dataset was saved
    """
    # TODO: ensure sequences are not split on token boundaries.
    if conll_format:
        assert len(data) == 3, \
            "Should contain list of tokens, tags and list of bilou entities"
        token_sequences = []
        label_sequences = []
        token_sequence = data[0]
        label_sequence = data[1]
        token_sequences.append(token_sequence)
        label_sequences.append(label_sequence)
        biluo_ordered_labels = sorted(
            [entity_label for entity_label in data[2] if entity_label != 'O']
            + ['O', 'BERT_TOKEN'])
        tags_from_annotations = biluo_ordered_labels
    else:  # custom spacy format
        assert len(data) > 1
        assert 'entities' in data[0][1]
        assert 'entity_labels' in data[0][1]
        token_sequences = []
        label_sequences = []
        entity_labels = set()
        tags_from_annotations = set()
        for doc, annotations in data:
            for label in annotations['entity_labels']:
                entity_labels.add(label)
            offsets = [offset for annotation in annotations['entities'].values()
                       for offset in annotation]
            tags = biluo_tags_from_offsets(doc, offsets)
            for tag in tags:
                tags_from_annotations.add(tag)
            token_sequences.append([x for x in doc])
            label_sequences.append(tags)
        biluo_ordered_labels = sorted(
            [f"{prefix}-{entity_label}" for prefix in ['B', 'I', 'L', 'U']
             for entity_label in entity_labels if entity_label != 'O']
            + ['O', 'BERT_TOKEN'])
        tags_from_annotations = sorted(list(tags_from_annotations) + ['BERT_TOKEN'])

    # convert each string label to a unique id with respect to the
    # biluo_labels of the tokenization
    encoded_label_sequences = [
        [biluo_ordered_labels.index(label) for label in seq]
        for seq in label_sequences
    ]
    class_counts = [0] * len(biluo_ordered_labels)
    for seq in encoded_label_sequences:
        for id in seq:
            class_counts[id] += 1
    class_counts = torch.FloatTensor(class_counts)
    # weight each class by its (inverted) share of the total label count
    total_labels = len([x for seq in encoded_label_sequences for x in seq])
    loss_weights = torch.abs(1 - (class_counts / total_labels))

    # Assert that all labels appear in the annotations. This could occur if
    # annotation processing could not align all annotations into the defined
    # spacy tokenization.
    if biluo_ordered_labels != tags_from_annotations:
        warnings.warn(
            "Processed dataset does not contain instances from all labels "
            "when converted to BILOU scheme.")

    # Now generate bert input tensors
    all_bert_sequence_alignments, all_bert_subword_sequences, \
        all_bert_label_sequences, original_tokenization_labels = [], [], [], []
    for sequence, labels in zip(token_sequences, encoded_label_sequences):
        # alignment from the bert tokenization to spaCy tokenization
        assert len(sequence) == len(labels)
        # maps each original token to its subwords
        token_idx_to_subwords = []
        for token in sequence:
            token_idx_to_subwords.append(
                [subword for subword in tokenizer.tokenize(str(token))])
        bert_subwords = ['[CLS]', '[SEP]']
        bert_subword_labels = [
            biluo_ordered_labels.index('BERT_TOKEN'),
            biluo_ordered_labels.index('BERT_TOKEN')
        ]
        bert_subword_to_original_tokenization_alignment = [-1, -1]
        original_tokens_processed = []
        idx = 0
        chunk_start = 0
        while idx < len(sequence):
            start_next_buffer = False
            token_in_buffer_size = (len(bert_subwords)
                                    + len(token_idx_to_subwords[idx])
                                    <= max_sequence_length)
            if token_in_buffer_size:
                # build a sequence
                bert_subwords[-1:-1] = [
                    subword for subword in token_idx_to_subwords[idx]]
                bert_subword_labels[-1:-1] = [
                    labels[idx] for _ in token_idx_to_subwords[idx]]
                bert_subword_to_original_tokenization_alignment[-1:-1] = [
                    idx - chunk_start for _ in token_idx_to_subwords[idx]]
                original_tokens_processed.append(idx)
                idx += 1
                # Ensure we aren't splitting on a label by greedily splitting
                # on 'O' labels once the buffer gets very full (>500 subwords)
                if (len(bert_subwords) > 500
                        and labels[idx - 1] == biluo_ordered_labels.index('O')):
                    start_next_buffer = True
            if not token_in_buffer_size or start_next_buffer:
                all_bert_subword_sequences.append(bert_subwords)
                all_bert_label_sequences.append(bert_subword_labels)
                all_bert_sequence_alignments.append(
                    bert_subword_to_original_tokenization_alignment)
                original_tokenization_labels.append(
                    [labels[i] for i in original_tokens_processed])
                # reset sequence builders
                bert_subwords = ['[CLS]', '[SEP]']
                bert_subword_labels = [
                    biluo_ordered_labels.index('BERT_TOKEN'),
                    biluo_ordered_labels.index('BERT_TOKEN')
                ]
                bert_subword_to_original_tokenization_alignment = [-1, -1]
                original_tokens_processed = []
                chunk_start = idx
        if bert_subwords != ['[CLS]', '[SEP]']:
            # Add the remaining
            all_bert_subword_sequences.append(bert_subwords)
            all_bert_label_sequences.append(bert_subword_labels)
            all_bert_sequence_alignments.append(
                bert_subword_to_original_tokenization_alignment)
            original_tokenization_labels.append(
                [labels[i] for i in original_tokens_processed])

    for seq in original_tokenization_labels:
        for label in seq:
            assert label != -1

    max_num_spacy_labels = max([len(seq) for seq in original_tokenization_labels])
    bert_input_ids = torch.zeros(
        size=(len(all_bert_subword_sequences), max_sequence_length),
        dtype=torch.long)
    bert_attention_masks = torch.zeros_like(bert_input_ids)
    bert_sequence_lengths = torch.zeros(size=(len(all_bert_subword_sequences), 1))
    bert_labels = torch.zeros_like(bert_input_ids)
    bert_alignment = torch.zeros_like(bert_input_ids)
    gold_original_token_labels = torch.zeros(
        size=(len(all_bert_subword_sequences), max_num_spacy_labels),
        dtype=torch.long)
    for idx, (bert_subword_sequence, bert_label_sequence, alignment,
              original_tokenization_label) in enumerate(
                  zip(all_bert_subword_sequences, all_bert_label_sequences,
                      all_bert_sequence_alignments, original_tokenization_labels)):
        if len(bert_subword_sequence) > 512:
            raise BaseException(
                "Error: sequence at index %i is too long (%i tokens)"
                % (idx, len(bert_subword_sequence)))
        input_ids = tokenizer.convert_tokens_to_ids(bert_subword_sequence)
        attention_masks = [1] * len(input_ids)
        # pad bert aligned input until max length
        while len(input_ids) < max_sequence_length:
            input_ids.append(0)
            attention_masks.append(0)
            bert_label_sequence.append(0)
            alignment.append(-1)
        # pad spacy aligned input with -1
        while len(original_tokenization_label) < max_num_spacy_labels:
            original_tokenization_label.append(-1)
        bert_input_ids[idx] = torch.tensor(input_ids, dtype=torch.long)
        bert_attention_masks[idx] = torch.tensor(attention_masks, dtype=torch.long)
        bert_alignment[idx] = torch.tensor(alignment, dtype=torch.long)
        bert_sequence_lengths[idx] = torch.tensor(
            sum([1 for x in input_ids if x != 0]), dtype=torch.long)
        gold_original_token_labels[idx] = torch.tensor(
            original_tokenization_label, dtype=torch.long)
        bert_labels[idx] = torch.tensor(bert_label_sequence, dtype=torch.long)
        for i in range(1, len(bert_labels[idx]) - 1):
            try:
                assert bert_labels[idx][i] == \
                    gold_original_token_labels[idx][bert_alignment[idx][i]]
            except BaseException:
                pass

    if save_directory:
        # bert input ids
        torch.save(bert_input_ids, os.path.join(save_directory, "bert_input.pt"))
        # bert attention masks
        torch.save(bert_attention_masks,
                   os.path.join(save_directory, "bert_attention_mask.pt"))
        # length of actual bert sequence
        torch.save(bert_sequence_lengths,
                   os.path.join(save_directory, "bert_sequence_length.pt"))
        # correct labels relative to bert tokenization
        torch.save(bert_labels, os.path.join(save_directory, "bert_labels.pt"))
        # correct labels relative to spacy tokenization
        torch.save(gold_original_token_labels,
                   os.path.join(save_directory, "spacy_labels.pt"))
        # alignment between bert and spacy sequences
        torch.save(bert_alignment,
                   os.path.join(save_directory, "subword_to_spacy_alignment.pt"))
        # entity labels
        torch.save(biluo_ordered_labels,
                   os.path.join(save_directory, 'entity_names.pl'))
        # global entity class counts
        torch.save(loss_weights, os.path.join(save_directory, 'loss_weights.pt'))

    return (bert_input_ids, None, bert_attention_masks), bert_sequence_lengths, \
        bert_labels, original_tokenization_labels, bert_alignment, \
        biluo_ordered_labels, loss_weights
def train_spacy_model(train_data, test_data, model, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    nlp = model
    ent_types = []
    for _, e in train_data:
        ee = [ent[2] for ent in e['entities']]
        ent_types += ee
    for text, ent in train_data:
        doc = nlp(text)
        entities = ent['entities']
        tags = biluo_tags_from_offsets(doc, entities)
        # if "-" in tags:
        #     print(text)
        #     print(entities, tags)
        #     for t in doc:
        #         print(t, tags[t.i])
        #     print("\n\n\n")

    # create the built-in pipeline components and add them to the pipeline;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        # if model is None:
        nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print(f"{itn}:")
            print("\tLosses", losses)
            score = evaluate(nlp, test_data)
            if not os.path.isdir("models"):
                os.mkdir("models")
            nlp.to_disk(os.path.join("models", f"model_{itn}"))
            print("\t", score)
# (fragment of a larger script; the loop that builds `labels` and the
# remainder of the sentence filter are not shown)
    labels[split[0]] = [(int(split[1]), int(split[2]), split[3])]

nlp = spacy.load("en_core_web_sm")
ids = list(data.keys())
bio_tags = []
sentences = []
for id in ids:
    doc = nlp(data[id])
    offsets = []
    if id in labels.keys():
        offsets = labels[id]
    labs = biluo_tags_from_offsets(doc, offsets)
    for sent in doc.sents:
        s = []
        l = []
        contains_positive = False
        for word in sent:
            s.append(word.lower_)
            label = labs[word.i]
            if label == '-':
                l.append("O")
            else:
                l.append(labs[word.i])
            if labs[word.i] != 'O' and labs[word.i] != '-':
                contains_positive = True
        if len(s) > 150:
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     )


def inject_tokenizer(nlp):
    nlp.tokenizer = custom_tokenizer(nlp)
    return nlp


nlp = inject_tokenizer(spacy.blank("en"))

annotations_path = sys.argv[1]

with open(annotations_path) as annotations:
    for line in annotations:
        entry = json.loads(line.strip())
        # json schema
        # {
        #     text: stores code
        #     ents: stores type annotations in spacy NER format
        #     cats: function return type (can have several if there are
        #           nested function definitions)
        #     docstrings: stores docstrings, for main and nested functions
        #     replacements: used for another project
        # }
        doc = nlp(entry['text'])
        tags = biluo_tags_from_offsets(doc, entry['ents'])
        for t, tag in zip(doc, tags):
            print(t.text, tag, sep="\t\t")
        print()