def convert_offsets_to_spacy_docs(doc_annotated: list) -> list:
    """
    Build Spacy docs whose entities are set from character offsets.

    Offsets for which ``char_span`` returns None are reported on stdout and left out of the doc entities.

    :param doc_annotated: list of tuples (case id, text, list of (start offset, end offset, entity type))
    :return: list of Spacy docs, one per input tuple, in input order
    """
    nlp = get_empty_model(load_labels_for_training=False)
    converted = []
    for position, (case_id, text, tags) in enumerate(doc_annotated):
        current_doc = nlp.make_doc(text)
        spans = []
        for start_offset, end_offset, type_name in tags:
            span = current_doc.char_span(start_offset, end_offset, label=type_name)
            if span is None:
                # No span could be built for these offsets: report it and keep going.
                print("Issue in offset",
                      "Index: " + str(position),
                      "case: " + case_id,
                      text[start_offset:end_offset],
                      text,
                      sep="|")
                continue
            spans.append(span)
        current_doc.ents = spans
        converted.append(current_doc)
    return converted
def annotate(model_dir_path: str, files_dir_path: List[str], out_dir_path: str) -> None:
    """
    Load the NER model and the cases to annotate.

    :param model_dir_path: the directory of the Spacy model
    :param files_dir_path: paths of the files to load; paths ending with ".xml" or ".txt" are read,
        any other path is ignored
    :param out_dir_path: the directory where to write the annotations
    """
    # Function scope import: the two logging calls below had lost their callee in the source,
    # they are restored on the standard logging module so that no module level name is required.
    import logging

    logging.info("Loading NER model…")
    nlp = get_empty_model(load_labels_for_training=False)
    nlp = nlp.from_disk(model_dir_path)

    # TODO remove when we have retrained
    infixes = nlp.Defaults.infixes + [r':', r"(?<=[\W\d_])-|-(?=[\W\d_])"]
    infixes_regex = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infixes_regex.finditer
    # end of deletion above

    entity_typename_builder = EntityTypename()

    logging.info("Loading cases…")
    cases: List[Case] = list()
    for path in files_dir_path:
        if path.endswith(".xml"):
            case: Case = get_paragraph_from_file(path=path,
                                                 keep_paragraph_without_annotation=True)
            cases.append(case)
        elif path.endswith(".txt"):
            # The paragraph id is the file name up to its first dot; it is the same for every line.
            basename = os.path.basename(path).split(".")[0]
            case: Case = list()
            with open(path) as f:
                for line in f:
                    clean_text = line.strip()
                    # Lines with at most one character after stripping are skipped.
                    if len(clean_text) > 1:
                        case.append(Paragraph(basename, clean_text, list(), list()))
            cases.append(case)
    # NOTE(review): the steps using nlp, entity_typename_builder, cases and out_dir_path are not
    # visible in this excerpt and are deliberately not reconstructed here.
def train_model(data: list, folder_to_save_model: str, n_iter: int, batch_size: int, dropout_rate: float):
    """
    Train a NER model using Spacy

    :param data: list of tuples (case id, text, offsets); the list is shuffled in place at each iteration
    :param folder_to_save_model: where to save the learned model, None to skip saving
    :param n_iter: number of passes over the data
    :param batch_size: number of items given to each model update
    :param dropout_rate: dropout forwarded to the model update
    """
    nlp = get_empty_model(load_labels_for_training=True)
    # NOTE(review): the assignment target was missing in the source (bare "= '...'", a syntax error).
    # It is restored as the vectors name, presumably the original statement; confirm against history.
    nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()

    # One progress tick per batch, hence the total of n_iter * number of batches.
    with tqdm(total=n_iter * ceil(len(data) / batch_size),
              unit=" paragraphs",
              desc="Learn NER model") as pbar:
        for itn in range(n_iter):
            pbar.set_description(f"Learn NER model - iteration {itn + 1}")
            losses = {}
            random.shuffle(data)
            batches = util.minibatch(data, batch_size)
            for current_batch_item in batches:
                case_id, texts, annotations = zip(*current_batch_item)
                docs = [nlp.make_doc(text) for text in texts]
                gold_with_unknown_bilou = convert_unknown_bilou_bulk(docs=docs, offsets=annotations)
                nlp.update(
                    docs,  # batch of texts
                    gold_with_unknown_bilou,  # batch of annotations
                    drop=dropout_rate,  # dropout - make it harder to memorise rules
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                pbar.postfix = "loss: " + str(losses['ner'])
                pbar.update()

    # save model to output directory
    if folder_to_save_model is not None:
        folder_to_save_model = Path(folder_to_save_model)
        nlp.to_disk(folder_to_save_model)
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

from spacy.tokens.doc import Doc

from match_text_unsafe.build_entity_dictionary import EntityTypename
from misc.convert_to_bilou import convert_unknown_bilou, convert_unknown_bilou_bulk, no_action_bilou
from ner.model_factory import get_empty_model
import pytest

# Model shared by the tests of this module, stored as an attribute of the pytest module.
pytest.nlp = get_empty_model(load_labels_for_training=True)


def test_bilou_conv():
    """Check the tags produced for UNKNOWN offsets and for a typed (PERS) offset."""
    sentence: Doc = pytest.nlp.make_doc("Ceci est un test.")

    # UNKNOWN offset over characters 5 to 8: '-' is expected at index 1, 'O' elsewhere.
    unknown_at_5_8 = [(5, 8, "UNKNOWN")]
    expected_dash_at_1 = ['O', '-', 'O', 'O', 'O']
    single_gold = convert_unknown_bilou(sentence, offsets=unknown_at_5_8)
    assert single_gold.ner == expected_dash_at_1
    # The bulk variant must give the same tags for the same doc and offsets.
    bulk_golds = convert_unknown_bilou_bulk([sentence], [unknown_at_5_8])
    assert bulk_golds[0].ner == expected_dash_at_1

    # Same characters with a real entity type: a unit tag carrying the type is expected.
    pers_at_5_8 = [(5, 8, "PERS")]
    pers_gold = convert_unknown_bilou(sentence, offsets=pers_at_5_8)
    assert pers_gold.ner == ['O', 'U-PERS', 'O', 'O', 'O']

    # UNKNOWN offset over characters 0 to 4: '-' is expected at index 0.
    unknown_at_0_4 = [(0, 4, "UNKNOWN")]
    first_gold = convert_unknown_bilou(sentence, offsets=unknown_at_0_4)
    assert first_gold.ner == ['-', 'O', 'O', 'O', 'O']
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

from xml_extractions.extract_node_values import get_paragraph_from_file
from ner.model_factory import get_empty_model
from resources.config_provider import get_config_default

# Paths of the trained model and of the dev XML file come from the default configuration.
config_training = get_config_default()
xml_dev_path = config_training["xml_dev_path"]
model_dir_path = config_training["model_dir_path"]

# Start from the empty model, then load the trained one from disk.
nlp = get_empty_model(load_labels_for_training=False).from_disk(model_dir_path)

DEV_DATA = get_paragraph_from_file(xml_dev_path,
                                   keep_paragraph_without_annotation=True)

for case_id, texts, xml_extracted_text, annotations in DEV_DATA:
    doc = nlp(texts)

    # Text of every entity found by the model, then all of them joined with spaces.
    spacy_extracted_text = [entity.text for entity in doc.ents]
    str_rep_spacy = " ".join(spacy_extracted_text)

    # Same, restricted to the ADDRESS and PERS labels.
    spacy_extracted_text_ad_pp = [
        entity.text for entity in doc.ents
        if entity.label_ in ("ADDRESS", "PERS")
    ]

    # One boolean per XML span: is it a substring of the joined model entities?
    match = [xml_span in str_rep_spacy for xml_span in xml_extracted_text]
def main(data_folder: str, model_path: Optional[str], output_model: Optional[str], dev_size: float,
         nb_epochs: int, print_diff: bool) -> None:
    """
    Train (or fine tune) a Spacy NER model on the annotated ".txt" files of a folder.

    :param data_folder: folder listed (not recursively) for files ending with ".txt"
    :param model_path: model to load before training, None to start from the empty model
    :param output_model: where to save the model once training is done, None to skip saving
    :param dev_size: fraction of the files kept for the dev set
    :param nb_epochs: number of passes over the training content
    :param print_diff: forwarded to spacy_evaluate
    """
    nlp = get_empty_model(load_labels_for_training=True)
    if model_path is not None:
        nlp = nlp.from_disk(path=model_path)
        # ner = nlp.get_pipe("ner")
        # ner.model.learn_rate = 0.0001
    # Both cases (loaded model or empty model) need the same tokenizer replacement and initialisation.
    nlp.tokenizer = get_tokenizer(nlp)  # replace tokenizer
    nlp.begin_training()

    all_annotated_files: List[str] = [
        os.path.join(data_folder, filename)
        for filename in os.listdir(data_folder)
        if filename.endswith(".txt")
    ]
    random.shuffle(all_annotated_files)

    # The first nb_doc_dev_set shuffled files form the dev set, the remaining ones the training set.
    # Paths are unique (one per directory entry), so the two slices are exact complements.
    nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)
    dev_file_names = all_annotated_files[:nb_doc_dev_set]
    train_file_names = all_annotated_files[nb_doc_dev_set:]

    # train_file_names = ["./resources/training_data/generated_annotations.txt"] + train_file_names
    content_to_rate = load_content(txt_paths=train_file_names)
    content_to_rate_test = load_content(txt_paths=dev_file_names)

    nb_pers = sum(1 for _, offsets in content_to_rate for o in offsets if o.type == 'PERS')
    print(f"nb PERS entities {nb_pers}")

    if model_path is not None:
        print("evaluation without fine tuning")
        spacy_evaluate(nlp, content_to_rate_test, print_diff)

    # NOTE(review): indentation was lost in the source; the optimizer is created unconditionally
    # because every update in the loop below needs it.
    optimizer: Optimizer = nlp.resume_training()

    for epoch in range(nb_epochs):
        print(f"------- {epoch} -------")
        random.shuffle(content_to_rate)
        losses = dict()
        batches = minibatch(content_to_rate, size=compounding(4., 16., 1.001))
        for batch_id, batch in enumerate(tqdm(iterable=batches, unit=" batches", desc="Training")):
            # Best effort: a failing batch is reported and skipped, the training goes on.
            try:
                batch_gold = convert_batch_to_gold_dataset(model=nlp, batch=batch)
                # texts: List[str], manual_annotations: List[GoldParse]
                texts, manual_annotations = zip(*batch_gold)
                nlp.update(texts, manual_annotations, drop=0.5, losses=losses, sgd=optimizer)
                # True for batch 0 too, so an evaluation runs at the start of every epoch.
                if batch_id % 10000 == 0:
                    spacy_evaluate(model=nlp, dev=content_to_rate_test, print_diff=print_diff)
            except Exception as e:
                print(f"got exception [{e}] on batch id {batch_id}")

        print(f"Epoch {epoch + 1}\nLoss: {losses}\n")
        spacy_evaluate(model=nlp, dev=content_to_rate_test, print_diff=print_diff)

    if output_model is not None:
        nlp.to_disk(output_model)