import spacy
from flair.data import Sentence, build_spacy_tokenizer  # tokenizer helper from flair < 0.5

# match_max_span and tag_it are helpers defined elsewhere in the project

def match_sentences(all_sentences, matcher, match_lm):
    # match and tag all the sentences
    spacy_nlp = spacy.blank('en')
    raw_descs = []
    all_annos = []
    for job_desc in all_sentences:
        doc = match_lm(job_desc)
        matches = matcher(doc)
        if len(matches) > 0:
            almost_good = match_max_span(matches)
        else:
            almost_good = []
        sentence = Sentence(job_desc,
                            use_tokenizer=build_spacy_tokenizer(spacy_nlp))
        if len(almost_good) > 0:
            sentence = tag_it(almost_good, sentence)
        raw_descs.append(job_desc)
        all_annos.append(sentence)
    return raw_descs, all_annos
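A minimal driver for match_sentences might look like the sketch below; the Matcher pattern, the input text, and the spaCy v3 Matcher.add signature are illustrative assumptions, and match_max_span / tag_it are assumed to be defined as in the project:

import spacy
from spacy.matcher import Matcher

match_lm = spacy.blank('en')
matcher = Matcher(match_lm.vocab)
# toy pattern; real patterns come from the surrounding project
matcher.add("SKILL", [[{"LOWER": "python"}]])

raw_descs, all_annos = match_sentences(
    ["Experience with Python is required."], matcher, match_lm)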
# method excerpt; List, SequenceTagger, spacy, build_spacy_tokenizer and
# PRESIDIO_SPACY_ENTITIES are imported at module level in the source project

def __init__(self,
             model=None,
             model_path: str = None,
             entities_to_keep: List[str] = None,
             verbose: bool = False,
             labeling_scheme: str = "BIO",
             compare_by_io: bool = True,
             translate_to_spacy_entities=True):
    """
    Evaluator for Flair models
    :param model: model of type SequenceTagger
    :param model_path: path to a stored Flair model, used when model is None
    :param entities_to_keep: entity types to evaluate; None keeps all
    :param verbose: print progress and debug information
    :param labeling_scheme: tagging scheme of the dataset, e.g. "BIO"
    :param compare_by_io: compare entities at IO level, ignoring B-/I- prefixes
    :param translate_to_spacy_entities: map Presidio entity names to spaCy names
    """
    super().__init__(entities_to_keep=entities_to_keep,
                     verbose=verbose,
                     labeling_scheme=labeling_scheme,
                     compare_by_io=compare_by_io)
    if model is None:
        if model_path is None:
            raise ValueError("Either model_path or model object must be supplied")
        self.model = SequenceTagger.load(model_path)
    else:
        self.model = model
    self.spacy_tokenizer = build_spacy_tokenizer(model=spacy.blank('en'))
    self.translate_to_spacy_entities = translate_to_spacy_entities
    if self.translate_to_spacy_entities:
        print("Translating entities using this dictionary: {}".format(
            PRESIDIO_SPACY_ENTITIES))
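Instantiation could look like the following; the enclosing class name (FlairEvaluator) is an assumption based on the docstring, and "ner" is flair's shorthand for its pretrained English tagger:

# hypothetical usage; the class name FlairEvaluator is assumed
evaluator = FlairEvaluator(model_path="ner",  # flair's pretrained English NER model
                           entities_to_keep=["PERSON", "LOCATION"],
                           verbose=True)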
import os

import spacy
from spacy.language import Language
from tqdm import tqdm
from flair.data import build_spacy_tokenizer  # flair < 0.5
from flair.models import SequenceTagger
from flair.visual.ner_html import render_ner_html

from ner.model_factory import get_tokenizer  # project-level helper

# colors: project-level mapping of entity types to display colors

def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder)
                 if filename.endswith(".txt")]
    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))
    for filename in tqdm(iterable=filenames, unit=" txt", desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            sentences = tagger.predict(sentences=input_f.readlines(),
                                       mini_batch_size=32,
                                       verbose=False,
                                       use_tokenizer=tokenizer)
            case_name = filename.split('.')[0]
            page_html = render_ner_html(sentences, colors=colors, title=case_name)
            with open(os.path.join(output_folder, case_name + ".html"), "w") as output:
                output.write(page_html)
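A thin CLI wrapper is one way to drive main(); the argument names below simply mirror the function signature and are not part of the source:

import argparse

if __name__ == "__main__":
    # hypothetical entry point mirroring main()'s parameters
    parser = argparse.ArgumentParser(description="Tag court cases and render NER HTML")
    parser.add_argument("--data-folder", required=True)
    parser.add_argument("--output-folder", required=True)
    parser.add_argument("--model-folder", required=True)
    args = parser.parse_args()
    main(data_folder=args.data_folder,
         output_folder=args.output_folder,
         model_folder=args.model_folder)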
# same imports as the previous snippet, plus:
from typing import List
from flair.data import Sentence
from xml_extractions.extract_node_values import Paragraph, get_paragraph_from_file

def main(data_folder: str, model_folder: str, top_n: int) -> None:
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder)
                 if filename.endswith(".xml")]
    sentences: List[Sentence] = list()
    with tqdm(total=len(filenames), unit=" XML", desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, filename),
                keep_paragraph_without_annotation=True)
            if len(paragraphs) > top_n:
                for paragraph in paragraphs[:top_n]:
                    if len(paragraph.text) > 0:
                        s = Sentence(text=paragraph.text, use_tokenizer=tokenizer)
                        sentences.append(s)
            progress_bar.update()
    if len(sentences) == 0:
        raise Exception("No example loaded: either there are no cases in the "
                        "provided path or the sample size is too high")
    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True,
                       embedding_storage_mode="cpu")
    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html)
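Once predict() has filled in the tags, the spans can also be read back from each Sentence with flair's get_spans API; a small sketch that prints entities instead of rendering HTML (an illustration, not what the source does):

# sketch: inspect predicted entities directly
for sentence in sentences[:10]:
    for span in sentence.get_spans('ner'):
        print(f"{span.text!r} -> {span.tag}")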
import os
import random
import sys

import spacy
from flair.data import build_spacy_tokenizer  # flair < 0.5
from flair.models import SequenceTagger

sys.path.append('../anonymisation-master')
from ner.model_factory import get_tokenizer
from resources.config_provider import get_config_default
from xml_extractions.extract_node_values import Paragraph, get_paragraph_from_file

random.seed(5)

tagger: SequenceTagger = SequenceTagger.load(
    'resources/flair_ner/ca/best-model.pt')
config_training = get_config_default()
nlp = spacy.blank('fr')
nlp.tokenizer = get_tokenizer(nlp)
tokenizer = build_spacy_tokenizer(nlp)
xml_train_path = "../similar_legal_case/data/jurica_original_xml/arrets-juri"  # config_training["xml_train_path"]
files = [os.path.join(path, name)
         for path, _, files in os.walk(xml_train_path)
         for name in files]
random.shuffle(files)

with open("./resources/training_data/generated_annotations.txt", mode='w') as generated_text:
    with open("./resources/training_data/generated_annotations.ent", mode='w') as generated_entities:
        for filename in files:
            if filename.endswith(".xml"):
                try:
# excerpt: job_listing, job_desc, generic_re, sent_nlp, tagger, embeddings
# and skill_embeddings are defined earlier in the script

company = job_listing.get('company')
job_title = job_listing.get('job_title')
location = job_listing.get('location')
dice_id = job_listing.get('dice_id')
if 'vdart' in company.lower():
    job_desc = re.sub(generic_re, '', job_desc)

all_sentences = []
doc = sent_nlp(job_desc)
for sent in doc.sents:
    all_sentences.append(sent.text.strip())  # sent.string was removed in spaCy 3

for sentence in all_sentences:
    # skip fragments and sentences too long for the tagger
    if len(sentence) >= 5 and len(sentence) < 512:
        flair_sentence = Sentence(sentence,
                                  use_tokenizer=build_spacy_tokenizer(sent_nlp))
        predictions = tagger.predict(flair_sentence)
        labels_dict = predictions[0].to_dict(tag_type='ner')
        all_entities = [item['text'] for item in labels_dict['entities']]
        embeddings.embed(flair_sentence)
        for token in flair_sentence:
            if token.text in all_entities:
                tensor = token.embedding.detach().cpu().numpy()
                skill_embeddings.append((token.text, tensor))

from sklearn.cluster import KMeans
import numpy as np

embeddings_df = pd.DataFrame(skill_embeddings, columns=['skill', 'embedding'])
embeddings_df['skill'] = embeddings_df['skill'].map(lambda x: x.lower())
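The excerpt ends right after collecting the vectors; a plausible continuation using the KMeans import already present (the cluster count of 8 is an arbitrary assumption, not from the source):

# hypothetical continuation: cluster the skill vectors; n_clusters=8 is arbitrary
X = np.vstack(embeddings_df['embedding'].to_numpy())
kmeans = KMeans(n_clusters=8, random_state=0).fit(X)
embeddings_df['cluster'] = kmeans.labels_
for cluster_id, group in embeddings_df.groupby('cluster'):
    print(cluster_id, sorted(group['skill'].unique())[:10])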