Example #1
import spacy
from flair.data import Sentence, build_spacy_tokenizer  # flair 0.4.x API

def match_sentences(all_sentences, matcher, match_lm):
    # Match and tag all the sentences.
    spacy_nlp = spacy.blank('en')

    raw_descs = []
    all_annos = []
    for job_desc in all_sentences:
        doc = match_lm(job_desc)
        matches = matcher(doc)

        # Keep only the widest span per match; fall back to no annotations.
        if len(matches) > 0:
            almost_good = match_max_span(matches)
        else:
            almost_good = []

        sentence = Sentence(job_desc,
                            use_tokenizer=build_spacy_tokenizer(spacy_nlp))
        if len(almost_good) > 0:
            sentence = tag_it(almost_good, sentence)

        raw_descs.append(job_desc)
        all_annos.append(sentence)

    return raw_descs, all_annos
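
A minimal sketch of how this helper might be driven, assuming the match_max_span and tag_it helpers defined alongside it are in scope; the Matcher pattern and sample sentence below are purely illustrative:

import spacy
from spacy.matcher import Matcher

# Hypothetical driver; assumes match_sentences, match_max_span and tag_it
# from the module above are importable. Pattern and text are illustrative.
nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add("SKILL", None, [{"LOWER": "python"}])  # spaCy v2 signature

raw, annotated = match_sentences(["We need Python and Spark experience."],
                                 matcher, nlp)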
Example #2

    def __init__(self,
                 model=None,
                 model_path: str = None,
                 entities_to_keep: List[str] = None,
                 verbose: bool = False,
                 labeling_scheme: str = "BIO",
                 compare_by_io: bool = True,
                 translate_to_spacy_entities=True):
        """
        Evaluator for Flair models
        :param model: model of type SequenceTagger
        :param model_path:
        :param entities_to_keep:
        :param verbose:
        :param labeling_scheme:
        :param compare_by_io:
        :param translate_to_spacy_entities:
        """
        super().__init__(entities_to_keep=entities_to_keep,
                         verbose=verbose,
                         labeling_scheme=labeling_scheme,
                         compare_by_io=compare_by_io)

        if model is None:
            if model_path is None:
                raise ValueError("Either model_path or model object must be supplied")
            self.model = SequenceTagger.load(model_path)
        else:
            self.model = model

        self.spacy_tokenizer = build_spacy_tokenizer(model=spacy.blank('en'))
        self.translate_to_spacy_entities = translate_to_spacy_entities

        if self.translate_to_spacy_entities:
            print("Translating entities using this dictionary: {}".format(PRESIDIO_SPACY_ENTITIES))
Example #3

def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(".txt")
    ]
    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))

    for filename in tqdm(iterable=filenames,
                         unit=" txt",
                         desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            sentences = tagger.predict(sentences=input_f.readlines(),
                                       mini_batch_size=32,
                                       verbose=False,
                                       use_tokenizer=tokenizer)
            case_name = filename.split('.')[0]
            page_html = render_ner_html(sentences,
                                        colors=colors,
                                        title=case_name)

            with open(os.path.join(output_folder, case_name + ".html"),
                      "w") as output:
                output.write(page_html)
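
A minimal invocation sketch; all three folder arguments are placeholders, and the model folder is assumed to contain the best-model.pt checkpoint the function loads:

# Hypothetical paths; model_folder must contain best-model.pt.
main(data_folder="data/txt_cases",
     output_folder="output/html",
     model_folder="resources/flair_ner/fr")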
Example #4

def main(data_folder: str, model_folder: str, top_n: int) -> None:
    print(f"keep only the top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [
        filename for filename in os.listdir(data_folder)
        if filename.endswith(".xml")
    ]
    sentences: List[Sentence] = list()
    with tqdm(total=len(filenames), unit=" XML",
              desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, filename),
                keep_paragraph_without_annotation=True)
            # Skip files with top_n or fewer paragraphs; otherwise keep the first top_n.
            if len(paragraphs) > top_n:
                for paragraph in paragraphs[:top_n]:
                    if len(paragraph.text) > 0:
                        s = Sentence(text=paragraph.text, tokenizer=tokenizer)
                        sentences.append(s)
            progress_bar.update()
    if len(sentences) == 0:
        raise Exception(
            "No example loaded; causes: no cases in the provided path, or the sample size is too high"
        )

    tagger: SequenceTagger = SequenceTagger.load(
        os.path.join(model_folder, 'best-model.pt'))
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True,
                       embedding_storage_mode="cpu")

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html)
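
As with the previous example, a sketch of how this entry point might be called; the paths and top_n value are placeholders:

# Hypothetical invocation; arguments are placeholders.
main(data_folder="data/xml_cases",
     model_folder="resources/flair_ner/fr",
     top_n=5)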
Example #5

import os
import random
import sys

import spacy
from flair.data import build_spacy_tokenizer  # flair 0.4.x API
from flair.models import SequenceTagger

sys.path.append('../anonymisation-master')
from ner.model_factory import get_tokenizer
from resources.config_provider import get_config_default
from xml_extractions.extract_node_values import Paragraph, get_paragraph_from_file

random.seed(5)

tagger: SequenceTagger = SequenceTagger.load(
    'resources/flair_ner/ca/best-model.pt')

config_training = get_config_default()
nlp = spacy.blank('fr')
nlp.tokenizer = get_tokenizer(nlp)
tokenizer = build_spacy_tokenizer(nlp)

xml_train_path = "../similar_legal_case/data/jurica_original_xml/arrets-juri"  # config_training["xml_train_path"]
files = [
    os.path.join(path, name) for path, _, files in os.walk(xml_train_path)
    for name in files
]
random.shuffle(files)

with open("./resources/training_data/generated_annotations.txt",
          mode='w') as generated_text:
    with open("./resources/training_data/generated_annotations.ent",
              mode='w') as generated_entities:
        for filename in files:
            if filename.endswith(".xml"):
                try:
Example #6
    company = job_listing.get('company')
    job_title = job_listing.get('job_title')
    location = job_listing.get('location')
    dice_id = job_listing.get('dice_id')
    if 'vdart' in company.lower():
        job_desc = re.sub(generic_re, '', job_desc)

    all_sentances = []
    doc = sent_nlp(job_desc)
    for sent in doc.sents:
        all_sentances.append(sent.string.strip())
    for sentance in all_sentances:
        if len(sentance) >= 5 and len(sentance) < 512:

            doc = Sentence(sentance,
                           use_tokenizer=build_spacy_tokenizer(sent_nlp))
            predictions = tagger.predict(doc)
            labels_dict = predictions[0].to_dict(tag_type='ner')

            all_entities = [item['text'] for item in labels_dict['entities']]
            embeddings.embed(doc)
            for token in doc:
                if token.text in all_entities:
                    tensor = token.embedding.detach().cpu().numpy()
                    skill_embeddings.append((token.text, tensor))

from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

embeddings_df = pd.DataFrame(skill_embeddings, columns=['skill', 'embedding'])
embeddings_df['skill'] = embeddings_df['skill'].map(lambda x: x.lower())
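
The KMeans import suggests the embeddings are clustered next; a minimal sketch of that step, assuming each 'embedding' cell holds a fixed-size numpy vector (the cluster count is illustrative):

# Stack per-token vectors into a matrix and cluster; n_clusters is a guess.
X = np.stack(embeddings_df['embedding'].values)
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
embeddings_df['cluster'] = kmeans.labels_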