class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on a list of single- or multiple-word company names. Companies are
    labelled as ORG and their spans are merged into one token. Additionally,
    ._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
    respectively."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a tech org. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_tech_org' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_tech_org') for t in tokens])
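
# Usage sketch (added for illustration; not part of the original example). It
# assumes spaCy v2.x, where pipeline components are added as callables, and a
# made-up company list.
from spacy.lang.en import English

nlp = English()
component = TechCompanyRecognizer(nlp, companies=["Alphabet Inc.", "Google", "Netflix"])
nlp.add_pipe(component, last=True)  # add the component last in the pipeline

doc = nlp("Alphabet Inc. is the company behind Google")
print("Pipeline:", nlp.pipe_names)
print("Doc has_tech_org:", doc._.has_tech_org)
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])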
Example #2
def test_issue3248_1():
    """Test that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
    matcher.add("TEST2", None, nlp("d"))
    assert len(matcher) == 2
Example #3
def test_issue3248_2():
    """Test that the PhraseMatcher can be pickled correctly."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
    matcher.add("TEST2", None, nlp("d"))
    data = pickle.dumps(matcher)
    new_matcher = pickle.loads(data)
    assert len(new_matcher) == len(matcher)
Example #4
def get_matches(tokenizer, phrases, texts, max_length=6):
    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
    matcher.add('Phrase', None, *phrases)
    for text in texts:
        doc = tokenizer(text)
        for w in doc:
            _ = doc.vocab[w.text]
        matches = matcher(doc)
        for ent_id, start, end in matches:
            yield (ent_id, doc[start:end].text)
Example #5
def test_issue3331(en_vocab):
    """Test that duplicate patterns for different rules result in multiple
    matches, one per rule.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"]))
    matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"]))
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
    assert sorted(match_ids) == ["A", "B"]
Example #6
def test_issue3410():
    texts = ["Hello world", "This is a test"]
    nlp = English()
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))
class RESTCountriesComponent(object):
    """spaCy v2.0 pipeline component that requests all countries via
    the REST Countries API, merges country names into one token, assigns entity
    labels and sets attributes on country tokens.
    """
    name = 'rest_countries' # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)


    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', self.countries[entity.text]['capital'])
                token._.set('country_latlng', self.countries[entity.text]['latlng'])
                token._.set('country_flag', self.countries[entity.text]['flag'])
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_country') for t in tokens])
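
# Usage sketch (illustrative, not part of the original component). It assumes
# spaCy v2.x and network access to the REST Countries endpoint used in __init__.
from spacy.lang.en import English

nlp = English()
rest_countries = RESTCountriesComponent(nlp)  # fetches the country data on init
nlp.add_pipe(rest_countries)

doc = nlp("Some text about Colombia and the Czech Republic")
print("Doc has_country:", doc._.has_country)
for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng)
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
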
def test_phrase_matcher_basic_check(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    pattern = Doc(en_vocab, words=["hello", "world"])
    with pytest.raises(ValueError):
        matcher.add("TEST", pattern)
def test_phrase_matcher_sent_start(en_vocab, attr):
    _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
    def __init__(self, nlp, ontoDict):
        # add ontology and label from ontoDict
        self.ontoDict = ontoDict
        self.all_labels = ""

        # stop words, don't try to match these
        stopwords = nlp.Defaults.stop_words
        stopwords.add("ands")
        stopwords.add("ends")
        stopwords.add("ci")

        self.ontols = []

        ontologies = ontoDict["ontologies"]
        for ontology in ontologies:
            for key, value in ontology.items():
                if (key == "label"):
                    self.all_labels = self.all_labels + value
                if (key == "ontology"):
                    self.ontols.append(value)
        # print("self.ontols: ", self.ontols)
        # for x in self.ontols:
        #     print("got x: ", x)
        # print("all_labels = ", self.all_labels)

        # for making plural forms of labels for text matching
        engine = inflect.engine()

        # init terms and patterns
        self.terms = {}
        patterns = []

        #build unified table of all ID, IRI, Label and Synonyms:
        for ontol in self.ontols:  #should be all ontols in
            print("checking ontol: ", ontol)
            for termid in ontol.get_classes():
                # print("k is: ", k)
                termshortid = ontol.get_id_for_iri(termid)

                label = ontol.get_annotation(termid, RDFSLABEL)
                definition = ontol.get_annotation(termid, DEFINITION)
                if label:
                    term_entry = {
                        'id': termid if termshortid is None else termshortid,
                        'name': label.strip(),
                        'definition': definition
                    }
                if label is not None and label.strip().lower(
                ) not in stopwords:
                    self.terms[label.strip().lower()] = term_entry
                    patterns.append(nlp.make_doc(label.strip().lower()))
                    plural = engine.plural(label.strip())
                    self.terms[plural.lower()] = term_entry
                    patterns.append(nlp.make_doc(plural.lower()))
                synonyms = ontol.get_annotations(termid, SYN)
                for s in synonyms:
                    # print("adding SYNONYM in ontotagtext: ", s)
                    if s.strip().lower() not in stopwords:
                        self.terms[s.strip().lower()] = term_entry
                        patterns.append(nlp.make_doc(s.strip().lower()))
                        try:
                            plural = engine.plural(s.strip().lower())
                            self.terms[plural.lower()] = term_entry
                            patterns.append(nlp.make_doc(plural.lower()))
                        except:
                            print("Problem getting plural of ", s)
                            continue

        # initialize matcher and add patterns
        self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self.matcher.add(self.all_labels, None, *patterns)

        # set extensions to tokens, spans and docs
        Token.set_extension("is_ontol_term", default=False, force=True)
        Token.set_extension("ontol_id", default=False, force=True)
        Token.set_extension("merged_concept", default=False, force=True)
        Doc.set_extension("has_ontols", getter=self.has_ontols, force=True)
        Doc.set_extension("ontols", default=[], force=True)
        Span.set_extension("has_ontols", getter=self.has_ontols, force=True)
Example #13
class key_word_recognizer(object):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    def __init__(self,
                 nlp,
                 keywords,
                 label,
                 tokentag,
                 doctag=None,
                 spantag=None):
        nlp.vocab.strings.add(label)
        self.label = nlp.vocab.strings[label]
        self._label_str = label
        self._token_tag = tokentag
        self._doctag = doctag
        self._spantag = spantag
        self._keywordtag = "is_keyword"
        self._labeltag = "label_"
        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of keywords is long, it's very efficient
        patterns = [nlp(key) for key in keywords]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(self._token_tag, None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension(self._token_tag, default=False)
        if not Token.has_extension(self._keywordtag):
            Token.set_extension(self._keywordtag, default=False)
            Token.set_extension(self._labeltag, default=None)
        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens has the token-level attribute set to True.
        Doc.set_extension(self._doctag,
                          getter=lambda tokens: any(
                              [t._.get(self._token_tag) for t in tokens]))
        Span.set_extension(self._spantag,
                           getter=lambda tokens: any(
                               [t._.get(self._token_tag) for t in tokens]))
        if not Span.has_extension("dep_"):
            Span.set_extension("dep_", default="")
            Span.set_extension("head_", default=None)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards

        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set(self._token_tag, True)
                token._.set(self._labeltag, self._label_str)
                entity._.set("dep_", token.dep_)
                entity._.set("head_", {
                    "text": token.head.text,
                    "index": token.head.i
                })

                if not token._.get(self._keywordtag):
                    token._.set(self._keywordtag, True)
            # Overwrite doc.ents and add entity – be careful not to replace!
            # print(doc.ents)
            # print(entity)
            if entity not in doc.ents:
                doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc
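
# Usage sketch (illustrative; the keyword list, label and tag names below are
# made up). It assumes spaCy v2.x, where components are added by object.
from spacy.lang.en import English

nlp = English()
recognizer = key_word_recognizer(nlp, keywords=["machine learning", "spaCy"],
                                 label="TOPIC", tokentag="is_topic",
                                 doctag="has_topic", spantag="has_topic")
nlp.add_pipe(recognizer)

doc = nlp("An introduction to machine learning with spaCy")
print(doc._.has_topic)
print([(ent.text, ent.label_) for ent in doc.ents])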
Example #14
import json
from spacy.lang.es import Spanish
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/es/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all the matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="LOC") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# The getter that looks up the span text in a dictionary of country
# capital cities
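
# A minimal sketch of that getter and the Span extension behind it, assuming
# CAPITALS maps country names to their capitals:
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension "capital" with the getter
Span.set_extension("capital", getter=get_capital, force=True)

# Process an example text and print the entity text, label and capital
doc = nlp("La República Checa podría ayudar a la República Eslovaca a proteger su espacio aéreo")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])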
Example #15
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nlp import nlp as nlp
from collections import Counter
from fuzzywuzzy import fuzz
import Levenshtein as lev
import spacy
Spnlp = spacy.load("en_core_web_sm")
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(Spnlp.vocab)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

LangProcessor = nlp()


# %%
#load the job description

with open('identity.txt') as job:
    text = job.read()    


# %%
#load cv

with open('cv') as cv:
Example #16
nlp = spacy.load('en_core_web_sm')

doc = nlp(
    "A” login to his application, he should only see things pertaining to him and not things pertaining to “Mr. B” "
)

# for token in doc:
#     print(token)

print(f"Token \t\tLemma \t\tStopword".format('Token', 'Lemma', 'Stopword'))
print("-" * 40)
for token in doc:
    print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]
matcher.add("TerminologyList", patterns)

text_doc = nlp(
    "Glowing review overall, and some really interesting side-by-side "
    "photography tests pitting the iPhone 11 Pro against the "
    "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.")

matches = matcher(text_doc)
print(matches)

match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_doc[start:end])
Example #17
def create_Data_Scientist_profile(file):
    text = pdfextract(file) 
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('data_science_keywords.csv')
    
    keyword_total = list(keyword_dict.count())
    global total_sum
    total_sum = 0
    for i in keyword_total:
        total_sum = total_sum + i
        
    print('ee',total_sum)
    
    
    
    stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis = 0)]
    NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis = 0)]
    ML_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis = 0)]
    DL_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis = 0)]
    R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis = 0)]
    python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis = 0)]
    Data_Engineering_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis = 0)]

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('Stats', None, *stats_words)
    matcher.add('NLP', None, *NLP_words)
    matcher.add('ML', None, *ML_words)
    matcher.add('DL', None, *DL_words)
    matcher.add('R', None, *R_words)
    matcher.add('Python', None, *python_words)
    matcher.add('DE', None, *Data_Engineering_words)
    doc = nlp(text)
    
    d = []  
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start : end]  # get the matched slice of the doc
        d.append((rule_id, span.text))      
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items())
    
    ## converting string of keywords to dataframe
    df = pd.read_csv(StringIO(keywords),names = ['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1) 
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
    
    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
       
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name'])
    
    dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1)
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True)
    print(dataf)
    return(dataf)
Example #18
def create_web_dev_profile(file):
    text = pdfextract(file) 
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('web_developer_keywords.csv')
    keyword_total = list(keyword_dict.count())
    global total_sum
    total_sum = 0
    for i in keyword_total:
        total_sum = total_sum + i
        
    print('ee',total_sum)
    
    front_end = [nlp(text) for text in keyword_dict['Front End'].dropna(axis = 0)]
    back_end = [nlp(text) for text in keyword_dict['Back End'].dropna(axis = 0)]
    database = [nlp(text) for text in keyword_dict['Database'].dropna(axis = 0)]
    project = [nlp(text) for text in keyword_dict['Projects'].dropna(axis = 0)]
    frameworks = [nlp(text) for text in keyword_dict['Frameworks'].dropna(axis = 0)]
    
    #print(front_end)
   # print(back_end)
    #print(database)
   
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('FrontEnd', None, *front_end)
    matcher.add('BackEnd', None, *back_end)
    matcher.add('Database', None, *database)
    matcher.add('Projects', None, *project)
    matcher.add('Frameworks', None, *frameworks)
 
    doc = nlp(text)
    #print(doc)
    
    d = []  
    matches = matcher(doc)
   # print(matches)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start : end]  # get the matched slice of the doc
        d.append((rule_id, span.text))      
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items())
    
    ## converting string of keywords to dataframe
    df = pd.read_csv(StringIO(keywords),names = ['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1) 
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
    
    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
       
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name'])
    
    dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1)
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True)
    print(dataf)
    return(dataf)
Example #19
 def __init__(self, nlp, terms):
   self.terms = terms
   self.matcher = PhraseMatcher(nlp.vocab)
   patterns = [nlp.make_doc(text) for text in terms]
   self.matcher.add("TerminologyList", None, *patterns)
   Doc.set_extension("phrase_matches", getter=self.matcher, force=True)
Example #20
class CkSpacyModel():

    def __init__(self, xml_dir, output_dir, section_names):
        self.xml_dir = xml_dir
        self.output_dir = output_dir
        self.section_names = section_names
        self.__current_xml_files_for_spacy_preprocessing = []
        self.__filenames = []
        self._TEXTS = []
        self._current_TEXTS_idx = 0
        self.nlp = spacy.load('en_core_web_md')
        self.ruler = EntityRuler(self.nlp, overwrite_ents=True).from_disk("./patterns.jsonl")
        #self.ruler = EntityRuler(self.nlp)
        self._current_sentence_idx = 0
        self.TRAIN_DATA = []
        self.stringstore = 0
        self.matcher = Matcher(self.nlp.vocab)
        Token.set_extension("is_unit",  getter= self.is_unit)
        Token.set_extension("alt_text", default = None) #  getter= self.get_alt_text)
        Token.set_extension("alt_text_keep", default = True) #  whether this word should be keeped in the alternative text (necessary because of trailing whitespaces))
        Token.set_extension("alt_text_trailing_whitespace_", default = " ")
        self.matcher_units = PhraseMatcher(self.nlp.vocab) # der PhraseMatcher fuer die Uniterkennung fuer alternative words
        self.matcher_alt_text = Matcher(self.nlp.vocab)
        self.pattern_file_custom_matcher_alt_text = "./Lib/units.jsonl"

    def pre_process(self):
        print('starting preprocess')   
        self.nlp.add_pipe(self.ruler, after="ner")
        self.nlp.add_pipe(self.custom_pipe_component_phrase_entity, before="ner")
        #self.nlp.add_pipe(self.custom_pipe_component_Name_et_al, after="ner")
        #self.nlp.add_pipe(self.custom_pipe_component_Quantity, last=True)
        #self.nlp.add_pipe(self.custom_pipe_component_set_extension_unit, last=True)

        # load the patterns into the matcher
        self.custom_matcher_alt_text()
#        self.nlp.add_pipe(self.custom_pipe_component_set_extension_unit_text, last=True)
        self.nlp.add_pipe(self.custom_pipe_comp_alt_text, last = True)
        # last comes the word replacement for the simplified English ... 10 mg = xy mg

        self.extract_text()
        
    def reintegrate_patterns_to_ruler(self, file):
        self.ruler = EntityRuler(self.nlp).from_disk(file)
        #self.nlp.remove_pipe("ruler")
        self.nlp.replace_pipe("entity_ruler", self.ruler)
        #self.nlp.add_pipe(self.ruler, before="ner")

        #* The entity ruler is designed to integrate with spaCy’s existing statistical models 
        #* and enhance the named entity recognizer. If it’s added before the "ner" component, 
        #* the entity recognizer will respect the existing entity spans and adjust its 
        #* predictions around it. This can significantly improve accuracy in some cases. 
        #* If it’s added after the "ner" component, the entity ruler will only add spans to 
        #* the doc.ents if they don’t overlap with existing entities predicted by the model. 
        #* To overwrite overlapping entities, you can set overwrite_ents=True on initialization.




    def show_ents(self, doc):
        if doc.ents:
            for ent in doc.ents:
                print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
        else:
            print('No named entities found.')

    def get_next_sentence(self):
        self._current_TEXT = self._TEXTS[self._current_TEXTS_idx]
        self._current_doc = self.nlp(self._current_TEXT)
        sentences = list(self._current_doc.sents)
        sentence = sentences[self._current_sentence_idx]
        if self._current_sentence_idx < len(sentences)-1:
            self._current_sentence_idx += 1
        else:
            self._current_sentence_idx = 0
            print('next document')
            if self._current_TEXTS_idx < len(self._TEXTS)-1:
                self._current_TEXTS_idx += 1
            else:
                print('end of Text list')
        sentence = self.nlp(sentence.text)
        unknown_words = []
        for token in sentence:
            #print("check for : " + token.text)
            if token.is_oov:
                unknown_words.append(token)
                #print("not found: " + token.text)
            print(f"token.text = {token.text:{18}} : token._.alt_text = {token._.alt_text:{10}}")
        
        return (sentence, unknown_words)

    def add_pattern_to_entity_ruler(self,patterns,file):
        # I could not get the check for identical lines to work,
        # so instead check for duplicates and delete them
        self.ruler.add_patterns(patterns)
        self.ruler.to_disk(file)
        uniqlines = set(open(file).readlines())
        with open(file,'w',encoding='utf8') as fp:
            for line in uniqlines:
                fp.write(line)

    def add_sentence_to_TRAIN_DATA(self,sentence, filename):
        exists = os.path.isfile(filename)
        if exists:
            with open(filename,'r',encoding='utf8') as fh:
                for line in fh:
                    one_line = line[:-1]
                    self.TRAIN_DATA.append(one_line)
        self.TRAIN_DATA.append(sentence)
        if exists:
            # append only the single current list entry
            with open(filename,'a',encoding='utf8') as fh:
                listitem = self.TRAIN_DATA.pop()
                fh.write('%s\n' % listitem)
        if not exists:
            with open(filename,'w+',encoding='utf8') as fh:
                for listitem in self.TRAIN_DATA:
                    fh.write('%s\n' % listitem)

    def add_word_to_stringstore(self, word, path):
        try:
            self.stringstore = StringStore().from_disk(path)
            self.stringstore.add(word)
        except:
            self.stringstore = StringStore(word)
        self.stringstore.to_disk(path)


    def add_word_to_vocab_permanently(self,word):
        pass
    
    def add_word_to_vocab_temporarely(self, word):
        pass

    def add_stringstore_to_vocab_temporarely(self, file):
        try:
            self.stringstore = StringStore().from_disk(file)
            for word in self.stringstore:
                lex = self.nlp.vocab[word]
                self.nlp.vocab[word].is_oov = False
        except:
            print("cannot read stringstore in file " + file)
    

    def add_pattern_jsonl_file_to_vocab_and_entity_matcher(self, pattern_file):
        (ents, pattern) = self.read_gazetteer(pattern_file)
        for i in range(len(ents)):
            #print(ents[i])
            #print(pattern[i])
            #print(type(pattern[i]))
            self.matcher.add(ents[i], None, pattern[i])
#           self.matcher.add(entity, None, *phrases)

    

    def read_gazetteer(self, loc):
        pattern = []
        ents = []
        idx = 0
        for i, line in enumerate(open(loc)):
            idx +=1
            data = eval(line.strip())
#            data = json.loads(line.strip())
            # add the string to the vocab
            #phrase = self.nlp.tokenizer(data["pattern"])
            #phrase = data["pattern"][0]
            ents.append(data["label"])
            # add the pattern to the matcher
            pattern.append(data["pattern"])

            # add the words to the vocab
            #print(f"length of phrases = {len(phrases)}")
    #        print(phrase)
            try:
                phrase = ["pattern"][1]["lower"]
                for w in phrase:
                    _ = self.nlp.tokenizer.vocab[w.text]
            except:
                pass
        return (ents, pattern)
        # for i, line in enumerate(open(loc)):
        #     data = json.loads(line.strip())
        #     #! but then they may only be single words
        #     phrase = self.nlp.tokenizer(data["pattern"])
        #     # add the words to the vocab
        #     print(f"length of phrases = {len(phrase)}")
        #     for w in phrase:
        #         _ = self.nlp.tokenizer.vocab[w.text]
        #     if len(phrase) >= 2:
        #         yield phrase

#*___________________________________________________________
#*___________________________________________________________
    #* CUSTOM PIPE COMPONENTS
    #* The custom pipe components go here.
    #* Their main task is to improve entities by means of matchers.
    #* They are integrated into the pipeline in the pre_process function.
    
    def custom_pipe_component_phrase_entity(self, doc):
        # for ent in doc.ents:
        #     print(ent.text)
        # Apply the matcher to the doc
        matches = self.matcher(doc)
        # Create a Span for each match, using the match ID as the label
        spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
        # Overwrite the doc.ents with the matched spans

        try:
            doc.ents = list(doc.ents) + spans
        except:
            print(f"overlapping Entities with {spans}")
#        doc.ents = spans
        return doc     

    def custom_pipe_component_Name_et_al(self, doc):
        print("entering_custom_pipe_component Name et al")
        new_ents = [] 
        for ent in doc.ents:
            print(f"ent = {ent}")
            # Only check for title if it's a person and not the first token
            replaced = False
        
            if ent.label_ == "PERSON":# and ent.end<len(doc)-2:
                # assign the new label if "et al." is inside the PERSON span or right after it
                if 'et' in ent.text and ('al' in ent.text or 'al.' in ent.text):
                    new_ent = Span(doc, ent.start, ent.end, label="REF")
                    replaced = True
                    print("new ents")
                else:
                    # check whether the tokens that follow are "et al"
                    print("within label Person")
                    next_token = doc[ent.end +  1]
                    next_next_token = doc[ent.end + 2]
                    print(next_token.text)
                    print(next_next_token.text)
                    if next_token.text == "et" and next_next_token.text in ("al.", "al"):
                        new_ent = Span(doc, ent.start, ent.end+2, label="REF")
                        new_ents.append(new_ent)
                        replaced = True
                        print("new_ent")


            # append the new entity
            if replaced:
                new_ents.append(new_ent)
                print('new ent')
            else:
                # carry over the old entity unchanged
                new_ents.append(ent)
                print("old ents")
            
        doc.ents = new_ents
        print(doc.ents)
        return doc     

    def custom_pipe_component_Quantity(self, doc):
        # 10 mg usually comes out as 10 (CARDINAL) mg
        # goal: 10 mg (QUANTITY)
        print("entering_custom_pipe_component Quantity")
        print(doc.text)
        new_ents = []
        for ent in doc.ents:
            print(ent.text)
            print(ent.label_)
            # Only check for title if it's a person and not the first token
            replaced = False
            if ent.label_ == "CARDINAL":# and ent.end<len(doc)-2:
                next_token = doc[ent.end]
                if next_token.text in ["mg", "g"]:
                    new_ent = Span(doc, ent.start, ent.end+1, label="QUANTITY")
                    replaced = True
            # append the new entity
            if replaced:
                new_ents.append(new_ent)
                print('new ent')
            else:
                # carry over the old entity unchanged
                new_ents.append(ent)
                print("old ents")


        try:
            doc.ents = new_ents
        except:
            print("overlapping Entities in Quantity")
            for ent in new_ents:
                print(f"ent = {ent.text}   start = {ent.start}   stop = {ent.end}  label = {ent.label_}")
        #print(doc.ents)
        return doc     



    def custom_pipe_component_set_extension_unit(self, doc):
        pass
#*___________________________________________________________
#*___________________________________________________________
    #* EXTENSION methods
    # The extension methods go here.
    # Their main task is setting user-defined attributes, properties and methods.
    # The main goal is to give certain tokens a new text token in
    # simplified English.
    

    def custom_pipe_comp_alt_text(self, doc):
        # by default, set the alternative text to the original text
        for token in doc:
            token._.alt_text = token.text
            token._.alt_text_trailing_whitespace_ = token.whitespace_
        # now the matcher is called, which searches for various rules;
        # the rules that were found are then handled below and the alternative
        # text is set according to those rules
        matches = self.matcher_alt_text(doc)
        # Handle each match and set the alternative text according to the matched rule
        for match_id, start, end in matches:
            # a number that stands alone and has entity type CARDINAL
            if self.nlp.vocab.strings[match_id]=="NUMCARDINAL":
                doc[start]._.alt_text = "NUM"

            # UNITS
            # when a unit stands alone
            if self.nlp.vocab.strings[match_id]=="UNITS":
                doc[start]._.alt_text = "UNITS"
            # when the unit follows a number as its own token
            if self.nlp.vocab.strings[match_id]=="NUM_UNIT":
                doc[start]._.alt_text = "99"
                doc[start+1]._.alt_text = "UNITS" 
            # when number and unit are written together in one token
            if self.nlp.vocab.strings[match_id]=="NUMUNIT": # number and unit were written together
                doc[start]._.alt_text = "99UNITS"

            if self.nlp.vocab.strings[match_id]=="DRUGNAME":
                doc[start]._.alt_text = "DRUGNAME"
            if self.nlp.vocab.strings[match_id]=="NAMEETAL":
                doc[start]._.alt_text = "REF"
                doc[start+1]._.alt_text = "not to keep"
                doc[start+1]._.alt_text_keep = False
                doc[start+2]._.alt_text = "not to keep"
                doc[start+2]._.alt_text_keep = False
                doc[start+3]._.alt_text = "not to keep"
                doc[start+3]._.alt_text_keep = False
                
            if self.nlp.vocab.strings[match_id]=="REFx":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id]=="REFS":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id]=="REFpunkt":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id]=="XYMIN":
                doc[start]._.alt_text = "XYMIN"
            if self.nlp.vocab.strings[match_id]=="XY-YEARREG":
                doc[start]._.alt_text = "99-year"
            if self.nlp.vocab.strings[match_id]=="XYYEARREG":
                doc[start]._.alt_text = "99year"
            if self.nlp.vocab.strings[match_id]=="XYMINREG":
                doc[start]._.alt_text = "99min"
            if self.nlp.vocab.strings[match_id]=="XY-MINREG":
                doc[start]._.alt_text = "99-min"

            if self.nlp.vocab.strings[match_id]=="XY_PROCENT":
                doc[start]._.alt_text = "99"
                doc[start+1]._.alt_text = "%"

            if self.nlp.vocab.strings[match_id]=="XY-RECEPTOR":
                doc[start]._.alt_text = "XY"
                doc[start+1]._.alt_text = "-"
                doc[start+2]._.alt_text = "receptor"
            if self.nlp.vocab.strings[match_id]=="XY_RECEPTOR":
                doc[start]._.alt_text = "XY"
                doc[start+1]._.alt_text = "receptor"


# {"label":"REFS","pattern":[{"TEXT": "AuthorsEtAl"}]}
# {"label":"REFx","pattern":[{"TEXT": "AuthorEtAl"}]}

#            doc[start]._.alt_text = doc[start].text + " " + self.nlp.vocab.strings[match_id] + " gefunden"
#        spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]

        return doc   

    def custom_matcher_alt_text(self):
        pattern_file = self.pattern_file_custom_matcher_alt_text
        (ents, pattern) = self.read_pattern_matcher_file(pattern_file)
        for i in range(len(ents)):

            self.matcher_alt_text.add(ents[i], None, pattern[i])
 #           self.matcher.add(entity, None, *phrases)
        # pattern = []
        # pattern.append([{'IS_DIGIT': True}, {'LOWER':'ng'}])
        # pattern.append([{'IS_DIGIT': True}, {'LOWER':'mg'}])
        # self.matcher_units2.add('UNITS', None, *pattern)
    
    


    # this function is supposed to set the alternative text of every token
    def custom_pipe_component_set_extension_unit_text(self, doc):
        # call the PhraseMatcher for the units
        #self.matcher_units2 = Matcher(self.nlp.vocab)
        self.add_pattern_jsonl_file_Phrasematcher("./Lib/units.jsonl")
        matches = self.matcher_units(doc)
        # for each unit match, mark the token's alternative text
        for match_id, start, end in matches:
            doc[start]._.alt_text = doc[start].text + "_ unit found"
#        spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]

        return doc     


    def is_unit(self,token):
        return token.text == "mg"

    #def get_alt_text(self,token):
    #    return token._.alt_text




    def add_pattern_jsonl_file_Phrasematcher(self, pattern_file):
        (ents, unit_pattern) = self.read_gazetteer2(pattern_file)
        for i in range(len(ents)):
            #matcher_units.add("Units", None, *list(nlp.pipe(COUNTRIES)))
            self.matcher_units.add("UNITS", None, *list(self.nlp.pipe(unit_pattern)))
#            self.matcher_units.add(ents[i], None, pattern[i])
#           self.matcher.add(entity, None, *phrases)


    def read_gazetteer2(self, loc):
        pattern = []
        ents = []
        idx = 0
        for i, line in enumerate(open(loc)):
            idx +=1
            data = eval(line.strip())
            ents.append(data["label"])
            # add the pattern to the matcher
            pattern.append(data["pattern"])
        return (ents, pattern)



    def read_pattern_matcher_file(self, loc):
        pattern = []
        ents = []
        for i, line in enumerate(open(loc)):
            data = eval(line.strip())
            ents.append(data["label"])
            pattern.append(data["pattern"])
        return (ents, pattern)

#*___________________________________________________________
#*___________________________________________________________
    #* Text extraction from XML to txt
    # Converts the text of the XML documents into plain text;
    # the results are then saved in self.output_dir
    #
    def extract_text(self):
        idx = 0
        for file in os.listdir(self.xml_dir):
            print(f'extract text loop with: {idx}')
            if file.endswith('.xml'):
                input_filename = os.path.join(self.xml_dir, file)
                if len(self.section_names)==1:
                    prefix = self.section_names[0]
                else:
                    prefix = 'section_mix'

                output_filename = os.path.join(self.output_dir, prefix + '_' + file)
                print(output_filename)
                self.__current_xml_files_for_spacy_preprocessing.append(input_filename)

                with open(input_filename, "r", encoding="utf8") as f1:
                    print('-------------------------')
                    print('filename:' + input_filename)
                    xml = f1.read()
                    P = RP.Research_Paper_XMLJSON(xml, "json")
                    P.development_test()
                    #P.analyse_parsing()
                    rtext = ''
                    for section_name in self.section_names:
                        rtext = rtext + P.get_part_of_text(section_name)
                    #print(rtext)

                with open(output_filename,"w+", encoding="utf8") as f2:
                    self._TEXTS.append(rtext)
                    f2.write(rtext)
                idx += 1
            # ! This has to be removed in further versions    
            if idx > 10:
                break



    def get_sentence_alt_text(self, sent):
        # uebergabe eines doc objects /// sentence
        # rueckgabe eines TExtes das den alternativen TExt nutzt
        alt_text = ""
        sent_org_text = sent.text
        for token in sent:
            if token._.alt_text_keep:
                alt_text = alt_text + token._.alt_text + token._.alt_text_trailing_whitespace_
        return alt_text
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.lang.en import English

nlp = spacy.load("en_core_web_md")
animal_patterns = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
patterns = list(nlp.pipe(animal_patterns))
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMALS", None, *patterns)


# define custom component
def animal_component(doc):
    # create span for each match and assign label animal
    # and overwrite the doc.ents with the matched spans
    doc.ents = [
        Span(doc, start, end, label=nlp.vocab.strings["ANIMAL"])
        for match_id, start, end in matcher(doc)
    ]
    return doc


# add the component to the pipeline after the ner component
nlp.add_pipe(animal_component, after='ner')

# process the text and print the text and label for the doc.ents
doc = nlp('I have a cat and a Golden Retriever')
print([(ent.text, ent.label_) for ent in doc.ents])
Example #22
 def __get_country_matcher__(self):
     matcher = PhraseMatcher(self.nlp.vocab)
     countries = ['Czech Republic', 'Australia', 'Germany', 'Slovakia']
     patterns = list(self.nlp.pipe(countries))
     matcher.add('COUNTRY', None, *patterns)
     return matcher
Example #23
 def __get_animal_matcher__(self):
     matcher = PhraseMatcher(self.nlp.vocab)
     animals = ['dog', 'cat', 'mouse', 'dogs', 'cats', 'mice']
     patterns = list(self.nlp.pipe(animals))
     matcher.add('ANIMAL', None, *patterns)
     return matcher
Example #24
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("pt_core_news_sm")
animals = ["Golden Retriever", "gato", "tartaruga", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", animal_patterns)

# Define the custom component
@Language.component("animal_component")
def animal_component_function(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe("animal_component", after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label of doc.ents
doc = nlp("Eu tenho um gato e um Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])
def get_validator_matches(text):
    match_ents.clear()

    matcher = Matcher(nlp.vocab)
    phraseMatcher = PhraseMatcher(nlp.vocab, attr="LEMMA")

    adverbPattern = [{"POS": "ADV"}]
    matcher.add("Adverbs", match_adverb, adverbPattern)

    adjectivePattern = [{"POS": "ADJ"}]
    matcher.add("Adjectives", match_adjective, adjectivePattern)

    pastTenseVerbPattern1 = [{"TAG": "VBD"}]
    pastTenseVerbPattern2 = [{"TAG": "VBN"}]
    matcher.add("Passive Voice", match_passive, pastTenseVerbPattern1,
                pastTenseVerbPattern2)

    infinitivePattern1 = [{"LOWER": "be"}, {"POS": "ADJ"}, {"POS": "ADP"}]
    infinitivePattern2 = [{"LOWER": "to"}, {"POS": "VERB"}]
    matcher.add("Infinitive", match_infinitive, infinitivePattern1,
                infinitivePattern2)

    pronounPattern = [{"POS": "PRON"}]
    matcher.add("Pronoun", match_pronoun, pronounPattern)

    indefiniteArticles = ["a", "an"]
    indefiniteArticlePatterns = [nlp(text) for text in indefiniteArticles]
    phraseMatcher.add("Indefinite Articles", match_indefinite_articles,
                      *indefiniteArticlePatterns)

    vagueTerms = [
        "some", "any", "allowable", "several", "many", "lot of", "a few",
        "almost always", "very nearly", "nearly", "about", "close to",
        "almost", "approximate"
    ]
    vagueTermsPatterns = [nlp(text) for text in vagueTerms]
    phraseMatcher.add("Vague Terms", match_vague_terms, *vagueTermsPatterns)

    escapeClauses = [
        "so far as is possible", "as possible", "as little as possible",
        "where possible", "as much as possible",
        "if it should prove necessary", "if necessary",
        "to the extent necessary", "as appropriate", "as required",
        "to the extent practical", "if practicable"
    ]
    escapeClausesPatterns = [nlp(text) for text in escapeClauses]
    phraseMatcher.add("Escape Clauses", match_escape_clauses,
                      *escapeClausesPatterns)

    openEndedClauses = ["including but not limitedd to", "etc", "and so on"]
    openEndedPatterns = [nlp(text) for text in openEndedClauses]
    phraseMatcher.add("Open Ended Clauses", match_open_ended_clauses,
                      *openEndedPatterns)

    notTerms = ["not"]
    notPatterns = [nlp(text) for text in notTerms]
    phraseMatcher.add("Negations", match_negations, *notPatterns)

    universalQuantifiers = [
        "all", "any", "both", "completely", "prompt", "fast", "minimum",
        "maximum", "optimum"
    ]
    universalPatterns = [nlp(text) for text in universalQuantifiers]
    phraseMatcher.add("Immeasurable Quantifiers", match_universal_quantifier,
                      *universalPatterns)

    temporalDependencies = [
        "eventually", "before", "when", "after", "as", "once", "earliest",
        "latest", "instantaneous", "simultaneous", "while", "at last"
    ]
    temporalPatterns = [nlp(text) for text in temporalDependencies]
    phraseMatcher.add("Temporal Dependencies", match_temporal,
                      *temporalPatterns)

    doc = nlp(text)
    matches = matcher(doc)
    lowercaseDoc = nlp(text.lower())
    phraseMatches = phraseMatcher(lowercaseDoc)
    match_ents.sort(key=lambda x: x["start"])
    return match_ents
Example #26
import io
import re
import spacy

nlp = spacy.load('en_core_web_sm')

from spacy.matcher import PhraseMatcher
from spacy.matcher import Matcher

phrase_matcher = PhraseMatcher(nlp.vocab)
matcher = Matcher(nlp.vocab)
matcher1 = Matcher(nlp.vocab)


def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)


def most_common(lst):
    return (max(lst, key=lst.count))


def listToString(s):
    str1 = " "
    return (str1.join(s))


def extract_full_name(nlp_doc, no_of_word):
    names = []
    if no_of_word >= 3:
        pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]
Example #27
 def __init__(self, nlp, terms, label):
     patterns = [nlp(term) for term in terms]
     self.matcher = PhraseMatcher(nlp.vocab)
     self.matcher.add(label, None, *patterns)
Example #28
class Sectionizer:
    name = "sectionizer"

    def __init__(self,
                 nlp,
                 patterns="default",
                 add_attrs=False,
                 max_scope=None):
        self.nlp = nlp
        self.add_attrs = add_attrs
        self.matcher = Matcher(nlp.vocab)
        self.max_scope = max_scope
        self.phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.assertion_attributes_mapping = None
        self._patterns = []
        self._section_titles = set()

        if patterns is not None:
            if patterns == "default":
                import os
                if not os.path.exists(DEFAULT_RULES_FILEPATH):
                    raise FileNotFoundError(
                        "The expected location of the default patterns file cannot be found. Please either "
                        "add patterns manually or add a jsonl file to the following location: ",
                        DEFAULT_RULES_FILEPATH)
                self.add(self.load_patterns_from_jsonl(DEFAULT_RULES_FILEPATH))
            # If a list, add each of the patterns in the list
            elif isinstance(patterns, list):
                self.add(patterns)
            elif isinstance(patterns, str):
                import os
                assert os.path.exists(patterns)
                self.add(self.load_patterns_from_jsonl(patterns))

        if add_attrs is False:
            self.add_attrs = False
        elif add_attrs is True:
            self.assertion_attributes_mapping = DEFAULT_ATTRS
            self.register_default_attributes()
        elif isinstance(add_attrs, dict):
            # Check that each of the attributes being added has been set
            for modifier in add_attrs.keys():
                attr_dict = add_attrs[modifier]
                for attr_name, attr_value in attr_dict.items():
                    if not Span.has_extension(attr_name):
                        raise ValueError(
                            "Custom extension {0} has not been set. Call Span.set_extension."
                            .format(attr_name))

            self.add_attrs = True
            self.assertion_attributes_mapping = add_attrs

        else:
            raise ValueError(
                "add_attrs must be either True (default), False, or a dictionary, not {0}"
                .format(add_attrs))

    @property
    def patterns(self):
        return self._patterns

    @property
    def section_titles(self):
        return self._section_titles

    @classmethod
    def load_patterns_from_jsonl(cls, filepath):

        import json
        patterns = []
        with open(filepath) as f:
            for line in f:
                if line.startswith("//"):
                    continue
                patterns.append(json.loads(line))

        return patterns

    def register_default_attributes(self):
        """Register the default values for the Span attributes defined in DEFAULT_ATTRS."""
        for attr_name in [
                "is_negated",
                "is_uncertain",
                "is_historical",
                "is_hypothetical",
                "is_family",
        ]:
            try:
                Span.set_extension(attr_name, default=False)
            except ValueError:  # Extension already set
                pass

    def add(self, patterns):
        """Add a list of patterns to the clinical_sectionizer. Each pattern should be a dictionary with
       two keys:
           'section': The normalized section name of the section, such as 'pmh'.
           'pattern': The spaCy pattern matching a span of text.
               Either a string for exact matching (case insensitive)
               or a list of dicts.

       Example:
       >>> patterns = [ \
           {"section_title": "past_medical_history", "pattern": "pmh"}\
           {"section_title": "past_medical_history", "pattern": [{"LOWER": "past", "OP": "?"}, \
               {"LOWER": "medical"}, \
               {"LOWER": "history"}]\
               },\
           {"section_title": "assessment_and_plan", "pattern": "a/p:"}\
           ]
       >>> clinical_sectionizer.add(patterns)
       """
        for pattern_dict in patterns:
            name = pattern_dict["section_title"]
            pattern = pattern_dict["pattern"]
            if isinstance(pattern, str):
                self.phrase_matcher.add(name, None, self.nlp.make_doc(pattern))
            else:
                self.matcher.add(name, [pattern])
            self._patterns.append(pattern_dict)
            self._section_titles.add(name)

    def set_assertion_attributes(self, ents):
        """Add Span-level attributes to entities based on which section they occur in.

        Args:
            ents: the entities whose attributes will be set, based on the section they occur in

        """
        for ent in ents:
            if ent._.section_title in self.assertion_attributes_mapping:
                attr_dict = self.assertion_attributes_mapping[
                    ent._.section_title]
                for (attr_name, attr_value) in attr_dict.items():
                    setattr(ent._, attr_name, attr_value)

    def __call__(self, doc):
        matches = self.matcher(doc)
        matches += self.phrase_matcher(doc)
        matches = prune_overlapping_matches(matches)
        if len(matches) == 0:
            doc._.sections.append((None, None, doc[0:]))
            return doc

        first_match = matches[0]
        section_spans = []
        if first_match[1] != 0:
            section_spans.append((None, None, doc[0:first_match[1]]))
        for i, match in enumerate(matches):
            (match_id, start, end) = match
            section_header = doc[start:end]
            name = self.nlp.vocab.strings[match_id]
            # If this is the last match, it should include the rest of the doc
            if i == len(matches) - 1:
                if self.max_scope is None:
                    section_spans.append((name, section_header, doc[start:]))
                else:
                    section_spans.append((name, section_header,
                                          doc[start:end + self.max_scope]))
            # Otherwise, go until the next section header
            else:
                next_match = matches[i + 1]
                _, next_start, _ = next_match
                if self.max_scope is None:
                    section_spans.append(
                        (name, section_header, doc[start:next_start]))
                else:
                    section_spans.append((name, section_header,
                                          doc[start:end + self.max_scope]))

        for name, header, section in section_spans:
            doc._.sections.append((name, header, section))
            for token in section:
                token._.section_span = section
                token._.section_title = name
                token._.section_header = header

        # If it is specified to add assertion attributes,
        # iterate through the entities in doc and add them
        if self.add_attrs is True:
            self.set_assertion_attributes(doc.ents)
        return doc
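A minimal usage sketch for this Sectionizer, assuming prune_overlapping_matches, DEFAULT_RULES_FILEPATH and the custom Doc/Token section extensions it relies on are provided elsewhere in the same module:

# Hedged wiring sketch; extension registration is normally done by the surrounding package.
import spacy
from spacy.tokens import Doc, Token

if not Doc.has_extension("sections"):
    Doc.set_extension("sections", default=[])   # simplified: a shared default list
for attr in ("section_span", "section_title", "section_header"):
    if not Token.has_extension(attr):
        Token.set_extension(attr, default=None)

nlp = spacy.load("en_core_web_sm")
sectionizer = Sectionizer(nlp, patterns=[
    {"section_title": "past_medical_history", "pattern": "pmh:"},
    {"section_title": "assessment_and_plan", "pattern": "a/p:"},
])
nlp.add_pipe(sectionizer)   # spaCy v2-style pipeline registration

doc = nlp("PMH: diabetes, hypertension. A/P: start metformin.")
for title, header, section in doc._.sections:
    print(title, "->", section.text)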
Beispiel #29
0
#
# show_ents(doc3)

# from spacy.tokens import Span
# doc = nlp(u'Tesla to build a BR factory for alot of money')
# ORG = doc.vocab.strings[u'ORG']
# print(ORG)
#
# print(doc.ents)

from spacy.tokens import Span
doc = nlp(u'Our company created a brand new vacuum cleaner. '
          u'This new vacuum-cleaner is the best in show.')
show_ents(doc)
from spacy.matcher import PhraseMatcher
encontrador = PhraseMatcher(nlp.vocab)
lista_frase = ['vacuum cleaner', 'vacuum-cleaner']
padroes_frase = [nlp(text) for text in lista_frase]
encontrador.add('novoproduto', None, *padroes_frase)
found_matches = encontrador(doc)
print(found_matches)
from spacy.tokens import Span
PROD = doc.vocab.strings[u'PRODUCT']  # PRODUCT is the built-in entity label we want to assign to the matched words
print(found_matches)
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]  # match[1] and match[2] are the start and end token indices of the phrase we want to add as an entity
doc.ents = list(doc.ents) + new_ents
show_ents(doc)
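Note that writing doc.ents = list(doc.ents) + new_ents raises an error if one of the new spans overlaps an entity the model already predicted; a small workaround sketch using spacy.util.filter_spans (available in recent spaCy 2.x releases) is:

# Sketch: keep only the longest non-overlapping spans before writing doc.ents.
from spacy.util import filter_spans

new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]
doc.ents = filter_spans(list(doc.ents) + new_ents)
show_ents(doc)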
doc_encontra = nlp(u'Originally I paid $29.95 for this card, but now this card is much more expensive. It is now 50 dollars')
test = len([ent for ent in doc_encontra.ents if ent.label_ == 'MONEY'])
print(test)
class MultiExtractorComponent(object):
    def __init__(self, nlp, ontoDict):
        # add ontology and label from ontoDict
        self.ontoDict = ontoDict
        self.all_labels = ""

        # stop words, don't try to match these
        stopwords = nlp.Defaults.stop_words
        stopwords.add("ands")
        stopwords.add("ends")
        stopwords.add("ci")

        self.ontols = []

        ontologies = ontoDict["ontologies"]
        for ontology in ontologies:
            for key, value in ontology.items():
                if (key == "label"):
                    self.all_labels = self.all_labels + value
                if (key == "ontology"):
                    self.ontols.append(value)
        # print("self.ontols: ", self.ontols)
        # for x in self.ontols:
        #     print("got x: ", x)
        # print("all_labels = ", self.all_labels)

        # for making plural forms of labels for text matching
        engine = inflect.engine()

        # init terms and patterns
        self.terms = {}
        patterns = []

        #build unified table of all ID, IRI, Label and Synonyms:
        for ontol in self.ontols:  #should be all ontols in
            print("checking ontol: ", ontol)
            for termid in ontol.get_classes():
                # print("k is: ", k)
                termshortid = ontol.get_id_for_iri(termid)

                label = ontol.get_annotation(termid, RDFSLABEL)
                definition = ontol.get_annotation(termid, DEFINITION)
                if label:
                    term_entry = {
                        'id': termid if termshortid is None else termshortid,
                        'name': label.strip(),
                        'definition': definition
                    }
                if label is not None and label.strip().lower() not in stopwords:
                    self.terms[label.strip().lower()] = term_entry
                    patterns.append(nlp.make_doc(label.strip().lower()))
                    plural = engine.plural(label.strip())
                    self.terms[plural.lower()] = term_entry
                    patterns.append(nlp.make_doc(plural.lower()))
                synonyms = ontol.get_annotations(termid, SYN)
                for s in synonyms:
                    # print("adding SYNONYM in ontotagtext: ", s)
                    if s.strip().lower() not in stopwords:
                        self.terms[s.strip().lower()] = term_entry
                        patterns.append(nlp.make_doc(s.strip().lower()))
                        try:
                            plural = engine.plural(s.strip().lower())
                            self.terms[plural.lower()] = term_entry
                            patterns.append(nlp.make_doc(plural.lower()))
                        except Exception:
                            print("Problem getting plural of ", s)
                            continue

        # initialize matcher and add patterns
        self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self.matcher.add(self.all_labels, None, *patterns)

        # set extensions to tokens, spans and docs
        Token.set_extension("is_ontol_term", default=False, force=True)
        Token.set_extension("ontol_id", default=False, force=True)
        Token.set_extension("merged_concept", default=False, force=True)
        Doc.set_extension("has_ontols", getter=self.has_ontols, force=True)
        Doc.set_extension("ontols", default=[], force=True)
        Span.set_extension("has_ontols", getter=self.has_ontols, force=True)

    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = [
            Span(doc, match[1], match[2], label=self.all_labels)
            for match in matches
        ]
        for i, span in enumerate(spans):
            span._.set("has_ontols", True)
            for token in span:
                if span.text.lower() in self.terms:
                    token._.set("is_ontol_term", True)
                    token._.set("ontol_id",
                                self.terms[span.text.lower()]["id"])
                else:
                    print("Term not found: ", span.text.lower())

        with doc.retokenize() as retokenizer:
            for span in filter_spans(spans):
                retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
                doc._.ontols = list(doc._.ontols) + [span]

        return doc

    # getter function for doc level
    def has_ontols(self, tokens):
        return any([t._.get("is_ontol_term") for t in tokens])

    def get_term(self, term_id):
        # print("getting term")
        if term_id in [v['id'] for v in self.terms.values()]:
            keys = [
                k for k, v in self.terms.items()
                if v['id'].strip() == term_id.strip()
            ]
            return self.terms[keys[0]]
        else:
            return None

    def get_label(self, label):
        # print("getting label")
        if label.strip().lower() in [
                v['name'].strip().lower() for v in self.terms.values()
        ]:
            keys = [
                k for k, v in self.terms.items()
                if v['name'].strip().lower() == label.strip().lower()
            ]
            return self.terms[keys[0]]
        else:
            return None
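A rough usage sketch for this component, assuming ontoDict has been assembled elsewhere in the shape the constructor expects ({"ontologies": [{"label": ..., "ontology": <ontology object>}, ...]}) and that inflect and the ontology helper constants (RDFSLABEL, DEFINITION, SYN) are imported above:

# Hypothetical wiring; ontoDict and the ontology objects are assumed to exist.
import spacy

nlp = spacy.load("en_core_web_sm")
extractor = MultiExtractorComponent(nlp, ontoDict)
nlp.add_pipe(extractor, last=True)   # spaCy v2-style registration

doc = nlp("Samples were collected from hepatocytes and T cells.")
print("Doc has ontology terms:", doc._.has_ontols)
for token in doc:
    if token._.is_ontol_term:
        print(token.text, "->", token._.ontol_id)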
print("Please wait whilst spaCy language library is loaded...")
nlp = spacy.load('en_core_web_md')
"""
//////////////////////////////////////////////////////
Change global values for bad words here
//////////////////////////////////////////////////////
"""
BAD_STEM_WORDS_LIST = [
    "you", "option", "accurate", "correct", "true", "can be", "only",
    "statement"
]
BAD_OPTION_WORDS_LIST = ["only", "statement", "all of the above"]

# Create spaCy PhraseMatchers (lowercase for case-insensitivity)
dnd_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
dnd_term = ["Drag and drop the"]
dnd_patterns = [nlp.make_doc(text) for text in dnd_term]
dnd_matcher.add("TerminologyList", None, *dnd_patterns)

canbe_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
canbe_term = ["can be"]
canbe_patterns = [nlp.make_doc(text) for text in canbe_term]
canbe_matcher.add("TerminologyList", None, *canbe_patterns)

negative_matcher = Matcher(nlp.vocab)
negative_matcher.add("NegativeList", None, [{
    'POS': 'VERB'
}, {
    'DEP': 'neg'
}], [{
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
Beispiel #33
0
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import json

with open("exercises/en/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/en/country_text.txt", encoding="utf8") as f:
    TEXT = f.read()

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", patterns)

# Create a doc and reset existing entities
doc = nlp(TEXT)
doc.ents = []

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)
Beispiel #34
0
class RESTCountriesComponent(object):
    """spaCy v2.0 pipeline component that requests all countries via
    the REST Countries API, merges country names into one token, assigns entity
    labels and sets attributes on country tokens.
    """
    name = 'rest_countries'  # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # The remaining attributes default to None until a match fills them in.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital', default=None)
        Token.set_extension('country_latlng', default=None)
        Token.set_extension('country_flag', default=None)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital',
                            self.countries[entity.text]['capital'])
                token._.set('country_latlng',
                            self.countries[entity.text]['latlng'])
                token._.set('country_flag',
                            self.countries[entity.text]['flag'])
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_country') for t in tokens])
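A usage sketch along the lines of the original spaCy example, assuming requests is imported above and the REST endpoint is reachable:

# Minimal wiring sketch (spaCy v2 API); the HTTP request happens in __init__.
from spacy.lang.en import English

nlp = English()
rest_countries = RESTCountriesComponent(nlp)
nlp.add_pipe(rest_countries)

doc = nlp("Some text about Colombia and the Czech Republic")
print("Pipeline:", nlp.pipe_names)
print("Doc has country:", doc._.has_country)
for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng)
print("Entities:", [(e.text, e.label_) for e in doc.ents])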
Beispiel #35
0
    python -m spacy download en
'''
import spacy
nlp = spacy.load('en')  # load the English language model
# doc = nlp("Tea is healthy and calming, don't you think?")   # doc is a document object, containing tokens
# iterate over the doc and print each token
# for token in doc:
#     print(token)

# # check the lemma and stopword flag of each token
# print("Token\t\tLemma\t\tStopword")
# print('-'*40)
# for token in doc:
#     print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')    # build a case-insensitive phrase matcher
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google pixel']
patterns = [nlp(text) for text in terms]    # build Doc patterns for the terms to match
matcher.add("TerminologyList", patterns)    # register the patterns under the rule ID "TerminologyList"

text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.")   # the text to search

matches = matcher(text_doc)     # run the matcher over text_doc

print(matches)
match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_doc[start:end])
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].set_morph("Feat=Val")
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])
        with pytest.raises(ValueError):
            matcher.add("TEST1", [doc1])
        with pytest.raises(ValueError):
            matcher.add("TEST3", [doc3])
    # TEXT/ORTH only require tokens
    matcher = PhraseMatcher(en_vocab, attr="ORTH")
    matcher.add("TEST3", [doc3])
    matcher = PhraseMatcher(en_vocab, attr="TEXT")
    matcher.add("TEST3", [doc3])
Beispiel #37
0
# load pre-trained model
base_path = os.path.dirname(__file__)


nlp = spacy.load('en_core_web_sm')
custom_nlp2 = spacy.load(os.path.join(base_path,"degree","model"))
custom_nlp3 = spacy.load(os.path.join(base_path,"company_working","model"))

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

file = os.path.join(base_path,"titles_combined.txt")
file = open(file, "r", encoding='utf-8')
designation = [line.strip().lower() for line in file]
designitionmatcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(text) for text in designation if len(nlp.make_doc(text)) < 10]
designitionmatcher.add("Job title", None, *patterns)

file = os.path.join(base_path,"LINKEDIN_SKILLS_ORIGINAL.txt")
file = open(file, "r", encoding='utf-8')    
skill = [line.strip().lower() for line in file]
skillsmatcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(text) for text in skill if len(nlp.make_doc(text)) < 10]
skillsmatcher.add("Job title", None, *patterns)


class resumeparse(object):

    objective = (
        'career goal',
Beispiel #38
0
class MatcherPipe(object):
    name = EXCELCY_MATCHER

    def __init__(self, nlp, patterns: list = None):
        """
        SpaCy pipe to match Entity based on multiple patterns.

        Pattern examples:
        patterns = [
            {'kind': 'phrase', 'value': 'amazon', 'entity': 'PRODUCT'},
            {'kind': 'regex', 'value': 'ama(.+)', 'entity': 'PRODUCT'}
        ]

        :param nlp: The NLP object
        :param patterns: The matcher patterns
        """
        self.nlp = nlp
        self.phrase_matcher = PhraseMatcher(nlp.vocab)
        self.matcher = Matcher(nlp.vocab)

        self.extra_patterns = []
        # start add pattern
        self.add_patterns(patterns=patterns or [])

    def add_patterns(self, patterns: list):
        """
        Add a list of patterns to the underlying matchers.

        :param patterns: List of pattern dicts (see the class docstring for the expected keys)
        """
        for pattern in patterns:
            kind, value, entity = pattern.get('kind'), pattern.get(
                'value'), pattern.get('entity')
            self.add_pattern(kind=kind, value=value, entity=entity)

    def add_pattern(self, kind: str, value, entity: str):
        """
        Add pattern into matcher algorithm. There are two different types:
        - phrase: This uses PhraseMatcher which described in https://spacy.io/usage/linguistic-features#adding-phrase-patterns
        - regex: This uses Matcher which described in https://spacy.io/usage/linguistic-features#regex

        :param kind: Pattern matcher type, either 'phrase' or 'regex'
        :param value: The pattern value: a phrase string or a regular expression
        :param entity: Entity label to assign to matched spans
        """
        if kind == 'phrase':
            self.phrase_matcher.add(entity, None, *[self.nlp(value)])
        elif kind == 'regex':
            regex_flag = self.nlp.vocab.add_flag(
                lambda text: self.eval_regex(pattern=value, text=text))
            self.matcher.add(entity, None, [{regex_flag: True}])

    def eval_regex(self, pattern, text):
        return bool(re.compile(pattern).match(text))

    def __call__(self, doc: Doc):
        """
        The spaCy pipeline entry point.
        :param doc: The Doc object to process.
        """

        # get matches
        phrase_matches = self.phrase_matcher(doc)
        matches = self.matcher(doc)

        # process them
        for match_id, start, end in phrase_matches + matches:
            # start add them into entities list
            entity = (match_id, start, end)
            doc.ents += (entity, )

        return doc
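A hedged usage sketch for MatcherPipe with a single phrase pattern (a regex pattern dict would follow the same shape with kind='regex'); the model-free blank pipeline is an assumption made here to keep the example self-contained:

# Sketch only: one phrase pattern, registered with the spaCy v2-style add_pipe.
import spacy

nlp = spacy.blank("en")
matcher_pipe = MatcherPipe(nlp, patterns=[
    {"kind": "phrase", "value": "amazon", "entity": "PRODUCT"},
])
nlp.add_pipe(matcher_pipe, last=True)

doc = nlp("I ordered the cable from amazon last week.")
print([(ent.text, ent.label_) for ent in doc.ents])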
Beispiel #39
0
class NLP_Processer :

    # def __init__ (self, disease_pattern_loc = os.path.join("NLP_PhaseMatcher_version","disease_pattern.json") , search_pattern_loc = os.path.join("NLP_PhaseMatcher_version","search_pattern.json"), 
    #                 syndrome_pattern_loc = os.path.join("NLP_PhaseMatcher_version","syndrome_pattern.json"), disease_catogary_loc = os.path.join("NLP_PhaseMatcher_version","disease_catogary.json")):
    # Yahnis windows' version
    def __init__ (self, disease_pattern_loc = Path(sourcepath,"NLP_PhaseMatcher_version/disease_pattern.json") , search_pattern_loc = Path(sourcepath,"NLP_PhaseMatcher_version/search_pattern.json"), 
                    syndrome_pattern_loc = Path(sourcepath,"NLP_PhaseMatcher_version/syndrome_pattern.json"), disease_catogary_loc = Path(sourcepath,"NLP_PhaseMatcher_version/disease_catogary.json"), geocode_service = True):
        self.nlp = spacy.load('en_core_web_sm')
        self.matcher = PhraseMatcher(self.nlp.vocab, attr='LOWER', max_length=5)
        self.load_pattern(disease_pattern_loc)
        self.load_pattern(syndrome_pattern_loc)
        self.load_search_pattern(search_pattern_loc)
        self.location_checker = Location_Checker()
        self.publication_date = "2020-xx-xx xx:xx:xx"
        self.keyword_location = []
        self.keyword_frequency = []
        self.keyword_list = []
        self.disease_to_family_dic = {}
        self.family_to_syndrome_dic = {}
        self.load_dictionary(disease_catogary_loc)

    def set_publication_date(self, date) :
        self.publication_date = date

    def get_keyword_location(self) :
        return self.keyword_location
    
    def get_keyword_frequency(self) :
        return self.keyword_frequency
    
    def get_keyword_list(self) :
        return self.keyword_list

    def load_dictionary(self,location) :
        with open(location) as f:
            datas = json.load(f)
        f.close()

        for  data in datas:
            family = data["family"]
            for disease in data["disease"] :
                self.disease_to_family_dic[disease] = family
            self.family_to_syndrome_dic[family] = data["syndrome"]

    def load_pattern(self,location) :
        with open(location) as f:
            datas = json.load(f)
        f.close()

        for  data in datas:
            name = data["name"]
            general_names = data["general_names"]
            patterns = [self.nlp.make_doc(text) for text in general_names]
            self.matcher.add(name, None, *patterns)

    def load_search_pattern(self,location) :
        with open(location) as f:
            data = json.load(f)
        f.close()
        patterns = [self.nlp.make_doc(text) for text in data["keywords"]]
        self.matcher.add(data["name"], None, *patterns)
    
    def category_report(self, event_date, locations, diseases, syndromes) :
        syndrome_usage = {}
        diseases_not_captured = []
        disease_same_family = {}
        reports = []
        for syndrome in syndromes :
            syndrome_usage[syndrome] = False
        for disease in diseases :
            if disease in self.disease_to_family_dic.keys() :
                family_name = self.disease_to_family_dic[disease]
                if family_name in disease_same_family.keys() :
                    disease_same_family[family_name].append(disease)
                else :
                    disease_same_family[family_name] = [disease]
            else :
                diseases_not_captured.append(disease)
        for k, v in disease_same_family.items() :
            syndrome_list = []
            possible_sydromes = self.family_to_syndrome_dic[k]
            for syndrome in syndromes :
                if syndrome in possible_sydromes :
                    syndrome_list.append(syndrome)
                    syndrome_usage[syndrome] = True
            d = {}
            d["event_date"] = event_date
            d["locations"] = locations
            d["diseases"] = v
            d["syndromes"] = syndrome_list
            reports.append(d)
        if diseases_not_captured == [] :
            if reports == [] :
                print("weird article!")
                d = {}
                d["event_date"] = event_date
                d["locations"] = locations
                d["diseases"] = diseases
                d["syndromes"] = syndromes
                reports.append(d)
            else :
                for k, v in syndrome_usage.items() :
                    if v == False :
                        reports[0]["syndromes"].append(k)
        else :
            d = {}
            d["event_date"] = event_date
            d["locations"] = locations
            d["diseases"] = diseases_not_captured
            syndrome_list = []
            for k, v in syndrome_usage.items() :
                    if v == False :
                        syndrome_list.append(k)
            d["syndromes"] = syndrome_list
            reports.append(d)
        return reports

        

    def make_reports(self, text) :
        doc = self.nlp(text)
        matches = self.matcher(doc)
        text_length = len([token.text for token in doc])
        disease_dic = {}
        syndrome_dic = {}
        search_dic = {}
        for match_id, start, end in matches:
            category = self.nlp.vocab.strings[match_id]
            span = doc[start:end]
            temp = re.search("^([A-Z]{3})-(.+)$",category)
            # print(category + "  " + span.text)
            if temp == None :
                if str(span).lower() in search_dic :
                    search_dic[str(span).lower()] +=1
                else :
                    search_dic[str(span).lower()] = 1
            elif temp.group(1) == "DIS" :
                if temp.group(2) in disease_dic :
                    disease_dic[temp.group(2)] +=1
                else :
                    disease_dic[temp.group(2)] = 1
            elif temp.group(1) == "SYN" :
                if temp.group(2) in syndrome_dic :
                    syndrome_dic[temp.group(2)] +=1
                else :
                    syndrome_dic[temp.group(2)] = 1
        # At this stage disease and syndrome parts are done
        temp = dict(disease_dic, **syndrome_dic)
        keyword_dic = dict(temp,**search_dic)
        keyword_dic = dict(sorted(keyword_dic.items(), key=lambda kv: kv[1], reverse=True))
        keyword_dic = dict((k.lower(), round(v/text_length,8)) for k,v in keyword_dic.items())
        #This is for date and location
        temp = re.search("^([0-9]{4})-([0-9]{2})-([0-9]{2}) ([0-9]{2}|x{2}):([0-9]{2}|x{2}):([0-9]{2}|x{2})$", self.publication_date)
        if temp == None :
            print ("error error error nlp processer publication date!")
        else :
            test = Date_Formater(year = temp.group(1), month = int(temp.group(2)))
        country_dic = {}
        location_dic = {}
        for ent in doc.ents:
            text = ent.text
            if ent.label_ == "TIME" :
                test.add_time(text)
            elif ent.label_ == "DATE" or ent.label_ == "ORG" :
                test.add_date(text)
            elif ent.label_ == "GPE" :
                text = text.replace(".","")
                country = self.location_checker.get_country(text)
                if country == None :
                    temp = re.search(r"[0-9]|:|;|\(|\)|\"|\'|\\|\/|@|Discover|\`|\=|\+|\?|\!", text)
                    if temp == None and len(text) > 3:
                        if text in location_dic :
                            location_dic[text] += 1
                        else :
                            location_dic[text] = 1
                else :
                    if country in country_dic :
                        country_dic[country] += 1
                    else :
                        country_dic[country] = 1
        # convert to json
        location_handler = Geocode_Location()
        location_handler.load_locations_countires(sorted(location_dic.keys()), sorted(country_dic.keys()))
        event_date = test.get_event_date()
        locations = location_handler.get_locations()
        diseases = sorted(disease_dic.keys())
        syndromes = sorted(syndrome_dic.keys())
        self.keyword_location = sorted(location_handler.get_location_keywords())
        self.keyword_frequency = keyword_dic
        temp = sorted(self.keyword_frequency.keys())
        for a in temp :
            self.keyword_list.append(a)
        return self.category_report(event_date, locations, diseases, syndromes)
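A usage sketch for this class, assuming sourcepath, the JSON pattern files and the helper classes it uses (Location_Checker, Geocode_Location, Date_Formater) are available as in the surrounding project; the article text is an invented placeholder:

# Hedged example; file paths and helper classes come from the surrounding project.
article_text = (
    "A measles outbreak was reported in Sydney on 21 February, "
    "with health authorities confirming twelve cases."
)

processer = NLP_Processer()                       # loads patterns from the default JSON files
processer.set_publication_date("2020-03-01 10:30:00")
reports = processer.make_reports(article_text)

for report in reports:
    print(report["event_date"], report["locations"],
          report["diseases"], report["syndromes"])
print(processer.get_keyword_frequency())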