def test_usage_multiple_components(nlp):
    entity1 = Entity(nlp, keywords_list=keyword_list(), label='ACME_1')
    nlp.add_pipe(entity1, first=False, name='entity1')
    entity2 = Entity(nlp, keywords_dict=keyword_dict(), label='ACME_2')
    nlp.add_pipe(entity2, first=False, name='entity2')
    doc = nlp(u"I am a product manager for a java_2e platform and python.")
    assert doc._.has_entities
    assert len(doc._.entities) == 3
    assert doc[:8]._.has_entities
    assert doc[3]._.entity_desc == 'product manager'
    assert len(doc[:4]._.entities) == 1
    assert doc[6]._.entity_desc == 'java_2e'
    assert doc[9]._.entity_desc == 'python'
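The `nlp` fixture and the `keyword_list()` / `keyword_dict()` helpers referenced by these tests are not shown in this collection; the sketch below is a guess at their shape (the exact keyword values are placeholders, not the project's real fixtures).

import pytest
import spacy


@pytest.fixture()
def nlp():
    # A blank English pipeline is enough; the Entity component does the matching.
    return spacy.blank('en')


def keyword_list():
    # Placeholder keywords covering the phrases asserted in the tests.
    return ['python', 'product manager', 'java platform']


def keyword_dict():
    # Keyword -> description, surfaced through token._.entity_desc.
    return {'python': 'python', 'product manager': 'product manager', 'java_2e': 'java_2e'}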
    def __init__(self, entities: list, remove_stopwords: bool = True):
        # Build a spaCy pipeline with a lookup Entity component for the given keywords.
        self.spacyload = spacy.load('en_core_web_sm')
        self.ext = Entity(keywords_list=entities)
        self.spacyload.add_pipe(self.ext, last=True)
        self.error_text = 0
        self.no_stopwords = remove_stopwords
        # NLTK English stop words, used when remove_stopwords is True.
        self.stopwords = set(stopwords.words('english'))
def test_usage_no_entity(nlp):
    entity = Entity(nlp, keywords_list=keyword_list(), label='ACME')
    nlp.add_pipe(entity, last=True)
    doc = nlp(u"This is a sentence without entities.")
    assert not doc._.has_entities
    for token in doc:
        assert not token._.is_entity
def test_usage_multiple_entities_from_list(nlp):
    entity = Entity(nlp, keywords_list=keyword_list(), label='ACME')
    nlp.add_pipe(entity, last=True)
    doc = nlp(u"I am a product manager for a java platform and python.")
    assert doc._.has_entities
    assert len(doc._.entities) == 2
    assert doc[:8]._.has_entities
    assert len(doc[:8]._.entities) == 1
Example #5
    def create_ner_detector(file_directory):
        nlp = spacy.blank('en')
        with open(file_directory) as f:
            reference_data = f.read().splitlines()

        entity = Entity(keywords_list=reference_data)
        nlp.add_pipe(entity)
        return nlp
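A possible way to call this helper, assuming it is used as a plain function and that 'keywords.txt' (a hypothetical file) contains one keyword per line.

# Hypothetical usage; 'keywords.txt' is an assumed one-keyword-per-line file.
nlp = create_ner_detector('keywords.txt')
doc = nlp("This text mentions one of the reference keywords.")
print([token.text for token in doc if token._.is_entity])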
def test_usage_multiple_entities_from_dict(nlp):
    entity = Entity(nlp, keywords_dict=keyword_dict(), label='ACME')
    nlp.add_pipe(entity, last=True)
    doc = nlp(u"I am a product manager for a java_2e platform and python.")
    assert doc._.has_entities
    assert len(doc._.entities) == 2
    assert doc[:8]._.has_entities
    assert doc[3]._.entity_desc == 'product manager'
    assert len(doc[:4]._.entities) == 1
    assert doc[6]._.entity_desc == 'java_2e'
Example #7
def test_usage_entities_from_file(nlp, file_name, keyword_dict):
    keyword_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)
    entity = Entity(keywords_file=keyword_file, keywords_dict=keyword_dict, label='ACME')
    nlp.add_pipe(entity, last=True)
    doc = nlp(u"I am a product manager for a java_2e platform and python.")
    assert doc._.has_entities
    assert len(doc._.entities) == 2
    assert doc[:8]._.has_entities
    assert doc[3]._.entity_desc == 'product manager'
    assert len(doc[:4]._.entities) == 1
    assert doc[6]._.entity_desc == 'java_2e platform'
def extract_tags(query):
    # Assumes a module-level spaCy pipeline `nlp` has already been loaded.
    with open('tags.txt', 'rb') as fp:
        tags = pickle.load(fp)
        entity = Entity(keywords_list=tags)
        nlp.add_pipe(entity, name="entity", last=True)
        doc = nlp(query)
        tags_taken, tag_indices = [], []
        for name, index, named in doc._.entities:
            tag_indices.append(index)
            tags_taken.append(name)
        nlp.remove_pipe("entity")
        return tags_taken, tag_indices
def extractTags():
    # Presumably a Flask view: `request` comes from the surrounding web app.
    query = request.args.get('query')
    with open('tags.txt', 'rb') as fp:
        tags = pickle.load(fp)
        nlp = spacy.load('en')
        entity = Entity(keywords_list=tags)
        nlp.add_pipe(entity, last=True)
        # txt = unicode(query,encoding="utf-8")
        doc = nlp(query)
        tagstaken = []
        for name, index, named in doc._.entities:
            tagstaken.append(name)
        response = {"data": tagstaken}
        return json.dumps(response)
Example #10
    def __init__(self, email: str, outprefix: str):
        Entrez.email = email
        self.paths = PhenoXPaths(outprefix)
        self.pmid_abstracts = dict()
        # disease and human phenotype NER
        self.pmid_dner = {}
        # raw entity text
        self.pmid_ent_text = {}
        self.dner_cluster = {}
        self.total_dner = []

        self.nlp = spacy.load('en')
        self.id2kw = pickle.load(
            open(os.path.join(self.paths.data_dir, 'id2kw_dict.pkl'), 'rb'))
        self.kw2id = pickle.load(
            open(os.path.join(self.paths.data_dir, 'kw2id_dict.pkl'), 'rb'))
        entity = Entity(keywords_list=list(self.kw2id.keys()), label='DO/HPO')
        self.nlp.add_pipe(entity, last=True)

        # read synonyms from HGNC
        with open(os.path.join(self.paths.data_dir, 'hgnc_synonyms.json'),
                  'r') as f:
            hgnc_syn = f.read()
            self.hgnc = json.loads(hgnc_syn)
import spacy
from spacy.matcher import PhraseMatcher
from spacy_lookup import Entity
from spacy.lang.es import Spanish

nlp = Spanish()
entity = Entity(nlp,
                keywords_list=['pera en Dulce', 'manzana', 'tentacion'],
                label='FOOD')
nlp.add_pipe(entity, name='Food')
entity2 = Entity(nlp, keywords_list=['#mora'], label='FOOD_HASHTAGS')
nlp.add_pipe(entity2, name='FoodHashtags')
text = "Me gustan mucho la manzana y tambien la pera en dulce en salsa de #mora. También me gusta la paleta tentación."
doc = nlp(text)
for e in doc:
    print(e.text, e._.is_entity, e.ent_type_)
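Besides the token-level flags printed above, matches are also exposed on the Doc; the tuple layout below is inferred from the extract_tags snippet earlier in this collection and may differ between spacy_lookup versions.

# Document-level view of the same matches (layout inferred from extract_tags above).
for name, index, fragment in doc._.entities:
    print(name, index, fragment)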
Example #12
def main():
    # Change the input file name here.
    file = "input.txt"
    outputfile = file[:-4] + ".json"  # e.g. "input.txt" -> "input.json"

    content = None
    try:
        with open(file, "r") as f:
            content = f.read()
    except UnicodeDecodeError:
        with open(file, "r", encoding='utf8') as f:
            content = f.read()

    # Create a dataframe from csv
    df = pd.read_csv('./titles.csv')

    # Use a list comprehension to pull the job titles (first column) from the DataFrame rows
    job_titles = [row[0] for row in df.values]

    # nlp.remove_pipe('entity')
    entity = Entity(keywords_list=job_titles, label='Job-Title')

    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(entity, before='ner')
    neuralcoref.add_to_pipe(nlp)
    doc = nlp(content)

    #coreference resolution is done in the whole document
    content = doc._.coref_resolved

    #Extracting the sentences from the text document
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences.extend(tokenizer.tokenize(content))

    output = {}
    output["document"] = file
    output["extraction"] = []
    for sentence in sentences:
        sen = nlp(sentence)

        #Find part  template
        locations_count = 0
        for ents in sen.ents:
            if ents.label_ == 'GPE' or ents.label_ == 'LOC':
                locations_count = locations_count + 1
        if locations_count >= 2:
            resultset = parse_part_template(sen)
            for x in resultset:
                if x[0] != '' and x[1] != '':
                    part_temp = {}
                    part_temp["template"] = "PART"
                    part_temp["sentences"] = []
                    part_temp["sentences"].append(sentence)
                    part_temp["arguments"] = {}
                    part_temp["arguments"]["1"] = x[0]
                    part_temp["arguments"]["2"] = x[1]
                    output["extraction"].append(part_temp)

        #Find work template
        isJobTitleGiven = False
        isOrgGiven = False
        for ents in sen.ents:

            if ents.label_ == 'Job-Title':
                isJobTitleGiven = True
            if ents.label_ == 'ORG':
                isOrgGiven = True
        if isOrgGiven and isJobTitleGiven:
            results = parse_work_template(sen)
            for res in results:
                if res[0] != '' and res[1] != '' and res[2] != '':
                    work_temp = {}
                    work_temp["template"] = "WORK"
                    work_temp["sentences"] = []
                    work_temp["sentences"].append(sentence)
                    work_temp["arguments"] = {}
                    work_temp["arguments"]["1"] = res[0]
                    work_temp["arguments"]["2"] = res[2]
                    work_temp["arguments"]["3"] = res[1]
                    work_temp["arguments"]["4"] = res[3]
                    output["extraction"].append(work_temp)

        #Find buy template
        for i, tok in enumerate(sen):
            if tok.dep_ == 'ROOT' or tok.pos_ == 'VERB':
                if tok.text.lower() in ['buy', 'bought', 'shop', 'acquire', 'acquired', 'purchase', 'invest in',
                    'invested', 'get', 'obtain', 'obtained', 'secure', 'redeem', 'land', 'spent']:
                    results = parse_buy_template(sen)
                    if results[0] != '' and results[1] != '':
                        buy_temp = {}
                        buy_temp["template"] = "BUY"
                        buy_temp["sentences"] = []
                        buy_temp["sentences"].append(sentence)
                        buy_temp["arguments"] = {}
                        buy_temp["arguments"]["1"] = results[0]
                        buy_temp["arguments"]["2"] = results[1]
                        buy_temp["arguments"]["3"] = results[2]
                        buy_temp["arguments"]["4"] = results[3]
                        buy_temp["arguments"]["5"] = results[4]
                        output["extraction"].append(buy_temp)

    # Serializing json
    json_object = json.dumps(output, indent=4)
    # Writing to the output file
    with open(outputfile, 'w+') as f:
        f.write(json_object)
Example #13
def modify_nlp_pipe(nlp, taxon_names, label):
    entity = Entity(keywords_list=taxon_names, label=label)
    nlp.add_pipe(entity)
    nlp.remove_pipe("ner")
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import spacy
from spacy.lang.es.examples import sentences
from spacy_lookup import Entity

nlp = spacy.load('es_core_news_sm')
entity = Entity(keywords_list=[
    'invico', 'victoria', 'montelibano', 'san', 'luis', 'nicolas', 'utp',
    'dosquebradas'
])
nlp.add_pipe(entity, last=True)  # add this entity to the pipeline

#doc = nlp('notas de Yerson victoria')
doc = nlp(u'ruta entre san nicolas y  luis')
#print(doc.text)

#nouns = dict()  # dictionary to store the nouns
recorrido = dict()
for token in doc:
    print("categorizacion: {} {} {}".format(token.text, token.pos_,
                                            token.dep_))  #
    #print(token.is_stop) #para saber si el token es de parada ( words que se repiten mucho )
    if token._.is_entity:
        recorrido[token.text] = token.pos_

print(recorrido)
#print(recorrido.keys())
#print(recorrido.values())
Example #15
  dictionary=[]
  for index, row in df_clean.iterrows():                     
    if (type(row[2]) == str):                     #Avoid numbers
      #print(row[2].lower())
      dictionary.append(row[2].lower())          #language_index: 1 = Spanish, 2 = English (?)
  #print(dictionary)                               #IOPub data rate exceeded
  #print('length of the dictionary loaded: ', len(dictionary))
  return(dictionary)


##English
print("---Load dictionary english")
diagnostics_dictionary = load_dictionary(df_codeD, language_index=2)

#Add Named Entities metadata to Doc objects in spaCy. First, we load the model and replace the NER module with the entity diagnosticEnt to avoid overlapping entities. Then we also add procedureEnt.
diagnosticEnt = Entity(keywords_list=diagnostics_dictionary,label="DIAGNOSTICO")  #Detect Named Entities using dictionaries. We can process a text and show its entities.


nlp.replace_pipe("ner", diagnosticEnt)   #We replace the common entities with diagnostics
print('entities loaded in nlp')


def Dict_code_reference(df):    #Make a dictionary {'A00.0': 'Cólera debido a Vibrio cholerae 01, biotipo cholerae ', 'A00.1': 'Cólera debido a Vibrio cholerae 01, biotipo El Tor',...
  df_clean = df.dropna()     #drop NaN values
  code_list = []
  reference_list = []

  for index, row in df_clean.iterrows():
    if (row[2]!=None):
      code_list.append(row[0])
      reference_list.append(row[2])    #1 spanish, 2 english    ?
Example #16
    def add_service_entities(self):
        service_data = self.gen_service_tags_df[self.gen_service_tags_df['Type'] == 1]
        service_list = service_data['Tag'].tolist()

        service_entity = Entity(keywords_list=service_list, label='GEN_SERVICE')
        self.nlp.add_pipe(service_entity, last=True)
Example #17
"""
Iterates through a bunch of docs and creates a list of GOV_ORG entities for each doc.

Usage:
    python3 govorg_matcher.py df
"""

import get_govorg_list
import spacy
from spacy_lookup import Entity

# Module-level setup: runs on import so that `nlp` is already defined
# when text_gov_org_match() is called.
nlp = spacy.load('en_core_web_sm')
# get lookup
GOV_ORG = get_govorg_list.main()
# Create custom entity list
entity = Entity(keywords_list=GOV_ORG, label='GOV_ORG')
# happens after other processing steps
nlp.add_pipe(entity, last=True)


def text_gov_org_match(text):
    """Return a list of GOV_ORG entities found at least once in a str."""
    doc = nlp(text)
    # remove any duplicates because dictionaries cannot have duplicate keys
    list_of_gov_org_entities_with_duplicates = [
        token._.canonical for token in doc if token._.is_entity
    ]
    list_no_duplicates = list(
        dict.fromkeys(list_of_gov_org_entities_with_duplicates))
    return list_no_duplicates
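A quick way to exercise the matcher; the sample sentence and organisation name are made up, so whether anything matches depends entirely on what get_govorg_list.main() returns.

if __name__ == '__main__':
    # Hypothetical input; matches depend on the GOV_ORG lookup list.
    sample = "The Environment Agency and the Environment Agency both responded."
    print(text_gov_org_match(sample))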
def test_integration(nlp):
    entity = Entity(nlp)
    nlp.add_pipe(entity, last=True)
    assert nlp.pipe_names[-1] == 'entity'
Example #19
def create_species_entity(nlp: Language, name: str):
    return Entity(name=name, keywords_dict=species_dict, label="SPECIES")
Example #20
def fast_spacy(fast_df):
    entity = Entity(keywords_dict=generate_labels(fast_df), label="FAST_TOPIC")
    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe(entity)
    nlp.remove_pipe("ner")
    return nlp, entity
Example #21
def create_location_entity(nlp: Language, name: str):
    return Entity(name=name, keywords_dict=location_dict, label="LOCATION")
Example #22
def create_habitat_entity(nlp: Language, name: str):
    habitats_list = list(habitats.Habitat)
    return Entity(name=name, keywords_list=habitats_list, label="HABITAT")
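These last helpers follow the spaCy 3 factory signature (nlp, name), so they are presumably registered with Language.factory and then added to the pipeline by name. Below is a minimal sketch of that registration pattern, assuming the installed spacy_lookup version works as a spaCy 3 component; the factory name and keyword data are assumptions.

import spacy
from spacy.language import Language
from spacy_lookup import Entity

# Hypothetical keyword data standing in for habitats.Habitat.
habitats_list = ["forest", "wetland", "grassland"]


@Language.factory("habitat_entity")
def create_habitat_entity(nlp: Language, name: str):
    return Entity(name=name, keywords_list=habitats_list, label="HABITAT")


nlp = spacy.blank("en")
nlp.add_pipe("habitat_entity", last=True)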