def regex(self, value):
    if value is not None:
        if isinstance(value, (list, dict)):
            self._ruler = EntityRuler(self.model)
            self._ruler.add_patterns(value)
        else:
            self._ruler = EntityRuler(self.model).from_disk(value)
        self.model.add_pipe(self._ruler, before='ner')
def init_entity_ruler(nlp, patterns, label):
    string_store = nlp.vocab.strings
    if label not in string_store:
        string_store.add(label)
    ruler = EntityRuler(nlp)
    ruler.add_patterns(prepare_patterns(patterns, label))
    return ruler
def main():
    # Load data
    test1_filepath = 'data/test1.csv'
    test2_filepath = 'data/test2.csv'
    test3_filepath = 'data/test3.csv'
    test1 = pd.read_csv(test1_filepath, index_col=0)
    test2 = pd.read_csv(test2_filepath, index_col=0)
    test3 = pd.read_csv(test3_filepath, index_col=0)
    print('loaded data')

    # Remove unnecessary characters
    clean_texts_1 = [replace_newline(t.lower()) for t in test1['text']]
    clean_texts_2 = [replace_newline(t.lower()) for t in test2['text']]
    clean_texts_3 = [replace_newline(t.lower()) for t in test3['text']]

    # Apply tokenizer, tagger, parser and NER to the texts
    nlp = spacy.load('en_core_web_lg')
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    print('entity ruler added')
    test1_docs = list(nlp.pipe(clean_texts_1, batch_size=10))
    test2_docs = list(nlp.pipe(clean_texts_2, batch_size=10))
    test3_docs = list(nlp.pipe(clean_texts_3, batch_size=10))

    countries = ['Australia', 'Brazil', 'China', 'France', 'India',
                 'Japan', 'Korea', 'Spain', 'UK', 'US']

    test1 = multilabel_binarizer(
        df2multilabels(df_predictions(test1_docs, clean_texts_1)))
    # test1 = np.insert(test1, 6, 0, axis=1)
    test1 = pd.DataFrame(test1, columns=countries)
    test1.to_csv('data/test1_country_predictions.csv')
    print('test1 predictions saved')

    test2 = multilabel_binarizer(
        df2multilabels(df_predictions(test2_docs, clean_texts_2)))
    test2 = pd.DataFrame(test2, columns=countries)
    test2.to_csv('data/test2_country_predictions.csv')
    print('test2 predictions saved')

    test3 = multilabel_binarizer(
        df2multilabels(df_predictions(test3_docs, clean_texts_3)))
    # Insert an all-zero column for the country missing from test3's predictions.
    test3 = np.insert(test3, 5, 0, axis=1)
    test3 = pd.DataFrame(test3, columns=countries)
    test3.to_csv('data/test3_country_predictions.csv')
    print('test3 predictions saved')
def setup():
    nlp = spacy.load("en_core_web_sm")  # or 'en'
    ruler = EntityRuler(nlp, overwrite_ents=True)
    sentencizer = nlp.create_pipe("sentencizer")
    ruler.add_patterns(patterns)
    nlp.add_pipe(sentencizer, first=True)
    nlp.add_pipe(expand_audit_numbers, first=True)
    nlp.add_pipe(ruler)
    return nlp
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    assert new_ruler.patterns == ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite
def add_entity_ruler(self, patterns):
    """Add an EntityRuler to the nlp pipeline.

    Official docs: https://spacy.io/api/entityruler

    :param patterns: list or list of lists of token/phrase-based patterns
    """
    ruler = EntityRuler(self._nlp)
    ruler.add_patterns(patterns)
    self._nlp.add_pipe(ruler)
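# For reference, the EntityRuler accepts two pattern shapes, per the spaCy docs
# linked in the docstring above. A minimal sketch; the labels and text below are
# illustrative, not taken from the original code:
example_patterns = [
    # Phrase pattern: matches the exact token sequence "Acme Corp".
    {"label": "ORG", "pattern": "Acme Corp"},
    # Token pattern: one dict per token, using Matcher token attributes.
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
]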
def add_ruler(entity_name, entity_arr):
    ruler = EntityRuler(nlp, overwrite_ents=True)
    # One pattern per entry, added in a single call.
    ruler.add_patterns([{"label": str(entity_name), "pattern": str(d)}
                        for d in entity_arr])
    ruler.name = str(entity_name)
    print('adding ent', str(entity_name))
    return ruler
def __init__(self, nlp, name: str, patterns: Sequence[dict]):
    """Initialise the PatternMatcher.

    Args:
        nlp: spaCy model
        patterns (Sequence[dict]): patterns for the EntityRuler. See
            https://spacy.io/usage/rule-based-matching#entityruler
    """
    self.ruler = EntityRuler(nlp)
    self.ruler.add_patterns(patterns)
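# A hypothetical usage sketch for the PatternMatcher above; the blank pipeline
# and the PRODUCT pattern are assumptions for illustration only:
import spacy

nlp = spacy.blank("en")
matcher = PatternMatcher(
    nlp, name="pattern_matcher",
    patterns=[{"label": "PRODUCT", "pattern": "iPhone"}],
)
# The wrapped EntityRuler is callable on a Doc and sets doc.ents.
doc = matcher.ruler(nlp("I bought an iPhone."))
print([(ent.text, ent.label_) for ent in doc.ents])  # [('iPhone', 'PRODUCT')]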
class SpatialRecognizer(object):
    name = "spatial_recognizer"

    def __init__(self, nlp, label="SPATIAL_RE", **kwargs):
        self.label = nlp.vocab.strings[label]
        self.nlp = nlp
        self.ruler = EntityRuler(nlp, overwrite_ents=False, validate=True)
        self.is_debug = kwargs.get("is_debug")
        try:
            self.ruler = self.ruler.from_disk("../_data/spatial-re-patterns.jsonl")
            self._add_ruler_to_pipeline(nlp, self.ruler)
        except ValueError as ve:
            if self.is_debug:
                logger.error(f"{ve}: Ensure patterns file is added.")
            self._add_patterns()
        if self.is_debug:
            logger.debug(f"Pipeline -> {nlp.pipe_names}")

    def __call__(self, doc):
        return doc

    def construct_patterns(self):
        """Build patterns ad hoc if loading the .jsonl file fails."""
        obj_patterns = [
            {"label": "SPATIAL_RE",
             "pattern": [{"LOWER": "that"}, {"TEXT": {"IN": ["is", "are"]}},
                         {"LOWER": "behind"}, {"LOWER": "the"}]},
            {"label": "SPATIAL_RE",
             "pattern": [{"LOWER": "behind"}, {"LOWER": "the"}]},
            {"label": "SPATIAL_RE",
             "pattern": [{"LOWER": "that"}, {"TEXT": {"IN": ["is", "are"]}},
                         {"LOWER": "in"}, {"LOWER": "front"}]},
            {"label": "SPATIAL_RE",
             "pattern": [{"LOWER": "that"}, {"TEXT": {"IN": ["is", "are"]}},
                         {"TEXT": {"IN": ["in", "on"]}}, {"LOWER": "the"},
                         {"TEXT": {"IN": ["left", "right"]}}]},
            {"label": "SPATIAL_RE",
             "pattern": [{"LOWER": "to"}, {"LOWER": "the"},
                         {"TEXT": {"IN": ["right", "left"]}},
                         {"TEXT": {"IN": ["", "of"]}}]},
            {"label": "SPATIAL_RE",
             "pattern": [{"TEXT": {"IN": ["above", "below"]}}, {"LOWER": "the"}]},
            {"label": "SPATIAL_RE",
             "pattern": [{"TEXT": {"IN": ["left", "right", "front", "behind",
                                          "above", "below"]}}]},
        ]
        return obj_patterns

    def _add_patterns(self):
        patterns = self.construct_patterns()
        # Disable all other pipes while adding patterns.
        other_pipes = self.nlp.pipe_names
        with self.nlp.disable_pipes(*other_pipes):
            self.ruler.add_patterns(patterns)
        self._add_ruler_to_pipeline(self.nlp, self.ruler)

    def _add_ruler_to_pipeline(self, nlp, ruler, name="spatial_entity_ruler"):
        if nlp.has_pipe(name):
            nlp.replace_pipe(name, self.ruler)
        else:
            nlp.add_pipe(self.ruler, name=name, last=True)
def extract_entities(text, output=None):
    # Avoid a mutable default argument; it would be shared across calls.
    if output is None:
        output = {}
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns.patterns + patterns.section_patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    logger.debug('TOKENS: {}'.format(
        '\n'.join('{} {}'.format(i, t.text) for i, t in enumerate(doc))))
    logger.debug('ENTITIES: {}'.format(
        '\n'.join('{}:{}'.format(ent.label_, ent.text) for ent in doc.ents)))
    return update_output(doc, output)
def test_entity_ruler_serialize_bytes(nlp, patterns):
    ruler = EntityRuler(nlp, patterns=patterns)
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 3
    ruler_bytes = ruler.to_bytes()
    new_ruler = EntityRuler(nlp)
    assert len(new_ruler) == 0
    assert len(new_ruler.labels) == 0
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    # Check the deserialized ruler, not the original one.
    assert len(new_ruler) == len(patterns)
    assert len(new_ruler.labels) == 3
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite
def __init__(self, nlp: Language, data_path: Path = Path("data")):
    self.nlp = nlp
    self.data_path = data_path
    self.skills = self._get_skills()
    patterns = self._build_patterns(self.skills)
    extra_patterns = self._get_extra_skill_patterns()
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns(itertools.chain(patterns, extra_patterns))
    if not self.nlp.has_pipe("skills_ruler"):
        self.nlp.add_pipe(ruler, name="skills_ruler")
def add_ruler(self, patterns: Union[List[RulePattern], RulePattern],
              before: str = "ner") -> 'ConvenientSpacy':
    # Accept a single pattern or a list of patterns.
    if isinstance(patterns, RulePattern):
        patterns = [patterns]
    ruler = EntityRuler(self.nlp)
    for pattern in patterns:
        ruler.add_patterns(pattern.asdict)
    self.nlp.add_pipe(ruler, before=before)
    return self
def test_entity_ruler_init(nlp, patterns):
    ruler = EntityRuler(nlp, patterns=patterns)
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert "HELLO" in ruler
    assert "BYE" in ruler
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    doc = nlp("hello world bye bye")
    assert len(doc.ents) == 2
    assert doc.ents[0].label_ == "HELLO"
    assert doc.ents[1].label_ == "BYE"
def alsoknownas(sentence):
    from spacy.pipeline import EntityRuler

    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    rulerAlKnAs = EntityRuler(nlp, overwrite_ents=True)
    # Strip punctuation before matching.
    answer = sentence.translate(str.maketrans('', '', string.punctuation))
    aka_patterns = ['known as', 'nicknamed', 'known mononymously as',
                    'known professionally as']
    str1 = str2 = str3 = str4 = ""
    label = ""
    for aka in aka_patterns:
        if aka in answer:
            # Take up to four tokens following the alias cue.
            a = answer.split(aka, 1)[-1]
            name, surname1, surname2, surname3 = a.split()[:4]
            str1 = name
            str2 = str1 + " " + surname1
            str3 = str2 + " " + surname2
            str4 = str3 + " " + surname3
            # Keep the longest prefix of consecutive (proper) nouns as the label.
            pos = nltk.pos_tag(nltk.word_tokenize(str4))
            nouns = {"NNP", "NN"}
            if pos[0][1] in nouns and pos[1][1] not in nouns:
                label = str1
            elif (pos[0][1] in nouns and pos[1][1] in nouns
                    and pos[2][1] not in nouns):
                label = str2
            elif (pos[0][1] in nouns and pos[1][1] in nouns
                    and pos[2][1] in nouns and pos[3][1] not in nouns):
                label = str3
            elif (pos[0][1] in nouns and pos[1][1] in nouns
                    and pos[2][1] in nouns and pos[3][1] in nouns):
                label = str4
    for aka in aka_patterns:
        rulerAlKnAs.add_patterns([{"label": label, "pattern": aka}])
    rulerAlKnAs.name = 'rulerAlKnAs'
    nlp.add_pipe(rulerAlKnAs)
    doc = nlp(answer)
    for ent in doc.ents:
        return ent.label_
def test_entity_ruler_multiprocessing(nlp, n_process):
    ruler = EntityRuler(nlp)
    texts = ["I enjoy eating Pizza Hut pizza."]
    patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    # Use the parametrized n_process rather than a hardcoded value.
    for doc in nlp.pipe(texts, n_process=n_process):
        for ent in doc.ents:
            assert ent.ent_id_ == "1234"
def add_hard_coded_entities(nlp, filename):
    with open(filename) as neaF:
        named_entity_additions = json.load(neaF)
    ner_additions = EntityRuler(nlp, overwrite_ents=True,
                                phrase_matcher_attr="LOWER")
    for key, value in named_entity_additions.items():
        # Create the pattern dictionaries for this label.
        patterns = [{"label": key, "pattern": token} for token in value]
        ner_additions.add_patterns(patterns)
    base = ["tagger", "parser", "ner"]
    for pipe in list(set(nlp.pipe_names) - set(base)):
        nlp.remove_pipe(pipe)
    nlp.add_pipe(ner_additions, after="ner")
    return nlp
def add_name_rules_to_nlp(self, parsed_appointments):
    ruler = EntityRuler(self.instance)
    patterns = []
    for appointment in parsed_appointments:
        pattern = {
            "label": "PER",
            "pattern": [{"lower": part} for part in appointment.visitor],
        }
        patterns.append(pattern)
    ruler.add_patterns(patterns)
    self.instance.add_pipe(ruler)
def register_recognizer(self, recognizer_cls: Type[EntityRulerRecognizer]):
    recognizer = recognizer_cls()
    recognizer_name = recognizer_cls.__name__
    ruler = EntityRuler(self.nlp)
    self.nlp.add_pipe(ruler, name=recognizer_name)
    rules = [{"label": recognizer.TAG, "pattern": pattern}
             for pattern in recognizer.patterns]
    ruler.add_patterns(rules)
    self.nlp.add_pipe(
        set_spacy_entity_extension_attributes(recognizer.SCORE, recognizer_name),
        name="label_" + recognizer_name,
        after=recognizer_name,
    )
def extract_sections(text, output=None):
    # Avoid a mutable default argument; it would be shared across calls.
    if output is None:
        output = {}
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns.section_patterns)
    nlp.add_pipe(ruler)
    nlp.add_pipe(expand_sections)
    doc = nlp(text)
    logger.debug('SESSION TOKENS: {}'.format(
        '\n'.join('{} {}'.format(i, t.text) for i, t in enumerate(doc))))
    logger.debug('SESSION ENTITIES: {}'.format(
        '\n'.join('{}:{}:{}:{}'.format(ent.label_, ent.text.strip(),
                                       ent.start, ent.end)
                  for ent in doc.ents)))
    return update_output(doc, output)
def train_entity_ruler(self) -> None:
    """Initialize spaCy's EntityRuler and set self.entity_model.

    Also updates the EntityRuler with the available train data.
    """
    nlp = spacy.load("en_core_web_sm")
    ruler = EntityRuler(nlp)
    patterns = self.generate_entity_train_data()
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    self.entity_model = nlp
def my_nlp(model):
    nlp = spacy.load(model)
    list_infixes_defaults = list(nlp.Defaults.infixes)
    if reg in list_infixes_defaults:
        list_infixes_defaults.remove(reg)
    # Modify tokenizer infix patterns (e.g. dd-dd-dd)
    infixes = list_infixes_defaults + [r"(?<=[0-9])[\+\*^](?=[0-9-])"]
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before='ner')
    return nlp
def fijar_pipes(self):
    """Attach additional components to the processing pipeline."""
    if not self.lang.has_pipe("cumplimiento"):
        self.lang.add_pipe(self.cumplimiento, name="cumplimiento", last=True)
    if self.grupos:
        if not self.lang.has_pipe("presencia"):
            self.lang.add_pipe(self.presencia, name="presencia", last=True)
    if self.entes:
        if not self.lang.has_pipe("entes"):
            ruler = EntityRuler(self.lang, phrase_matcher_attr="LOWER")
            ruler.add_patterns(self.crear_patrones())
            self.lang.add_pipe(ruler, name="entes", before="ner")
def pattern_matcher(nlp, name: str, patterns: Sequence[dict]):
    """Create an EntityRuler object loaded with a list of patterns.

    Args:
        nlp: spaCy model
        patterns (Sequence[dict]): patterns for the EntityRuler. See
            https://spacy.io/usage/rule-based-matching#entityruler

    Returns:
        spaCy EntityRuler component
    """
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    return ruler
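# The (nlp, name, patterns) signature matches what a spaCy v3 factory expects.
# A hypothetical registration and use follows; the factory name, default_config,
# and example pattern are assumptions, not part of the original snippet:
import spacy
from spacy.language import Language

Language.factory("pattern_matcher", default_config={"patterns": []})(pattern_matcher)

nlp = spacy.blank("en")
nlp.add_pipe("pattern_matcher",
             config={"patterns": [{"label": "ORG", "pattern": "Acme"}]})
doc = nlp("Acme announced a new product.")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Acme', 'ORG')]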
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
def start():
    print("Adding entity rules, please wait...")
    file_terms = files.get_file_terms()
    dir_terms = files.get_dir_terms()
    ruler = EntityRuler(nlp, overwrite_ents=True)
    # List of words reserved for commands
    reserved_words = ["find", "copy", "open", "move"]
    _patterns = get_file_patterns(file_terms, reserved_words)
    _patterns.extend(get_dir_patterns(dir_terms, reserved_words))
    ruler.add_patterns(_patterns)
    nlp.add_pipe(ruler)
    main()
def test_entity_ruler_overlapping_spans(nlp):
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "FOOBAR", "pattern": "foo bar"},
        {"label": "BARBAZ", "pattern": "bar baz"},
    ]
    ruler.add_patterns(patterns)
    doc = ruler(nlp.make_doc("foo bar baz"))
    assert len(doc.ents) == 1
    assert doc.ents[0].label_ == "FOOBAR"
def run_cd2h(input_dir, output_dir, single_file_name):
    """Process a set of input files per CD2H and the 2014 i2b2 challenge,
    and output results that can be evaluated.

    :param input_dir:
    :param output_dir:
    :return:
    """
    # Set up NLP.
    nlp = spacy.load("en_core_web_md")
    # Break words on dash, slash, comma and period (from ends of sentences)
    # for better parsing.
    infix_re = re.compile(r'''[-/,.]''')
    nlp.tokenizer = custom_tokenizer(nlp, infix_re)
    ruler = EntityRuler(nlp, overwrite_ents=True,
                        validate=True).from_disk("./spacy_patterns.jsonl")
    nlp.add_pipe(ruler)
    # Dir of XML to process
    if single_file_name:
        entry = Path(f"{output_dir}/{single_file_name}")
        token_list = process_xml_file(entry, output_dir, nlp)
        for token in token_list:
            print("TOKEN " + token)
        create_output_cd2h(output_dir, entry, token_list)
    else:
        for entry in os.scandir(input_dir):
            print(f"Processing input file: {entry.path} {entry.name}")
            token_list = process_xml_file(entry, output_dir, nlp)
            for token in token_list:
                print("TOKEN " + token)
            create_output_cd2h(output_dir, entry, token_list)
    sys.exit(0)
def test_issue7():
    nlp = spacy.load("en_core_web_sm")
    negex = Negex(nlp)
    nlp.add_pipe(negex, last=True)
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    # Wire the ruler into the pipeline so the patterns take effect.
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp("fgfgdghgdh")