def concept_sets(self, value):
    """
    Store the per-document concept sets and rebuild every attribute derived from them.

    Args:
        value (list of list of str): One inner list per document; each
            string in an inner list is a concept that document is tagged with.
    """
    self._concept_sets = value

    LOG.debug("Extracting raw keywords as concepts.")
    all_concepts = []
    for document_concepts in tqdm(self._concept_sets):
        for concept in document_concepts:
            # Drop empty / whitespace-only entries.
            if concept.strip() == "":
                continue
            all_concepts.append(concept)
    raw_concepts = set(all_concepts)

    LOG.debug("Lemmatizing {} raw concepts.".format(len(raw_concepts)))
    # The "lemma" of a raw concept is its lower-cased form.
    self.raw2lemma = {raw: raw.lower() for raw in raw_concepts}

    lookups = Lookups()
    lookups.add_table("lemma_lookup", self.raw2lemma)
    self.lemmatizer = Lemmatizer(lookups)

    # Reverse mapping; if several raw concepts share a lemma, the one
    # iterated last is the one that is kept.
    self.lemma2raw = {lemma: raw for raw, lemma in self.raw2lemma.items()}

    lemma_concepts = []
    for concept in all_concepts:
        lemma_concepts.append(self.lemmatizer(concept, "NOUN")[0])

    self.concepts_frequencies = Counter(lemma_concepts)
    self.concepts = set(lemma_concepts)
    self._fit_concept_indices()
def create_lookups_from_json_reader(path: Path) -> Lookups:
    """Build a ``Lookups`` container from the ``*.json`` files directly in *path*.

    Every matching file is read with ``srsly.read_json`` and registered as
    one table whose name is the file's stem (file name without suffix).
    """
    lookups = Lookups()
    for json_file in path.glob("*.json"):
        lookups.add_table(json_file.stem, srsly.read_json(json_file))
    return lookups
def test_lookups_api():
    """Exercise adding, querying, mutating and removing a table on ``Lookups``."""
    name = "test"
    lookups = Lookups()
    lookups.add_table(name, {"foo": "bar", "hello": "world"})

    # The container reports the new table.
    assert len(lookups) == 1
    assert name in lookups
    assert lookups.has_table(name)

    # The table exposes its name and item access.
    table = lookups.get_table(name)
    assert table.name == name
    assert len(table) == 2
    assert table["hello"] == "world"
    table["a"] = "b"
    assert table["a"] == "b"

    # A freshly fetched table sees the entry written above.
    table = lookups.get_table(name)
    assert len(table) == 3

    # Unknown names and duplicate registrations are rejected.
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
    with pytest.raises(ValueError):
        lookups.add_table(name)

    # Removing hands back the table and empties the container.
    table = lookups.remove_table(name)
    assert table.name == name
    assert len(lookups) == 0
    assert name not in lookups
    with pytest.raises(KeyError):
        lookups.get_table(name)
def lemmatizer():
    """Return a ``Lemmatizer`` backed by a three-entry ``lemma_lookup`` table."""
    lemma_lookup = {"dogs": "dog", "boxen": "box", "mice": "mouse"}
    lookups = Lookups()
    lookups.add_table("lemma_lookup", lemma_lookup)
    return Lemmatizer(lookups)
def cope_lookups():
    """Return a ``Lookups`` holding lookup, index, exception and rule tables for "cope"."""
    # Registered in this order: lookup, index, exceptions, rules.
    tables = {
        "lemma_lookup": {"cope": "cope", "coped": "cope"},
        "lemma_index": {"verb": ("cope", "cop")},
        "lemma_exc": {"verb": {"coping": ("cope", )}},
        "lemma_rules": {"verb": [["ing", ""]]},
    }
    lookups = Lookups()
    for table_name, table_data in tables.items():
        lookups.add_table(table_name, table_data)
    return lookups
def make_lookups_bin(self, lookup_name_pattern='lemma_lookup_{}',
                     filename_pattern='it_lemma_lookup_{}.json'):
    """
    Collect the JSON lemma lookups found in ``self.out_path`` into one ``lookups.bin``.

    One table is added per key of ``self.tag_map`` (name and file name are
    produced from the lower-cased key via the two patterns), followed by the
    general ``lemma_lookup`` table read from ``it_lemma_lookup.json``.

    Args:
        lookup_name_pattern (str): format string for the per-tag table name.
        filename_pattern (str): format string for the per-tag JSON file name.
    """
    def load_json(filename):
        # All input files live next to the output, in self.out_path.
        with open(os.path.join(self.out_path, filename)) as json_file:
            return json.load(json_file)

    lookups = Lookups()
    for tag in list(self.tag_map.keys()):
        pos = tag.lower()
        table_name = lookup_name_pattern.format(pos)
        lookups.add_table(table_name, load_json(filename_pattern.format(pos)))

    lookups.add_table('lemma_lookup', load_json('it_lemma_lookup.json'))
    lookups.to_disk(self.out_path, 'lookups.bin')
def test_lemmatizer_without_is_base_form_implementation():
    """Lemmatize via the exception table when ``is_base_form`` is ``None``."""
    # Norwegian example from #5658
    tables = {
        "lemma_rules": {"noun": []},
        "lemma_index": {"noun": {}},
        "lemma_exc": {"noun": {"formuesskatten": ["formuesskatt"]}},
    }
    lookups = Lookups()
    for table_name, table_data in tables.items():
        lookups.add_table(table_name, table_data)

    lemmatizer = Lemmatizer(lookups, is_base_form=None)
    morphology = {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}
    lemmas = lemmatizer("Formuesskatten", "noun", morphology)
    assert lemmas == ["formuesskatt"]
def test_lookups_to_from_disk():
    """Round-trip two tables through ``to_disk``/``from_disk`` and check their contents."""
    original = Lookups()
    original.add_table("table1", {"foo": "bar", "hello": "world"})
    original.add_table("table2", {"a": 1, "b": 2, "c": 3})

    with make_tempdir() as tmpdir:
        original.to_disk(tmpdir)
        restored = Lookups()
        restored.from_disk(tmpdir)

    # Both tables survived the round trip.
    assert len(restored) == 2
    assert "table1" in restored
    assert "table2" in restored

    first = restored.get_table("table1")
    assert len(first) == 2
    assert first["foo"] == "bar"

    second = restored.get_table("table2")
    assert len(second) == 3
    assert second["b"] == 2
def test_lookups_to_from_bytes():
    """Round-trip two tables through ``to_bytes``/``from_bytes`` and check their contents."""
    original = Lookups()
    original.add_table("table1", {"foo": "bar", "hello": "world"})
    original.add_table("table2", {"a": 1, "b": 2, "c": 3})
    serialized = original.to_bytes()

    restored = Lookups()
    restored.from_bytes(serialized)

    # Both tables survived the round trip.
    assert len(restored) == 2
    assert "table1" in restored
    assert "table2" in restored

    first = restored.get_table("table1")
    assert len(first) == 2
    assert first["foo"] == "bar"

    second = restored.get_table("table2")
    assert len(second) == 3
    assert second["b"] == 2

    # Re-serializing the restored container reproduces the same bytes.
    assert restored.to_bytes() == serialized
def test_lookups_api():
    """Exercise table registration and the ``get``/``set`` table accessors."""
    name = "test"
    lookups = Lookups()
    lookups.add_table(name, {"foo": "bar", "hello": "world"})

    # The container reports the new table.
    assert name in lookups
    assert lookups.has_table(name)

    # The table exposes its name and get/set access.
    table = lookups.get_table(name)
    assert table.name == name
    assert len(table) == 2
    assert table.get("hello") == "world"
    table.set("a", "b")
    assert table.get("a") == "b"

    # A freshly fetched table sees the entry written above.
    table = lookups.get_table(name)
    assert len(table) == 3

    # Unknown table names are rejected.
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
def from_jsons(
    self, in_indices, in_raw2lemma
):  # a little strange because it does not fill in all attributes
    """
    Load index and raw2lemma dictionaries into empty ConceptExtractor

    Populates ``concept_index_mapping``, ``raw2lemma``, ``lemma2raw``,
    ``lemmatizer``, ``concepts`` and ``concepts_frequencies``. The raw
    concept sets themselves are not restored.

    Args:
        in_indices (str or path-like): JSON file holding a mapping from each
            concept to a sized collection; the collection's length is used
            as that concept's frequency.
        in_raw2lemma (str or path-like): JSON file holding the mapping from
            raw concept to lemma.
    """
    with open(in_indices, "r") as f0:
        self.concept_index_mapping = json.load(f0)

    with open(in_raw2lemma, "r") as f0:
        self.raw2lemma = json.load(f0)

    lookups = Lookups()
    lookups.add_table("lemma_lookup", self.raw2lemma)
    self.lemmatizer = Lemmatizer(lookups)
    # Reverse mapping; if several raw concepts share a lemma, the one
    # iterated last is the one that is kept.
    self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}

    # Materialise a real set rather than keeping the dict's live keys view:
    # a view cannot be pickled and silently tracks later changes to
    # concept_index_mapping.
    self.concepts = set(self.concept_index_mapping)

    self.concepts_frequencies = Counter(
        {
            concept: len(index)
            for concept, index in self.concept_index_mapping.items()
        }
    )
def test_lemmatizer_init(nlp):
    """Check lookup-mode lemmatizer set-up, table replacement and initialization."""
    pipe = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
    assert isinstance(pipe.lookups, Lookups)
    assert not pipe.lookups.tables
    assert pipe.mode == "lookup"

    # Running the pipeline before nlp.initialize() is expected to fail.
    with pytest.raises(ValueError):
        nlp("test")

    nlp.initialize()
    assert pipe.lookups.tables
    assert nlp("cope")[0].lemma_ == "cope"
    assert nlp("coped")[0].lemma_ == "cope"

    # replace any tables from spacy-lookups-data
    pipe.lookups = Lookups()
    # lookup with no tables sets text as lemma
    assert nlp("cope")[0].lemma_ == "cope"
    assert nlp("coped")[0].lemma_ == "coped"

    nlp.remove_pipe("lemmatizer")
    pipe = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
    with pytest.raises(ValueError):
        # Can't initialize without required tables
        pipe.initialize(lookups=Lookups())

    # An (empty) lemma_lookup table is enough for initialization to succeed.
    minimal_lookups = Lookups()
    minimal_lookups.add_table("lemma_lookup", {})
    pipe.initialize(lookups=minimal_lookups)
def test_issue595():
    """Test lemmatization of base forms"""
    # NOTE(review): `lookups` is populated here but never handed to Vocab or
    # Doc in this block -- confirm whether it is still needed.
    lookups = Lookups()
    for table_name, table_data in (
        ("lemma_rules", {"verb": [["ed", "e"]]}),
        ("lemma_index", {"verb": {}}),
        ("lemma_exc", {"verb": {}}),
    ):
        lookups.add_table(table_name, table_data)

    doc = Doc(Vocab(), words=["Do", "n't", "feed", "the", "dog"])
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"
def test_issue1387():
    """A token tagged VBG with text "coping" must get the lemma "cope"."""
    lookups = Lookups()
    for table_name, table_data in (
        ("lemma_index", {"verb": ("cope", "cop")}),
        ("lemma_exc", {"verb": {"coping": ("cope", )}}),
        ("lemma_rules", {"verb": [["ing", ""]]}),
    ):
        lookups.add_table(table_name, table_data)

    vocab = Vocab(
        lemmatizer=Lemmatizer(lookups),
        tag_map={"VBG": {POS: VERB, VerbForm_part: True}},
    )
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
def test_issue595():
    """Test lemmatization of base forms"""
    lookups = Lookups()
    for table_name, table_data in (
        ("lemma_rules", {"verb": [["ed", "e"]]}),
        ("lemma_index", {"verb": {}}),
        ("lemma_exc", {"verb": {}}),
    ):
        lookups.add_table(table_name, table_data)

    vocab = Vocab(
        lemmatizer=Lemmatizer(lookups),
        tag_map={"VB": {POS: VERB, VerbForm_inf: True}},
    )
    doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
    doc[2].tag_ = "VB"
    # "feed" ends in "ed" but must be left alone.
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"
def lemmatize(self, tokens, toke=False):
    """
    Lemmatize ``tokens`` with a ``Lemmatizer`` built from the index, exception and rule tables.

    Args:
        tokens: iterable of token objects exposing ``.text`` and ``.tag_``.
        toke (bool): when true, return the lemmas as a list; otherwise
            return them joined by single spaces.

    Returns:
        list of str if ``toke`` is true, else str.
    """
    lookups = Lookups()
    lookups.add_table('lemma_index', lemma_index)
    lookups.add_table('lemma_exc', lemma_exc)
    lookups.add_table('lemma_rules', lemma_rules)
    lemmatizer = Lemmatizer(lookups)

    lemmas = []
    for tok in tokens:
        # Use the loop variable: the previous body read `token`, a name this
        # function never binds.
        candidates = lemmatizer(tok.text, tok.tag_)
        # NOTE(review): the Lemmatizer call is expected to return a list of
        # candidate lemmas; keep the first so " ".join() below receives
        # strings -- confirm against the installed spaCy version.
        lemma = candidates if isinstance(candidates, str) else candidates[0]
        lemmas.append(lemma)

    if toke:
        return lemmas
    return " ".join(lemmas)
def lemmatize():
    """Build and return a ``Lemmatizer`` whose only table is ``lemma_rules`` with one noun rule (``["s", ""]``)."""
    noun_rules = {"noun": [["s", ""]]}
    lookups = Lookups()
    lookups.add_table("lemma_rules", noun_rules)
    return Lemmatizer(lookups)
def create_lemmatizer():
    """Return a ``FinnishLemmatizer`` whose ``lemma_exc`` table is read from ``lookups/fi_lemma_exc.json``.

    The path is relative to the current working directory.
    """
    lookups = Lookups()
    # Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) would mis-decode non-ASCII characters in the data.
    with open("lookups/fi_lemma_exc.json", encoding="utf-8") as f:
        lookups.add_table("lemma_exc", json.load(f))
    return FinnishLemmatizer(lookups)
# pip install -U spacy # ``` # # You will then need to download the English model: # ``` # spacy -m download en_core_web_sm # ``` # %% import spacy # %% from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups lookups = Lookups() lookups.add_table("lemma_rules", {"noun": [["s", ""]]}) lemmatizer = Lemmatizer(lookups) # %% [lemmatizer.lookup(word) for word in word_list] # %% [markdown] # Spacy doesn't offer a stemmer (since lemmatization is considered better-- this is an example of being opinionated!) # %% [markdown] # Stop words vary from library to library # %% nlp = spacy.load("en_core_web_sm")