Example #1
def create_lookups_from_json_reader(path: Path) -> Lookups:
    lookups = Lookups()
    for p in path.glob("*.json"):
        table_name = p.stem
        data = srsly.read_json(p)
        lookups.add_table(table_name, data)
    return lookups
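A lookups object built this way can be bundled into a single binary file for reuse. The lines below are only a usage sketch: they assume spaCy's Lookups.to_disk API (as used in Example #15) and a hypothetical lookups_src/ directory of JSON tables.

from pathlib import Path

lookups = create_lookups_from_json_reader(Path("lookups_src"))
print(lookups.tables)    # one table per *.json file found
lookups.to_disk(".")     # writes lookups.bin containing all tables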
Example #2
    def concept_sets(self, value):
        """
        Sets concept_sets and the attributes derived from it.

        Args:
            value (list of list of str): A list of lists of strings; each string is a concept,
                and each inner list corresponds to a document tagged with the concepts it contains.
        """
        self._concept_sets = value
        LOG.debug("Extracting raw keywords as concepts.")
        all_concepts = [
            concept
            for concept_set in tqdm(self._concept_sets)
            for concept in concept_set
            if concept.strip() != ""
        ]
        raw_concepts = set(all_concepts)

        LOG.debug("Lemmatizing {} raw concepts.".format(len(raw_concepts)))
        concepts = [c.lower() for c in raw_concepts]

        self.raw2lemma = {rc: c for rc, c in zip(raw_concepts, concepts)}
        lookups = Lookups()
        lookups.add_table("lemma_lookup", self.raw2lemma)
        self.lemmatizer = Lemmatizer(lookups)
        self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}
        lemma_concepts = [
            self.lemmatizer(concept, "NOUN")[0] for concept in all_concepts
        ]
        self.concepts_frequencies = Counter(lemma_concepts)
        self.concepts = set(lemma_concepts)
        self._fit_concept_indices()
Example #3
    def __init__(
        self,
        vocab: Vocab,
        name: str = "morphologizer",
        *,
        overwrite_lemma: bool = False,
    ) -> None:
        super().__init__()

        self.name = name
        self.vocab = vocab
        self.voikko = libvoikko.Voikko("fi")
        self.lookups = Lookups()
        self.overwrite_lemma = overwrite_lemma
        self.aux_labels = [vocab.strings.add(x) for x in ["aux", "aux:pass"]]
        self.cop_labels = [vocab.strings.add(x) for x in ["cop", "cop:own"]]
        self.nsubj_labels = [
            vocab.strings.add(x) for x in ["nsubj", "nsubj:cop"]
        ]
        self.ccomp_labels = [
            vocab.strings.add(x)
            for x in ["csubj", "csubj:cop", "xcomp", "xcomp:ds"]
        ]
        self.relcl_labels = [
            vocab.strings.add(x) for x in ["acl:relcl", "ccomp"]
        ]
        self.foreign_tag = vocab.strings.add('Foreign')
Example #4
def cope_lookups():
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope", )}})
    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
    return lookups
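For context, a short usage sketch showing how these tables feed spaCy v2's rule- and lookup-based Lemmatizer; the expected outputs are inferred from Example #8 and the lookup() calls elsewhere on this page, so treat them as assumptions rather than guarantees for every spaCy version.

lemmatizer = Lemmatizer(cope_lookups())
print(lemmatizer("coping", "verb"))  # ['cope'], resolved via the lemma_exc table
print(lemmatizer.lookup("coped"))    # 'cope', resolved via the lemma_lookup table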
Example #5
def lemmatizer():
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {
        "dogs": "dog",
        "boxen": "box",
        "mice": "mouse"
    })
    return Lemmatizer(lookups)
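A minimal usage sketch for the fixture above, assuming spaCy v2's lookup-based Lemmatizer, whose lookup() method returns the original text when a word is not in the table.

lemm = lemmatizer()
print(lemm.lookup("dogs"))  # 'dog'
print(lemm.lookup("mice"))  # 'mouse'
print(lemm.lookup("cats"))  # 'cats' (not in the table, so the text comes back unchanged)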
Example #6
def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Language = None
) -> None:
    lookups = Lookups()
    self._lookups = lookups.from_disk(path=self.source)
Example #7
def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
    vocab = Vocab()
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"
Example #8
def test_issue1387():
    tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
    lookups = Lookups()
    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope", )}})
    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
Example #9
def test_lookups_api():
    table_name = "test"
    data = {"foo": "bar", "hello": "world"}
    lookups = Lookups()
    lookups.add_table(table_name, data)
    assert table_name in lookups
    assert lookups.has_table(table_name)
    table = lookups.get_table(table_name)
    assert table.name == table_name
    assert len(table) == 2
    assert table.get("hello") == "world"
    table.set("a", "b")
    assert table.get("a") == "b"
    table = lookups.get_table(table_name)
    assert len(table) == 3
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
Example #10
def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"
Example #11
def __init__(self):
    self.entities = []
    self.columns = []
    self.relationships = []
    self.synonyms_col = []
    self.synonyms_tab = []
    self.entity_graph = []
    self.loaded_entities = []
    self.config = Configuration()
    self.conn = pyodbc.connect(self.config.get_sql_connection_string())
    lookups = Lookups()
    self.lemmatizer = Lemmatizer(lookups)
    self.load_db_model()
Example #12
def test_ner_warns_no_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list
Example #13
    def from_jsons(
        self, in_indices, in_raw2lemma
    ):  # a little strange because it does not fill in all attributes
        """
        Load index and raw2lemma dictionaries into an empty ConceptExtractor.

        Args:
            in_indices (str): Path to a JSON file containing the concept index mapping.
            in_raw2lemma (str): Path to a JSON file mapping raw concepts to their lemmas.
        """
        with open(in_indices, "r") as f0:
            self.concept_index_mapping = json.load(f0)
        with open(in_raw2lemma, "r") as f0:
            self.raw2lemma = json.load(f0)
        lookups = Lookups()
        lookups.add_table("lemma_lookup", self.raw2lemma)
        self.lemmatizer = Lemmatizer(lookups)
        self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}
        self.concepts = self.concept_index_mapping.keys()
        tmp_frequencies = {
            concept: len(index) for concept, index in self.concept_index_mapping.items()
        }
        self.concepts_frequencies = Counter(tmp_frequencies)
Example #14
    def lemmatize(self, tokens, toke=False):

        lookups = Lookups()
        lookups.add_table('lemma_index', lemma_index)
        lookups.add_table('lemma_exc', lemma_exc)
        lookups.add_table('lemma_rules', lemma_rules)
        lemmatizer = Lemmatizer(lookups)

        lemmas = []
        for t in tokens:
            # The lemmatizer returns a list of candidate lemmas; keep the first one.
            lemmas.append(lemmatizer(t.text, t.tag_)[0])

        if toke:
            return lemmas

        return " ".join(lemmas)
Example #15
def make_lookups_bin(self,
                     lookup_name_pattern='lemma_lookup_{}',
                     filename_pattern='it_lemma_lookup_{}.json'):
    lookups = Lookups()
    lookup_keys = list(self.tag_map.keys())
    for lookup_pos in lookup_keys:
        lookup_name = lookup_name_pattern.format(lookup_pos.lower())
        filename = filename_pattern.format(lookup_pos.lower())
        with open(os.path.join(self.out_path, filename)) as json_file:
            lookup_dict = json.load(json_file)
        lookups.add_table(lookup_name, lookup_dict)
    with open(os.path.join(self.out_path,
                           'it_lemma_lookup.json')) as json_file:
        lookup_dict = json.load(json_file)
    lookups.add_table('lemma_lookup', lookup_dict)
    lookups.to_disk(self.out_path, 'lookups.bin')
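Reading the bundle back is the mirror image. The following is a sketch assuming the same spaCy Lookups API; out_path stands in for the self.out_path directory used above.

from spacy.lookups import Lookups

lookups = Lookups()
lookups.from_disk(out_path)  # loads the lookups.bin written by make_lookups_bin
lemma_lookup = lookups.get_table("lemma_lookup")
print(len(lookups), "tables loaded")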
Example #16
def test_tagger_warns_no_lemma_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lemma_lookup")
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list
Example #17
def test_ner_warns_no_lookups(caplog):
    nlp = English()
    assert nlp.lang in util.LEXEME_NORM_LANGS
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    nlp.add_pipe("ner")
    with caplog.at_level(logging.DEBUG):
        nlp.initialize()
        assert "W033" in caplog.text
    caplog.clear()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    with caplog.at_level(logging.DEBUG):
        nlp.initialize()
        assert "W033" not in caplog.text
Example #18
def test_lemmatizer_without_is_base_form_implementation():
    # Norwegian example from #5658
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"noun": []})
    lookups.add_table("lemma_index", {"noun": {}})
    lookups.add_table("lemma_exc",
                      {"noun": {
                          "formuesskatten": ["formuesskatt"]
                      }})

    lemmatizer = Lemmatizer(lookups, is_base_form=None)
    assert lemmatizer("Formuesskatten", "noun", {
        'Definite': 'def',
        'Gender': 'masc',
        'Number': 'sing'
    }) == ["formuesskatt"]
Example #19
def test_lemmatizer_init(nlp):
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
    assert isinstance(lemmatizer.lookups, Lookups)
    assert not lemmatizer.lookups.tables
    assert lemmatizer.mode == "lookup"
    with pytest.raises(ValueError):
        nlp("test")
    nlp.initialize()
    assert lemmatizer.lookups.tables
    assert nlp("cope")[0].lemma_ == "cope"
    assert nlp("coped")[0].lemma_ == "cope"
    # replace any tables from spacy-lookups-data
    lemmatizer.lookups = Lookups()
    # lookup with no tables sets text as lemma
    assert nlp("cope")[0].lemma_ == "cope"
    assert nlp("coped")[0].lemma_ == "coped"
    nlp.remove_pipe("lemmatizer")
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
    with pytest.raises(ValueError):
        # Can't initialize without required tables
        lemmatizer.initialize(lookups=Lookups())
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {})
    lemmatizer.initialize(lookups=lookups)
    docx = ""
    returnedSearch = HeadingSearch(filename)
    # If we didn't find anything with the heading search, just use the whole document.
    if (returnedSearch == False):
        document1 = docx2txt.process(filename)
        docx = nlp(document1)
    # Otherwise, send the heading search result through nlp.
    else:
        docx = nlp(returnedSearch)

    word_frequencies = {}  # how many times each word occurs in the document
    words = []  # every word in the document, stored in the order it was added to the frequency dict

    # spaCy lemmatizer to get root words
    lookups = Lookups()
    lemmatizer = Lemmatizer(lookups)

    for word in docx:  # go through every word in the document
        if word.text not in stopwords:  # as long as the word isn't a stop word
            lemma = lemmatizer.lookup(word.text)
            if lemma not in word_frequencies:  # if we haven't come across the word yet
                word_frequencies[lemma] = 1  # its frequency is one
                words.append(lemma)  # add it to words
            else:
                word_frequencies[lemma] += 1  # it's already counted, so increment it

#Sort through the array by bubble sort
Example #21
def test_lookups_to_from_disk():
    lookups = Lookups()
    lookups.add_table("table1", {"foo": "bar", "hello": "world"})
    lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
    with make_tempdir() as tmpdir:
        lookups.to_disk(tmpdir)
        new_lookups = Lookups()
        new_lookups.from_disk(tmpdir)
    assert len(new_lookups) == 2
    assert "table1" in new_lookups
    assert "table2" in new_lookups
    table1 = new_lookups.get_table("table1")
    assert len(table1) == 2
    assert table1["foo"] == "bar"
    table2 = new_lookups.get_table("table2")
    assert len(table2) == 3
    assert table2["b"] == 2
Example #22
def test_lookups_to_from_bytes():
    lookups = Lookups()
    lookups.add_table("table1", {"foo": "bar", "hello": "world"})
    lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
    lookups_bytes = lookups.to_bytes()
    new_lookups = Lookups()
    new_lookups.from_bytes(lookups_bytes)
    assert len(new_lookups) == 2
    assert "table1" in new_lookups
    assert "table2" in new_lookups
    table1 = new_lookups.get_table("table1")
    assert len(table1) == 2
    assert table1["foo"] == "bar"
    table2 = new_lookups.get_table("table2")
    assert len(table2) == 3
    assert table2["b"] == 2
    assert new_lookups.to_bytes() == lookups_bytes
Example #23
def test_lookups_api():
    table_name = "test"
    data = {"foo": "bar", "hello": "world"}
    lookups = Lookups()
    lookups.add_table(table_name, data)
    assert len(lookups) == 1
    assert table_name in lookups
    assert lookups.has_table(table_name)
    table = lookups.get_table(table_name)
    assert table.name == table_name
    assert len(table) == 2
    assert table["hello"] == "world"
    table["a"] = "b"
    assert table["a"] == "b"
    table = lookups.get_table(table_name)
    assert len(table) == 3
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
    with pytest.raises(ValueError):
        lookups.add_table(table_name)
    table = lookups.remove_table(table_name)
    assert table.name == table_name
    assert len(lookups) == 0
    assert table_name not in lookups
    with pytest.raises(KeyError):
        lookups.get_table(table_name)
Example #24
def lemmatize():
    """"""
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
    lemmatizer = Lemmatizer(lookups)
    return lemmatizer
Example #25
def from_disk(self, path, exclude=tuple()) -> "LookupLemmatizer":
    path: Path = ensure_path(path)
    lookups = Lookups()
    self._lookups = lookups.from_disk(path=path)
    return self
Example #26
# ```
# pip install -U spacy
# ```
#
# You will then need to download the English model:
# ```
# python -m spacy download en_core_web_sm
# ```

# %%
import spacy

# %%
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})

lemmatizer = Lemmatizer(lookups)

# %%
[lemmatizer.lookup(word) for word in word_list]

# %% [markdown]
# spaCy doesn't offer a stemmer (since lemmatization is considered better; this is an example of being opinionated!)

# %% [markdown]
# Stop words vary from library to library

# %%
nlp = spacy.load("en_core_web_sm")
Example #27
class MorphologizerLemmatizer(Pipe):
    """Pipeline component that assigns morphological features and lemmas to Docs.

    The actual morphological analysis is done by libvoikko.
    """
    compound_re = re.compile(r"\+(\w+)(?:\(\+?[\w=]+\))?")
    minen_re = re.compile(r"\b(\w+)\[Tn4\]mi")
    ny_re = re.compile(r"\[X\]\[\w+\]\[Ny\](\w+)")
    roman_numeral_structure_re = re.compile(r"=j+|=q+")
    voikko_cases = {
        "nimento": "Case=Nom",
        "omanto": "Case=Gen",
        "kohdanto": "Case=Acc",
        "olento": "Case=Ess",
        "osanto": "Case=Par",
        "tulento": "Case=Tra",
        "sisaolento": "Case=Ine",
        "sisaeronto": "Case=Ela",
        "sisatulento": "Case=Ill",
        "ulkoolento": "Case=Ade",
        "ulkoeronto": "Case=Abl",
        "ulkotulento": "Case=All",
        "vajanto": "Case=Abe",
        "seuranto": "Case=Com",
        "keinonto": "Case=Ins",
        "kerrontosti": "Case=Nom"  # Should never occur. "kerrontosti"
        # should only appear on ADVs, which
        # don't have cases.
    }
    voikko_classes_by_pos = {
        ADJ: frozenset(["laatusana", "nimisana_laatusana"]),
        ADP: frozenset(["nimisana", "seikkasana", "suhdesana"]),
        ADV: frozenset(["seikkasana"]),
        AUX: frozenset(["teonsana", "kieltosana"]),
        CCONJ: frozenset(["sidesana"]),
        INTJ: frozenset(["huudahdussana"]),
        NOUN: frozenset(["nimisana", "nimisana_laatusana", "lyhenne"]),
        NUM: frozenset(["lukusana"]),
        PRON: frozenset(["asemosana", "nimisana", "nimisana_laatusana"]),
        PROPN: frozenset(["nimi", "etunimi", "sukunimi", "paikannimi"]),
        SCONJ: frozenset(["sidesana"]),
        VERB: frozenset([]),  # Would be "teonsana", but
        # MINEN-infinitives are treated as nouns.
        # See _analysis_has_compatible_pos()
        SYM: frozenset([]),
        X: frozenset([])
    }
    affix_to_sijamuoto = {
        "n": "omanto",
        "na": "olento",
        "nä": "olento",
        "a": "osanto",
        "ä": "osanto",
        "ta": "osanto",
        "tä": "osanto",
        "ksi": "tulento",
        "ssa": "sisaolento",
        "ssä": "sisaolento",
        "sta": "sisaeronto",
        "stä": "sisaeronto",
        "han": "sisatulento",
        "hin": "sisatulento",
        "hun": "sisatulento",
        "seen": "sisatulento",
        "siin": "sisatulento",
        "lla": "ulkoolento",
        "llä": "ulkoolento",
        "lta": "ulkoeronto",
        "ltä": "ulkoeronto",
        "lle": "ulkotulento",
        "tta": "vajanto",
        "ttä": "vajanto",
    }
    possessive_suffixes = {
        "1s": ["ni"],
        "2s": ["si"],
        "1p": ["mme"],
        "2p": ["nne"],
        "3": ["nsa", "nsä", "an", "en", "in"
              "on", "un", "yn", "än", "ön"],
    }
    voikko_degree = {
        "positive": "Degree=Pos",
        "comparative": "Degree=Cmp",
        "superlative": "Degree=Sup"
    }
    voikko_mood = {
        "A-infinitive": "InfForm=1",
        "E-infinitive": "InfForm=2",
        "MA-infinitive": "InfForm=3",
        "indicative": "Mood=Ind",
        "conditional": "Mood=Cnd",
        "potential": "Mood=Pot",
        "imperative": "Mood=Imp"
    }
    voikko_part_form = {
        "past_active": "PartForm=Past",
        "past_passive": "PartForm=Past",
        "present_active": "PartForm=Pres",
        "present_passive": "PartForm=Pres",
        "agent": "PartForm=Agt"
    }
    voikko_tense = {
        "present_active": "Tense=Pres",
        "present_passive": "Tense=Pres",
        "present_simple": "Tense=Pres",
        "past_active": "Tense=Past",
        "past_passive": "Tense=Past",
        "past_imperfective": "Tense=Past"
    }
    pron_types = {
        "minä": "Prs",
        "sinä": "Prs",
        "hän": "Prs",
        "me": "Prs",
        "te": "Prs",
        "he": "Prs",
        "tämä": "Dem",
        "tuo": "Dem",
        "se": "Dem",
        "nämä": "Dem",
        "nuo": "Dem",
        "ne": "Dem",

        # The relative "mikä" will be handled as a special case
        # separately, so here we label all occurrences of "mikä" as
        # interrogative.
        "mikä": "Int",
        "kuka": "Int",
        "ken": "Int",  # ketä
        "kumpi": "Int",
        "millainen": "Int",
        "kuinka": "Int",
        "miksi": "Int",

        # The relative "joka" will be handled else where. Here "joka"
        # is Voikko's lemmatization of jotakin, jollekin, jostakin, ...
        "joka": "Ind",
        "kaikki": "Ind",
        "jokainen": "Ind",
        "koko": "Ind",
        "harva": "Ind",
        "muutama": "Ind",
        "jokunen": "Ind",
        "yksi": "Ind",
        "ainoa": "Ind",
        "eräs": "Ind",
        "muuan": "Ind",
        "joku": "Ind",
        "jokin": "Ind",
        "kukin": "Ind",
        "moni": "Ind",
        "usea": "Ind",
        "molempi": "Ind",
        "kumpikin": "Ind",
        "kumpikaan": "Ind",
        "jompikumpi": "Ind",
        "sama": "Ind",
        "muu": "Ind",
        "kukaan": "Ind",
        "mikään": "Ind",
        "toinen": "Rcp"
    }
    pron_persons = {
        "minä": "1",
        "sinä": "2",
        "hän": "3",
        "me": "1",
        "te": "2",
        "he": "3"
    }
    infinite_moods = frozenset(
        ["A-infinitive", "E-infinitive", "MA-infinitive", "MAINEN-infinitive"])

    def __init__(
        self,
        vocab: Vocab,
        name: str = "morphologizer",
        *,
        overwrite_lemma: bool = False,
    ) -> None:
        super().__init__()

        self.name = name
        self.vocab = vocab
        self.voikko = libvoikko.Voikko("fi")
        self.lookups = Lookups()
        self.overwrite_lemma = overwrite_lemma
        self.aux_labels = [vocab.strings.add(x) for x in ["aux", "aux:pass"]]
        self.cop_labels = [vocab.strings.add(x) for x in ["cop", "cop:own"]]
        self.nsubj_labels = [
            vocab.strings.add(x) for x in ["nsubj", "nsubj:cop"]
        ]
        self.ccomp_labels = [
            vocab.strings.add(x)
            for x in ["csubj", "csubj:cop", "xcomp", "xcomp:ds"]
        ]
        self.relcl_labels = [
            vocab.strings.add(x) for x in ["acl:relcl", "ccomp"]
        ]
        self.foreign_tag = vocab.strings.add('Foreign')

    def __call__(self, doc: Doc) -> Doc:
        error_handler = self.get_error_handler()
        try:
            for token in doc:
                if token.pos in (PUNCT, SPACE):
                    if self.overwrite_lemma or token.lemma == 0:
                        token.lemma = token.orth
                else:
                    analysis = self._analyze(token)
                    morph = self.voikko_morph(token, analysis)
                    if morph:
                        token.set_morph(morph)
                    if self.overwrite_lemma or token.lemma == 0:
                        token.lemma_ = self.lemmatize(token, analysis)
            return doc
        except Exception as e:
            error_handler(self.name, self, [doc], e)

    def initialize(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        nlp: Optional[Language] = None,
        lookups: Optional[Lookups] = None,
    ):
        """Initialize the morphologizer and load in data.
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        lookups (Lookups): The lookups object containing the (optional) tables
            such as "lemma_exc" and "morphologizer_exc". Defaults to None.
        """
        if lookups is None:
            lookups = load_lookups(lang=self.vocab.lang,
                                   tables=["lemma_exc", "morphologizer_exc"])
        self.lookups = lookups

    def voikko_morph(self, token: Token, analysis: dict) -> Optional[str]:
        # Run Voikko's analysis and convert the result to morph
        # features.
        exc_table = self.lookups.get_table("morphologizer_exc",
                                           {}).get(token.pos)
        if exc_table is not None:
            exc = exc_table.get(token.orth_.lower())
            if exc:
                return exc

        # Pre-compute some frequent morphs to avoid code duplication.
        # (Functions are not an option because the function call
        # overhead is too high.)

        # Clitic
        morph_clitic = None
        if "FOCUS" in analysis:
            focus = analysis["FOCUS"]
            if focus == "kin":
                morph_clitic = "Clitic=Kin"
            elif focus == "kaan":
                morph_clitic = "Clitic=Kaan"
            elif focus == "ka":
                morph_clitic = "Clitic=Ka"
        elif "KYSYMYSLIITE" in analysis:
            morph_clitic = "Clitic=Ko"

        morph_number = None
        morph_number_psor = None
        morph_person_psor = None
        if token.pos in (ADJ, ADP, ADV, AUX, NOUN, NUM, PRON, PROPN, VERB):
            # Number
            if "NUMBER" in analysis:
                number = analysis["NUMBER"]
                if number == "singular":
                    morph_number = "Number=Sing"
                elif number == "plural":
                    morph_number = "Number=Plur"

            # Number[psor] and Person[psor]
            if "POSSESSIVE" in analysis:
                possessive = analysis["POSSESSIVE"]
                if possessive == "1s":
                    morph_number_psor = "Number[psor]=Sing"
                    morph_person_psor = "Person[psor]=1"
                elif possessive == "1p":
                    morph_number_psor = "Number[psor]=Plur"
                    morph_person_psor = "Person[psor]=1"
                elif possessive == "3":
                    morph_person_psor = "Person[psor]=3"

        # Set morphs per POS
        morphology = []
        if token.pos in (ADJ, NOUN, PROPN):
            # Abbr
            if "CLASS" in analysis and analysis["CLASS"] == "lyhenne":
                morphology.append("Abbr=Yes")

            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Degree
            if token.pos == ADJ and "COMPARISON" in analysis:
                morphology.append(self.voikko_degree[analysis["COMPARISON"]])

            # Number
            if morph_number is not None:
                morphology.append(morph_number)

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # NumType
            if token.pos == ADJ and "NUMTYPE" in analysis:
                morphology.append(f'NumType={analysis["NUMTYPE"]}')

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

        elif token.pos in (AUX, VERB):
            vclass = analysis.get("CLASS")

            # Abbr
            if vclass == "lyhenne":
                morphology.append("Abbr=Yes")

            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Connegative
            if "CONNEGATIVE" in analysis:
                morphology.append("Connegative=Yes")

            # Degree
            if "COMPARISON" in analysis:
                morphology.append(self.voikko_degree[analysis["COMPARISON"]])

            # InfForm and Mood
            # These are mutually exclusive and both are based on MOOD
            mood = None
            if "MOOD" in analysis:
                mood = analysis["MOOD"]
                morph_inf_form_or_mood = self.voikko_mood.get(mood)
                if morph_inf_form_or_mood is not None:
                    morphology.append(morph_inf_form_or_mood)

            # Number
            if morph_number is not None:
                morphology.append(morph_number)

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # PartForm
            participle = None
            if "PARTICIPLE" in analysis:
                participle = analysis["PARTICIPLE"]
                morph_part_form = self.voikko_part_form.get(participle)
                if morph_part_form:
                    morphology.append(morph_part_form)

            # Person
            person = None
            if "PERSON" in analysis:
                person = analysis["PERSON"]
                if person in ("0", "1", "2", "3"):
                    morphology.append(f"Person={person}")

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

            # Polarity
            if vclass == "kieltosana":
                morphology.append("Polarity=Neg")

            # Tense
            if "TENSE" in analysis:
                morphology.append(self.voikko_tense[analysis["TENSE"]])

            # VerbForm
            if mood in self.infinite_moods:
                morphology.append("VerbForm=Inf")
            elif participle is not None:
                morphology.append("VerbForm=Part")
            else:
                morphology.append("VerbForm=Fin")

            # Voice
            if person in ("0", "1", "2", "3"):
                morphology.append("Voice=Act")
            elif person == "4":
                morphology.append("Voice=Pass")
            elif "VOICE" in analysis:
                morphology.append(f"Voice={analysis['VOICE']}")
            elif participle == "past_passive":
                morphology.append("Voice=Pass")
            elif participle in ("present_active", "past_active",
                                "present_passive"):
                morphology.append("Voice=Act")

        elif token.pos == ADV:
            # Abbr
            if "CLASS" in analysis and analysis["CLASS"] == "lyhenne":
                morphology.append("Abbr=Yes")

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Degree
            if "COMPARISON" in analysis:
                degree = analysis["COMPARISON"]
                if degree in ("comparative", "superlative"):
                    morphology.append(self.voikko_degree[degree])

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

        elif token.pos == PRON:
            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Degree
            if "COMPARISON" in analysis:
                morphology.append(self.voikko_degree[analysis["COMPARISON"]])

            # Number
            if morph_number is not None:
                morphology.append(morph_number)

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # Person
            if "PERSON" in analysis:
                person = analysis["PERSON"]
                if person in ("0", "1", "2", "3"):
                    morphology.append(f"Person={person}")

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

            # PronType
            if "PRONTYPE" in analysis:
                morphology.append(f"PronType={analysis['PRONTYPE']}")

            # Reflex
            if "BASEFORM" in analysis and analysis["BASEFORM"] == "itse":
                morphology.append("Reflex=Yes")

        elif token.pos in (CCONJ, SCONJ):
            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

        elif token.pos == NUM:
            # Abbr
            if "CLASS" in analysis and analysis["CLASS"] == "lyhenne":
                morphology.append("Abbr=Yes")

            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Number
            if morph_number is not None:
                morphology.append(morph_number)

            # NumType
            if "NUMTYPE" in analysis:
                morphology.append(f'NumType={analysis["NUMTYPE"]}')

        elif token.pos == ADP:
            # AdpType
            if "ADPTYPE" in analysis:
                morphology.append(f"AdpType={analysis['ADPTYPE']}")

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

        elif token.pos == SYM:
            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

        elif token.pos == X:
            # Foreign
            if token.tag == self.foreign_tag:
                morphology.append('Foreign=Yes')

        return "|".join(morphology) if morphology else None

    def lemmatize(self, token: Token, analysis: dict) -> str:
        cached_lower = None
        exc_table = self.lookups.get_table("lemma_exc", {}).get(token.pos)
        if exc_table is not None:
            cached_lower = token.orth_.lower()
            exc = exc_table.get(cached_lower)
            if exc:
                return exc

        # Some exceptions to Voikko's lemmatization algorithm to
        # better match UD lemmas
        if token.pos in (AUX, VERB) and "PARTICIPLE" in analysis:
            return self._participle_lemma(analysis)
        elif token.pos == NOUN and analysis.get("MOOD") == "MINEN-infinitive":
            return self._minen_noun_lemma(analysis)
        elif token.pos in (NOUN, NUM,
                           PROPN) and (colon_i := token.orth_.find(":")) > 0:
            # Lemma of inflected abbreviations: BBC:n, EU:ssa
            return token.orth_[:colon_i]
        elif token.pos == ADV:
            cached_lower = cached_lower or token.orth_.lower()
            return self._adv_lemma(analysis, cached_lower)
Example #28
def create_lemmatizer():
    lookups = Lookups()
    with open("lookups/fi_lemma_exc.json") as f:
        lookups.add_table("lemma_exc", json.load(f))
    return FinnishLemmatizer(lookups)
Example #29
def P(T):
    import pandas as pd
    import emoji  # for checking whether a character is an emoji
    from collections import Counter

    # remove the formatting of the source field
    T['source'] = T['source'].str.lower()
    T['source'] = T['source'].str.findall('>([^<]+?)<').apply(
        lambda x: x[0] if len(x) >= 1 else '')

    #import location dictionary and generate country
    T['location'] = [
        T.loc[k, 'place']['country_code']
        if not pd.isnull(T.loc[k, 'place']) else i['location']
        for k, i in enumerate(T['user'])
    ]

    Trans = pd.read_csv(
        '/Users/livi/Documents/2020 Fall/data mining/Proposal/Tweepy related files/transloc.csv',
        index_col=0)
    Trans['googlemap'] = Trans['googlemap'].apply(eval)
    Trans.set_index('UserInfo', inplace=True)
    locdict = Trans.T.to_dict('records')
    locdict = locdict[0]
    kys = list(locdict.keys())
    for k in kys:
        if locdict[k] == None:
            del locdict[k]
        elif len(locdict[k]) != 0:
            if 'address_components' in locdict[k][0]:
                for ii in locdict[k][0]['address_components']:
                    if 'country' in ii['types']:
                        locdict[k] = ii['long_name']
                    else:
                        del locdict[k]
            elif len(locdict[k]) > 1:
                if 'address_components' in locdict[k][1]:
                    for ii in locdict[k][1]['address_components']:
                        if 'country' in ii['types']:
                            locdict[k] = ii['long_name']
                        else:
                            del locdict[k]
            else:
                del locdict[k]
        else:
            del locdict[k]

    ## Generate the column
    l = []
    for i in T['location']:
        try:
            l.append(locdict[i])
        except:
            l.append(float('nan'))
    T['CountryCode'] = l
    print('Finish Generate Country Code')

    #Generate Extended tweets and SDGs
    for i in range(len(T)):
        quote = None
        comment = None
        #prepare quote part
        if not pd.isnull(T.loc[i, 'quoted_status']):
            try:
                quote = T.loc[i,
                              'quoted_status']['extended_tweet']['full_text']
            except:
                quote = T.loc[i, 'quoted_status']['text']
                #print('no extended_tweet for quote',i)
        #prepare comment part
        if pd.isnull(T.loc[i, 'extended_tweet']):
            if pd.isnull(T.loc[i, 'retweeted_status']):
                try:
                    comment = T.loc[i, 'text']
                except:
                    print('no text', i)
            else:
                try:
                    comment = T.loc[
                        i, 'retweeted_status']['extended_tweet']['full_text']
                except:
                    comment = T.loc[i, 'retweeted_status']['text']
                    #print('no extended_tweet for retweeted status',i)
        else:
            try:
                comment = T.loc[i, 'extended_tweet']['full_text']
            except:
                print('no extended_tweet', i)
        #combine quote and comments
        if pd.isnull(quote):
            T.loc[i, 'extended_tweet'] = comment
        else:
            T.loc[i, 'extended_tweet'] = '\"' + comment + ' \" ' + quote
    ## remove some useless information
    T['extended_tweet'] = T['extended_tweet'].str.replace("http\S+", "")
    #T['extended_tweet']=T['extended_tweet'].str.replace("@\S+","")
    T['extended_tweet'] = T['extended_tweet'].str.replace("&amp", "")
    print('Finish Generate Extended Tweets')

    T = T.reset_index(drop=True)
    T['extended_tweet'] = T['extended_tweet'].str.lower()
    T['SDG'] = T['extended_tweet'].str.upper()
    T['SDG'] = T['SDG'].str.findall('(SDG\d+)')
    print('Finish Generate SDGs')

    # Generate User Information and hashtags
    T['id'] = [i['id'] for i in T['user']]
    #T['name']=[i['name']for i in T['user']]
    T['screen_name'] = [i['screen_name'] for i in T['user']]
    T['url'] = [i['url'] for i in T['user']]
    T['friends_count'] = T['user'].apply(lambda x: x['friends_count'])
    T['followers_count'] = T['user'].apply(lambda x: x['followers_count'])
    T['hashtags'] = T['extended_tweet'].str.findall('#\S+')
    print('Finish Generate UserInfo and Hashtags')

    # Prepare lemmatized analysis and tokenized extended tweets
    def char_is_emoji(character):
        return character in emoji.UNICODE_EMOJI  # is this single character an emoji?

    def text_has_emoji(text):
        for character in text:
            if character in emoji.UNICODE_EMOJI:
                return True
        return False  # the text contains no emoji

    def deEmojify(inputString):
        return inputString.encode('ascii', 'ignore').decode('ascii')

    T['extended_tweet'] = T['extended_tweet'].apply(lambda x: deEmojify(x))

    import spacy
    from spacy.lemmatizer import Lemmatizer
    from spacy.lookups import Lookups
    sp = spacy.load('en')
    lookups = Lookups()
    lemm = Lemmatizer(lookups)

    def lemma_function(text):
        dummy = []
        #this is just a test to see if it works
        for word in sp(text):
            dummy.append(word.lemma_)
        return ' '.join(dummy)

    T['extended_tweet_lemmatized'] = T['extended_tweet'].apply(
        lambda x: lemma_function(x))
    T['extended_tweet_lemmatized'] = T['extended_tweet_lemmatized'].apply(
        lambda x: x.replace('-PRON-', ''))
    print('Finish deemoji and lemmatization')

    # Generate Sentiment Scores
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyser = SentimentIntensityAnalyzer()

    def sentiment_analyzer_scores(sentence):
        score = analyser.polarity_scores(sentence)
        print("{:-<40} {}".format(sentence, str(score)))

    T['neg'] = T['extended_tweet_lemmatized'].apply(
        lambda x: analyser.polarity_scores(x)['neg'])
    T['neu'] = T['extended_tweet_lemmatized'].apply(
        lambda x: analyser.polarity_scores(x)['neu'])
    T['pos'] = T['extended_tweet_lemmatized'].apply(
        lambda x: analyser.polarity_scores(x)['pos'])
    T['compound'] = T['extended_tweet_lemmatized'].apply(
        lambda x: analyser.polarity_scores(x)['compound'])
    print('Finish Generate Sentiment Score')
    return T
Example #30
def morphology():
    lemmatizer = Lemmatizer(Lookups())
    return Morphology(StringStore(), {}, lemmatizer)