Example #1
    def __init__(self):
        with Timer() as self.model_load_time:
            from iwnlp.iwnlp_wrapper import IWNLPWrapper
            from stts2upos import conv_table
            data_loc = "/opt/iwnlp/IWNLP.Lemmatizer_20181001.json"
            self.lemmatizer = IWNLPWrapper(lemmatizer_path=data_loc)

            def myprocessor(myinput):
                mydoc = string2doc(myinput)
                for sent in mydoc:
                    for tok in sent:
                        try:
                            matching_lemmas = self.lemmatizer.lemmatize(
                                tok.word, conv_table.get(tok.xpos))
                            if matching_lemmas is None:
                                tok.lemma = "_"
                                # elif len(matching_lemmas) > 1:
                                #     print("lots o lemmas!", matching_lemmas)
                            else:
                                # unclear how to select best alternative
                                # just use first item in list
                                tok.lemma = matching_lemmas[0]
                        except ValueError:
                            tok.lemma = "_"
                        # don't repeat gold pos in output
                        tok.hide_fields(HIDDEN_FIELDS)
                return mydoc

            self.processor = myprocessor
Example #2
    def __init__(self, lemmatizer_path, nlp):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.stringstore = nlp.vocab.strings
        # self.matcher = PhraseMatcher(nlp.vocab)
        Token.set_extension('iwnlp_lemmas', getter=self.lemmatize, force=True)
        self.lookup = {
            ('fast', ADV): 'fast',
        }
Example #3
    def __init__(self,
                 lemmatizer_path,
                 use_plain_lemmatization=False,
                 ignore_case=False):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.use_plain_lemmatization = use_plain_lemmatization
        self.ignore_case = ignore_case
        Token.set_extension('iwnlp_lemmas', getter=self.get_lemmas, force=True)
Example #4
    def __init__(self):
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)
        self.lemmatizer = IWNLPWrapper(
            lemmatizer_path='data/IWNLP/IWNLP.Lemmatizer_20170501.json')
        self.sentiws = SentiWSWrapper(sentiws_path='data/sentiws')
        self.logger.debug('Loading Spacy model')
        self.nlp = spacy.load('de')
        self.logger.debug('Spacy model loaded')
Example #5
class Lemmatizer(BaseEstimator):

    def __init__(self, lang):
        self.lang = lang
        self.nlp = spacy.load(lang)
        current_dir = os.path.dirname(__file__)
        self.iwnlp = IWNLPWrapper(lemmatizer_path=current_dir + '/../resources/IWNLP.Lemmatizer_20170501.json')

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        ret_list = []
        for row in X:
            doc = self.nlp(row)

            # workaround until a German lemmatizer is integrated in spaCy
            if self.lang == 'de':
                new_row = self._lemmatize_german(doc)
            else:
                new_row = ' '.join([word.lemma_ for word in doc])
            ret_list.append(new_row)
        return ret_list

    def _lemmatize_german(self, doc):
        new_row = ''
        for word in doc:
            lemmatized = self.iwnlp.lemmatize(str(word), word.pos_)
            if lemmatized is not None:
                new_row += ' ' + lemmatized[0]
            else:
                new_row += ' ' + str(word)
        return new_row
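
A minimal usage sketch for this transformer (not part of the original source); it assumes the German spaCy model and the IWNLP JSON under ../resources/ referenced in the constructor are available:

lemmatizer = Lemmatizer(lang='de')
lemmatizer.fit([])  # no-op, returns self
lemmas = lemmatizer.transform(['Die Katzen spielten im Garten'])
# each input row becomes a single whitespace-joined string of lemmas
print(lemmas)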
Example #6
    def preprocess(self):

        tokenizedTweets_writer = open(
            './daten/tokenized_tweets_normalized.txt', 'w')
        preprocTweets_writer = open(
            './daten/preprocessed_tweets_normalized.txt', 'w')

        pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
        tweets_tkn, tweets_proc, labels = pp.process()
        assert (len(tweets_tkn) == len(tweets_proc) == len(labels))

        # filter stopwords + normalize tokens
        lemmatizer = IWNLPWrapper(
            lemmatizer_path='daten/IWNLP.Lemmatizer_20170501.json')
        lemmatized_tokens = []
        for x in range(len(tweets_tkn)):
            tweet = []
            for token in tweets_tkn[x]:
                if token.lower() in stopwords.words('german'):
                    continue
                try:
                    lemma = lemmatizer.lemmatize_plain(token, ignore_case=True)
                    if lemma:
                        tweet.append(lemma[0])
                    else:
                        tweet.append(token)

                except Exception as e:
                    print(e)

            lemmatized_tokens.append(tweet)

        assert (len(lemmatized_tokens) == len(tweets_proc) == len(labels))

        # write preprocessing results to file
        for x in range(len(lemmatized_tokens)):
            t_tweet = (" ").join(lemmatized_tokens[x])
            p_tweet = (" ").join(
                [str(x) + "/" + str(y) for x, y in tweets_proc[x]])
            label = labels[x]
            tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n")
            preprocTweets_writer.write(p_tweet + "\t" + label + "\n")

        tokenizedTweets_writer.close()
        preprocTweets_writer.close()
Example #7
class spaCyIWNLP(object):
    def __init__(self,
                 lemmatizer_path,
                 use_plain_lemmatization=False,
                 ignore_case=False):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.use_plain_lemmatization = use_plain_lemmatization
        self.ignore_case = ignore_case
        Token.set_extension('iwnlp_lemmas', getter=self.get_lemmas, force=True)

    def __call__(self, doc):
        for token in doc:
            token._.iwnlp_lemmas = self.get_lemmas(token)
        return doc

    def get_lemmas(self, token):
        if self.use_plain_lemmatization:
            return self.lemmatizer.lemmatize_plain(
                token.text, ignore_case=self.ignore_case)
        else:
            return self.lemmatizer.lemmatize(token.text,
                                             pos_universal_google=token.pos_)
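
A minimal usage sketch for this pipeline component (not part of the original source), assuming a spaCy v2-style pipeline; the lemmatizer path below is a placeholder:

import spacy

nlp = spacy.load('de')
# placeholder path to the downloaded IWNLP lemmatizer JSON
iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')
nlp.add_pipe(iwnlp)  # spaCy v2 accepts component instances directly

doc = nlp('Wir mögen Fußballspiele mit ausgedehnten Verlängerungen.')
for token in doc:
    # the custom extension returns a list of lemma candidates, or None
    print(token.text, token._.iwnlp_lemmas)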
Example #8
class SpacyWrapper(object):
    def __init__(self):
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)
        self.lemmatizer = IWNLPWrapper(
            lemmatizer_path='data/IWNLP/IWNLP.Lemmatizer_20170501.json')
        self.sentiws = SentiWSWrapper(sentiws_path='data/sentiws')
        self.logger.debug('Loading Spacy model')
        self.nlp = spacy.load('de')
        self.logger.debug('Spacy model loaded')

    def process_sentence(self, sentence):
        result = self.nlp(sentence)
        tokens = []
        dependencies = []
        for token in result:
            iwnlp_lemma = self.lemmatizer.lemmatize(
                token.text, pos_universal_google=token.pos_)
            sentiws = self.sentiws.determine(token.text,
                                             pos_universal_google=token.pos_)
            token_model = Token(token.i + 1,
                                text=token.text,
                                spacy_pos_stts=token.tag_,
                                spacy_pos_universal_google=token.pos_,
                                iwnlp_lemma=iwnlp_lemma,
                                spacy_ner_type=token.ent_type_,
                                spacy_ner_iob=token.ent_iob_,
                                spacy_is_punct=token.is_punct,
                                spacy_is_space=token.is_space,
                                spacy_like_num=token.like_num,
                                spacy_like_url=token.like_url,
                                spacy_shape=token.shape_,
                                polarity_sentiws=sentiws)
            tokens.append(token_model)
            dependency_model = Dependency(token.i + 1, token.dep_,
                                          token.head.i + 1)
            dependencies.append(dependency_model)
            # print(token_model.token_index_in_sentence, token_model.text.encode('utf-8'),
            # format_iwnlp_lemma(token_model.iwnlp_lemma), token_model.spacy_pos_stts,
            # token_model.spacy_pos_universal_google, token_model.spacy_ner_type, token_model.spacy_ner_iob)
        return {'tokens': tokens, 'dependencies': dependencies}
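
A minimal usage sketch for this wrapper (not part of the original source); it assumes the 'de' spaCy model plus the data/IWNLP and data/sentiws resources are installed, and uses the attribute names from the commented-out debug print above:

wrapper = SpacyWrapper()
result = wrapper.process_sentence('Die Sonne scheint heute in Berlin.')
for token_model in result['tokens']:
    print(token_model.token_index_in_sentence, token_model.text,
          token_model.iwnlp_lemma, token_model.spacy_pos_stts)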
Example #9
    def load(cls, lemmatizer_path):
        lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        return cls(lemmatizer)
Example #10
class LemmatizerPlus(object):
    def __init__(self, lemmatizer_path, nlp):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.stringstore = nlp.vocab.strings
        # self.matcher = PhraseMatcher(nlp.vocab)
        Token.set_extension('iwnlp_lemmas', getter=self.lemmatize, force=True)
        self.lookup = {
            ('fast', ADV): 'fast',
        }

    def __call__(self, doc):
        for token in doc:
            token._.iwnlp_lemmas = self.lemmatize(token)
        return doc

    def lemmatize(self, token):
        """
        TODO: This doc is slightly outdated
        This function uses the IWNLP lemmatizer with a few enhancements for compound nouns and nouns
        with uncommon capitalization. Can also be used to lemmatize tokens with different POS-tags.
        Do not use this function to lemmatize phrases.
        :param token: white space stripped single token (str)
        :return: str # TODO: tuple of type (str, bool)
               value[0]: The lemma of the token if a lemma can be derived, else None.
               # TODO: value[1]: True if the token can be retrieved from the Wiktionary database as is,
               # else False.
        """
        text = token.text.strip()
        pos = token.pos_

        # nothing to lemmatize here
        if pos in {PHRASE, NPHRASE, PUNCT, SPACE, SYM}:
            return text
        # lemmatizations are odd on DET and NUM, so better leave it alone
        if pos in {DET, NUM}:
            return None

        # Wiktionary has no POS PROPN
        if pos == PROPN:
            pos = NOUN

        # first lookup token for given POS in dictionary
        if (text, pos) in self.lookup:
            return self.lookup[(text, pos)]

        value = None
        # default IWNLP lemmatization
        lemm = self.lemmatizer.lemmatize(text, pos)
        # default lemmatization hit?
        if lemm:
            value = lemm[0]

        # default lemmatization miss?
        # apply some rules to derive a lemma from the original token (nouns only)
        elif pos == NOUN:
            # first try default noun capitalization
            lemm = self.lemmatizer.lemmatize(text.title(), pos)
            if lemm:
                value = lemm[0]
            else:
                # still no results: try all noun suffixes
                # TODO: search for a more efficient implementation
                text_low = text.lower()
                tolerance = 3
                for i in range(1, len(text) - tolerance):
                    # looks ugly, but avoids full capitalization
                    text_edit = text_low[i].upper() + text_low[i + 1:]
                    lemm = self.lemmatizer.lemmatize(text_edit, pos)
                    if lemm:
                        value = (text[:i] + lemm[0]).title()
                        break

        # last try: plain lemmatization for all remaining POS tags
        else:
            lemm = self.lemmatizer.lemmatize_plain(text, ignore_case=True)
            if lemm:
                value = lemm[0]

        if value and pos in {
                ADJ, ADP, ADV, AUX, CCONJ, CONJ, INTJ, PART, PRON, SCONJ, VERB
        }:
            value = value.lower()

        if value:
            self.stringstore.add(value)
            self.lookup[(text, pos)] = value
        return value
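
A minimal usage sketch for this component (not part of the original source), assuming a spaCy v2-style pipeline and the module that defines LemmatizerPlus with its POS constants; the lemmatizer path is a placeholder. Unlike Example #7, the iwnlp_lemmas extension here yields a single lemma string (or None) rather than a list:

import spacy

nlp = spacy.load('de')
# placeholder path to the IWNLP lemmatizer JSON
lemmatizer_plus = LemmatizerPlus('data/IWNLP.Lemmatizer_20181001.json', nlp)
nlp.add_pipe(lemmatizer_plus)

doc = nlp('Die Gartenhäuser wurden schnell gebaut.')
for token in doc:
    print(token.text, token._.iwnlp_lemmas)  # single lemma string or None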
Example #11
class Preprocess:
    # for German lemmatization

    nlp = spacy.load('de')

    # IWNLP German Lemmatizations:
    dirname = os.path.dirname(__file__)
    iwnlp_file = os.path.join(dirname, 'data/IWNLP.Lemmatizer_20181001.json')
    #iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20181001.json', ignore_case=True)
    lemmatizer = IWNLPWrapper(lemmatizer_path=iwnlp_file)

    # add custom tokenizer
    nlp.tokenizer = custom_tokenizer(nlp)
    '''
    try:
        # add pipes
        nlp.add_pipe(iwnlp)
        # nlp.add_pipe(__set_custom_boundaries, before='parser')
    except Exception:
        pass
    '''

    stopwords_to_remove_from_default_set = [
        "schlecht", "mensch", "menschen", "beispiel", "gott", "jahr", "jahre",
        "jahren", "nicht", "uhr"
    ]
    for stopword in stopwords_to_remove_from_default_set:
        nlp.vocab[stopword].is_stop = False

    # spaCy token tags that will be removed during preprocessing
    tags_to_remove = [
        '$(',
        '$,',
        '$.',
        'APPR',
        'APPO',
        'APPRART',
        'APZR',
        'ART',
        'ITJ',
        'KOKOM',
        'KON',
        'KOUI',
        'KOUS',  # 'CARD',
        'PDS',
        'PAV',
        'PROAV',
        'PDAT',
        'PIAT',
        'PIDAT',
        'PIS',
        'PPER',
        'PPOSAT',
        'PPOSS',
        'PRELAT',
        'PRELS',
        'PRF',
        'PTKA',  # 'PTKANT',
        'PTKVZ',
        'PTKZU',
        'PWAT',
        'PWAV',
        'PWS',
        'TRUNC',
        'XY',
        'SP',
        'WRP'
    ]

    def __init__(self, text, split_in_sentences=True, with_pos=False):
        '''
        :param text: input text
        :param split_in_sentences: split the text into sentences --> one sub-array per sentence in the Preprocess result
        :param with_pos: if True, yield triples of (<start pos in original text>, <end pos in original text>, token), else only tokens
        '''

        self.text = text
        self.nlp_text = self.nlp(text)

        self.maintain_indeces = []

        self.noun_chunks = self.get_noun_chunks(cleaned=True, flattened=True)
        self.maintain_indeces.extend(index for index in self.noun_chunks
                                     if index not in self.maintain_indeces)

        self.named_entities = self.get_named_entities(flattened=True)
        self.maintain_indeces.extend(index for index in self.named_entities
                                     if index not in self.maintain_indeces)
        self.maintain_indeces.sort()

        self.preprocessed = self.preprocess(sentence_split=split_in_sentences,
                                            with_pos=with_pos)

    def __get_lemma(self, token):
        '''
        use the IWNLP lemma if available, otherwise fall back to the spaCy lemma
        :param token: spaCy token
        :return: lemma (str)
        '''
        #lemma_iwnlp_list = token._.iwnlp_lemmas
        lemma_iwnlp_list = self.lemmatizer.lemmatize_plain(token.text,
                                                           ignore_case=False)
        if lemma_iwnlp_list:
            lemma_iwnlp = lemma_iwnlp_list[0]
            #print(token, ":::", lemma_iwnlp_list[0])
            return lemma_iwnlp

        return token.lemma_

    def get_named_entities(self, only_indeces=True, flattened=False):
        '''
        return an array of named entities (PER: person, LOC: location, ORG: corporate, governmental,
        or other organizational entity, MISC: miscellaneous entities, e.g. events, nationalities,
        products or works of art)
        :param only_indeces: if True, return only token indices instead of (index, token, label) tuples
        :param flattened: if True, return a 1d array, else related entities are grouped in sub-arrays
        :return: array of named entities
        '''
        if flattened:
            named_ents = [
                word.i if only_indeces else (word.i, word, ents.label_)
                for ents in self.nlp_text.ents for word in ents
            ]
        else:
            named_ents = [[
                word.i if only_indeces else (word.i, word, ents.label_)
                for word in ents
            ] for ents in self.nlp_text.ents]
        return named_ents

    def get_noun_chunks(self,
                        only_indices=True,
                        cleaned=True,
                        flattened=False):
        '''
        return an array of noun chunks/noun phrases of the text object
        :param only_indices: if True, return only token indices instead of (index, token) tuples
        :param cleaned: exclude stopwords and punctuation from the noun phrases
        :param flattened: if True, return a 1d array, else related phrases are grouped in sub-arrays
        :return: array of noun phrases
        '''

        # noun_words = [(word.i, word) for ent in text.noun_chunks for word in ent]
        # noun_words = [[(word.i, word) for word in ent] for ent in text.noun_chunks]
        if flattened:
            if cleaned:
                noun_words = [
                    word.i if only_indices else (word.i, word)
                    for ent in self.nlp_text.noun_chunks for word in ent
                    if self.__is_valid_token(word)
                ]
            else:
                noun_words = [
                    word.i if only_indices else (word.i, word)
                    for ent in self.nlp_text.noun_chunks for word in ent
                ]
        else:
            if cleaned:
                noun_words = [[
                    word.i if only_indices else (word.i, word) for word in ent
                    if self.__is_valid_token(word)
                ] for ent in self.nlp_text.noun_chunks]
            else:
                noun_words = [[
                    word.i if only_indices else (word.i, word) for word in ent
                ] for ent in self.nlp_text.noun_chunks]

        return noun_words

    def __is_valid_token(self, token):
        '''
        checks whether a token is valid: not a stopword, punctuation, or whitespace
        :param token: spaCy token
        :return: bool
        '''
        # nlp(token.lower_)[0] works around a spaCy bug --> e.g. "Der" would not be recognized as a stopword, but "der" would
        if not self.nlp(
                token.lower_
        )[0].is_stop and not token.is_punct and not token.is_space:
            return True

        return False

    def __tokenize_words(self, doc, with_pos=False):
        '''
        tokenizes the text and removes unimportant tokens
        :param doc: input spaCy doc
        :param with_pos: if True, yield triples of (<start pos in original text>, <end pos in original text>, token), else only tokens
        :return: 1d array of tokens
        '''
        tokenized_text = [
            (token.idx, token.idx + len(token),
             self.__get_lemma(token).lower())
            if with_pos else self.__get_lemma(token).lower()
            for token in doc
            if (self.__is_valid_token(token)
                and token.tag_ not in self.tags_to_remove)
            or token.i in self.maintain_indeces
        ]

        return tokenized_text

    def __tokenize_to_list_sentences(self, with_pos=False):
        '''
        tokenizes the text and removes unimportant tokens, split by sentences
        :param with_pos: if True, yield triples of (<start pos in original text>, <end pos in original text>, token), else only tokens
        :return: 2d array of tokens grouped in sub-arrays (sentences)
        '''
        filtered_text = []
        for sentence in self.nlp_text.sents:
            filtered_sentence = self.__tokenize_words(sentence,
                                                      with_pos=with_pos)
            filtered_text.append(filtered_sentence)

        return filtered_text

    def preprocess(self, sentence_split=True, with_pos=False):
        '''
        preprocess the text and remove unimportant tokens
        :param sentence_split: split by sentences
        :param with_pos: if True, yield triples of (<start pos in original text>, <end pos in original text>, token), else only tokens
        :return: 1d or 2d array with the preprocessed text
        '''
        if sentence_split:
            preprocessed_text = self.__tokenize_to_list_sentences(
                with_pos=with_pos)
        else:
            preprocessed_text = self.__tokenize_words(self.nlp_text,
                                                      with_pos=with_pos)

        return preprocessed_text
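
A minimal usage sketch for this class (not part of the original source); it assumes the 'de' spaCy model, the IWNLP JSON under data/, and the project's custom_tokenizer helper are importable, since they are loaded at class-definition time:

pre = Preprocess('Der FC Bayern München gewann das Spiel in der Allianz Arena.',
                 split_in_sentences=True, with_pos=False)
print(pre.preprocessed)      # 2d array: one sub-array of lemmatized tokens per sentence
print(pre.named_entities)    # flattened token indices of named entities
print(pre.noun_chunks)       # flattened token indices of cleaned noun chunks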
Example #12
    def setUpClass(self):
        self.iwnlp = IWNLPWrapper(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')
Example #13
class IWNLPWrapperTest(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.iwnlp = IWNLPWrapper(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')

    def test_lemmatize_plain_example1(self):
        predicted = self.iwnlp.lemmatize_plain('Hallo')
        self.assertEqual(predicted, ['Hallo'])

    def test_lemmatize_plain_example2(self):
        predicted = self.iwnlp.lemmatize_plain('Hallo', ignore_case=False)
        self.assertEqual(predicted, ['Hallo'])

    def test_lemmatize_plain_example3(self):
        predicted = self.iwnlp.lemmatize_plain('birne', ignore_case=False)
        self.assertEqual(predicted, None)

    def test_lemmatize_plain_example4(self):
        predicted = self.iwnlp.lemmatize_plain('birne', ignore_case=True)
        self.assertEqual(predicted, ['Birne'])

    def test_lemmatize_plain_example5(self):
        predicted = self.iwnlp.lemmatize_plain('gespielt')
        self.assertEqual(predicted, ['spielen'])

    def test_lemmatize_plain_example6(self):
        predicted = self.iwnlp.lemmatize_plain('schnell')
        self.assertCountEqual(predicted, ['schnellen', 'schnell'])

    def test_lemmatize_plain_example7(self):
        predicted = self.iwnlp.lemmatize_plain('Gartenhäuser')
        self.assertEqual(predicted, ['Gartenhaus'])

    def test_contains_entry_example1(self):
        self.assertEqual(self.iwnlp.contains_entry('Birne'), True)

    def test_contains_entry_example2(self):
        self.assertEqual(self.iwnlp.contains_entry('birne', ignore_case=False), False)

    def test_contains_entry_example3(self):
        self.assertEqual(self.iwnlp.contains_entry('birne', ignore_case=True), True)

    def test_contains_entry_example4(self):
        self.assertEqual(self.iwnlp.contains_entry('groko'), False)

    def test_contains_entry_example5(self):
        self.assertEqual(self.iwnlp.contains_entry('GroKo'), True)

    def test_contains_entry_example6(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', ignore_case=True), True)

    def test_contains_entry_example7(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='Noun'), False)

    def test_contains_entry_example8(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='X'), False)

    def test_contains_entry_example9(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='AdjectivalDeclension'), False)

    def test_contains_entry_example10(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos=["Noun", "X"], ignore_case=True), True)

    def test_lemmatize_example1(self):
        predicted = self.iwnlp.lemmatize('Lkws', pos_universal_google='NOUN')
        self.assertEqual(predicted, ['Lkw'])

    def test_lemmatize_example2(self):
        predicted = self.iwnlp.lemmatize('gespielt', pos_universal_google='VERB')
        self.assertEqual(predicted, ['spielen'])

    def test_get_lemmas_example1(self):
        predicted = self.iwnlp.get_lemmas('groko', pos=["Noun", "X"], ignore_case=True)
        self.assertEqual(predicted, ['GroKo'])
Example #14
from iwnlp.iwnlp_wrapper import IWNLPWrapper

output = 'output/Baselist.txt'
path = "Baselist.txt"
lemmatizer = IWNLPWrapper(lemmatizer_path='IWNLP.Lemmatizer_20170501.json')

with open(path, 'r') as read_file:
    data = read_file.read().splitlines()

tokens = []
lemmatized = []
tags = []

for i in data:
    j = i.split('|')
    tokens.append(j[0].lower())
    tags.append(j[1])

for token, tag in zip(tokens, tags):
    lemma = lemmatizer.lemmatize(token, pos_universal_google=tag)
    lemmatized.append(lemma)

# debug output (disabled):
# for token, lemma in zip(tokens, lemmatized):
#     print(token, lemma)

with open(output, 'w') as write_file:
    for i, lemma in enumerate(lemmatized):
        if lemma is None:
            write_file.write(tokens[i] + '|' + tags[i] + "\n")
        else:
            # assumption: write the first lemma candidate together with the tag
            write_file.write(lemma[0] + '|' + tags[i] + "\n")
Example #15
    def __init__(self, lang):
        self.lang = lang
        self.nlp = spacy.load(lang)
        current_dir = os.path.dirname(__file__)
        self.iwnlp = IWNLPWrapper(lemmatizer_path=current_dir + '/../resources/IWNLP.Lemmatizer_20170501.json')