Example #1

    # expects module-level imports: nltk.corpus.stopwords,
    # iwnlp.iwnlp_wrapper.IWNLPWrapper, and the autosarkasmus Pipeline
    def preprocess(self):

        tokenizedTweets_writer = open(
            './daten/tokenized_tweets_normalized.txt', 'w', encoding='utf-8')
        preprocTweets_writer = open(
            './daten/preprocessed_tweets_normalized.txt', 'w', encoding='utf-8')

        pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
        tweets_tkn, tweets_proc, labels = pp.process()
        assert len(tweets_tkn) == len(tweets_proc) == len(labels)

        # filter stopwords + normalize tokens
        lemmatizer = IWNLPWrapper(
            lemmatizer_path='daten/IWNLP.Lemmatizer_20170501.json')
        # load the stopword list once instead of re-reading it for every token
        german_stopwords = set(stopwords.words('german'))
        lemmatized_tokens = []
        for x in range(len(tweets_tkn)):
            tweet = []
            for token in tweets_tkn[x]:
                if token.lower() in german_stopwords:
                    continue
                try:
                    lemma = lemmatizer.lemmatize_plain(token, ignore_case=True)
                    if lemma:
                        tweet.append(lemma[0])
                    else:
                        tweet.append(token)

                except Exception as e:
                    print(e)
                    # keep the raw token instead of silently dropping it
                    tweet.append(token)

            lemmatized_tokens.append(tweet)

        assert len(lemmatized_tokens) == len(tweets_proc) == len(labels)

        # write preprocessing results to file
        for x in range(len(lemmatized_tokens)):
            t_tweet = " ".join(lemmatized_tokens[x])
            # tok/tag names avoid shadowing the loop variable x
            p_tweet = " ".join(
                str(tok) + "/" + str(tag) for tok, tag in tweets_proc[x])
            label = labels[x]
            tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n")
            preprocTweets_writer.write(p_tweet + "\t" + label + "\n")

        tokenizedTweets_writer.close()
        preprocTweets_writer.close()
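Both output files use the same tab-separated layout: the tweet text, a tab, then the label. A minimal sketch of reading one of them back (a hypothetical reader, assuming the paths used above):

with open('./daten/tokenized_tweets_normalized.txt', encoding='utf-8') as f:
    for line in f:
        # each line is "<space-joined tweet>\t<label>"
        tweet, label = line.rstrip('\n').split('\t')
        print(label, tweet)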
Example #2

from spacy.tokens import Token
from iwnlp.iwnlp_wrapper import IWNLPWrapper


class spaCyIWNLP(object):
    def __init__(self,
                 lemmatizer_path,
                 use_plain_lemmatization=False,
                 ignore_case=False):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.use_plain_lemmatization = use_plain_lemmatization
        self.ignore_case = ignore_case
        # register with a default value so the assignment in __call__ sticks;
        # a getter would recompute the lemmas on every attribute access
        Token.set_extension('iwnlp_lemmas', default=None, force=True)

    def __call__(self, doc):
        for token in doc:
            token._.iwnlp_lemmas = self.get_lemmas(token)
        return doc

    def get_lemmas(self, token):
        if self.use_plain_lemmatization:
            return self.lemmatizer.lemmatize_plain(
                token.text, ignore_case=self.ignore_case)
        else:
            return self.lemmatizer.lemmatize(token.text,
                                             pos_universal_google=token.pos_)
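A short usage sketch for this component, assuming spaCy v2's callable add_pipe API (v3 registers components by name), an installed German model, and the lemmatizer JSON path used in the tests below:

import spacy

nlp = spacy.load('de_core_news_sm')
nlp.add_pipe(spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json'))

doc = nlp('Wir mögen Fußballspiele.')
for token in doc:
    # each token now carries its IWNLP lemmas (a list, or None if no hit)
    print(token.text, token._.iwnlp_lemmas)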
Example #3

from spacy.tokens import Token
from iwnlp.iwnlp_wrapper import IWNLPWrapper

# NOUN, VERB, ADJ, ... here are universal POS-tag strings (token.pos_ values)
# defined elsewhere in the project; PHRASE and NPHRASE are project-specific.


class LemmatizerPlus(object):
    def __init__(self, lemmatizer_path, nlp):
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        self.stringstore = nlp.vocab.strings
        # self.matcher = PhraseMatcher(nlp.vocab)
        # register with a default value so the assignment in __call__ sticks
        Token.set_extension('iwnlp_lemmas', default=None, force=True)
        self.lookup = {
            ('fast', ADV): 'fast',
        }

    def __call__(self, doc):
        for token in doc:
            token._.iwnlp_lemmas = self.lemmatize(token)
        return doc

    def lemmatize(self, token):
        """
        TODO: This doc is slightly outdated
        This function uses the IWNLP lemmatizer with a few enhancements for compound nouns and
        nouns with uncommon capitalization. It can also be used to lemmatize tokens with different
        POS tags.
        Do not use this function to lemmatize phrases.
        :param token: a single spaCy token; its text is whitespace-stripped before lookup
        :return: str # TODO: tuple of type (str, bool)
               value[0]: The lemma of the token if a lemma can be derived, else None.
               # TODO: value[1]: True if the token can be retrieved from the Wiktionary database as is,
               # else False.
        """
        text = token.text.strip()
        pos = token.pos_

        # nothing to lemmatize here
        if pos in {PHRASE, NPHRASE, PUNCT, SPACE, SYM}:
            return text
        # lemmatizations of DET and NUM are odd, so better leave them alone
        if pos in {DET, NUM}:
            return None

        # Wiktionary has no POS PROPN
        if pos == PROPN:
            pos = NOUN

        # first lookup token for given POS in dictionary
        if (text, pos) in self.lookup:
            return self.lookup[(text, pos)]

        value = None
        # default IWNLP lemmatization
        lemm = self.lemmatizer.lemmatize(text, pos)
        # default lemmatization hit?
        if lemm:
            value = lemm[0]

        # default lemmatization miss?
        # apply some rules to derive a lemma from the original token (nouns only)
        elif pos == NOUN:
            # first try default noun capitalization
            lemm = self.lemmatizer.lemmatize(text.title(), pos)
            if lemm:
                value = lemm[0]
            else:
                # still no results: try all noun suffixes
                # TODO: search for a more efficient implementation
                text_low = text.lower()
                tolerance = 3
                for i in range(1, len(text) - tolerance):
                    # looks ugly, but avoids full capitalization
                    text_edit = text_low[i].upper() + text_low[i + 1:]
                    lemm = self.lemmatizer.lemmatize(text_edit, pos)
                    if lemm:
                        value = (text[:i] + lemm[0]).title()
                        break

        # last try: plain lemmatization for all remaining POS tags
        else:
            lemm = self.lemmatizer.lemmatize_plain(text, ignore_case=True)
            if lemm:
                value = lemm[0]

        if value and pos in {
                ADJ, ADP, ADV, AUX, CCONJ, CONJ, INTJ, PART, PRON, SCONJ, VERB
        }:
            value = value.lower()

        if value:
            self.stringstore.add(value)
            self.lookup[(text, pos)] = value
        return value
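LemmatizerPlus plugs into a pipeline the same way. A minimal sketch under the same spaCy v2 assumptions; thanks to the suffix-capitalization fallback, a compound like 'Gartenhäuser' can still be reduced even when the full form misses in IWNLP:

import spacy

nlp = spacy.load('de_core_news_sm')
nlp.add_pipe(LemmatizerPlus('data/IWNLP.Lemmatizer_20170501.json', nlp))

doc = nlp('Die Gartenhäuser wurden schnell gebaut.')
for token in doc:
    print(token.text, token.pos_, token._.iwnlp_lemmas)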
Example #4

import unittest

from iwnlp.iwnlp_wrapper import IWNLPWrapper


class IWNLPWrapperTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.iwnlp = IWNLPWrapper(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')

    def test_lemmatize_plain_example1(self):
        predicted = self.iwnlp.lemmatize_plain('Hallo')
        self.assertEqual(predicted, ['Hallo'])

    def test_lemmatize_plain_example2(self):
        predicted = self.iwnlp.lemmatize_plain('Hallo', ignore_case=False)
        self.assertEqual(predicted, ['Hallo'])

    def test_lemmatize_plain_example3(self):
        predicted = self.iwnlp.lemmatize_plain('birne', ignore_case=False)
        self.assertEqual(predicted, None)

    def test_lemmatize_plain_example4(self):
        predicted = self.iwnlp.lemmatize_plain('birne', ignore_case=True)
        self.assertEqual(predicted, ['Birne'])

    def test_lemmatize_plain_example5(self):
        predicted = self.iwnlp.lemmatize_plain('gespielt')
        self.assertEqual(predicted, ['spielen'])

    def test_lemmatize_plain_example6(self):
        predicted = self.iwnlp.lemmatize_plain('schnell')
        self.assertCountEqual(predicted, ['schnellen', 'schnell'])

    def test_lemmatize_plain_example7(self):
        predicted = self.iwnlp.lemmatize_plain('Gartenhäuser')
        self.assertEqual(predicted, ['Gartenhaus'])

    def test_contains_entry_example1(self):
        self.assertEqual(self.iwnlp.contains_entry('Birne'), True)

    def test_contains_entry_example2(self):
        self.assertEqual(self.iwnlp.contains_entry('birne', ignore_case=False), False)

    def test_contains_entry_example3(self):
        self.assertEqual(self.iwnlp.contains_entry('birne', ignore_case=True), True)

    def test_contains_entry_example4(self):
        self.assertEqual(self.iwnlp.contains_entry('groko'), False)

    def test_contains_entry_example5(self):
        self.assertEqual(self.iwnlp.contains_entry('GroKo'), True)

    def test_contains_entry_example6(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', ignore_case=True), True)

    def test_contains_entry_example7(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='Noun'), False)

    def test_contains_entry_example8(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='X'), False)

    def test_contains_entry_example9(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos='AdjectivalDeclension'), False)

    def test_contains_entry_example10(self):
        self.assertEqual(self.iwnlp.contains_entry('groko', pos=["Noun", "X"], ignore_case=True), True)

    def test_lemmatize_example1(self):
        predicted = self.iwnlp.lemmatize('Lkws', pos_universal_google='NOUN')
        self.assertEqual(predicted, ['Lkw'])

    def test_lemmatize_example2(self):
        predicted = self.iwnlp.lemmatize('gespielt', pos_universal_google='VERB')
        self.assertEqual(predicted, ['spielen'])

    def test_get_lemmas_example1(self):
        predicted = self.iwnlp.get_lemmas('groko', pos=["Noun", "X"], ignore_case=True)
        self.assertEqual(predicted, ['GroKo'])
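To run the suite directly (assuming the lemmatizer JSON is present under data/), the standard unittest entry point suffices:

if __name__ == '__main__':
    unittest.main()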