Example #1
    def __init__(self, **kwargs):
        """
            Sentiment analyzer for German texts.
            Derives the polarity value of a word from the polarity values of
            its associated descriptive words,
            e.g. 'das schöne Wetter' -> polarity of 'Wetter' == polarity of 'schöne'

            Purpose: find out in which sentiment context your keywords appear in a text.
            Note: Works with spacy, nltk and germalemma
        """
        sentiws_path = kwargs.get(
            'sentiws_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/sentiws.pickle"))
        polarity_mod_path = kwargs.get(
            'polarity_modifiers_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/polarity_modifiers.pickle"))
        negations_path = kwargs.get(
            'negations_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/negationen_lexicon.pickle"))
        stts_path = kwargs.get(
            'stts_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/stts.pickle"))
        self.sentiws = pickle.load(open(sentiws_path, 'rb'))
        self.polarity_modifications = pickle.load(open(polarity_mod_path,
                                                       'rb'))
        self.negations = pickle.load(open(negations_path, 'rb'))
        self.nlp = spacy.load("de_core_news_md")
        self.germalemmatizer = GermaLemma()
        self.stts = pickle.load(open(stts_path, 'rb'))
        self.german_stops = stopwords.words('german')
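
The lemma lookup this analyzer builds on is GermaLemma.find_lemma, which takes a token plus a POS hint ('N', 'V', 'ADJ', 'ADV' or a corresponding STTS tag). A minimal sketch of that call, independent of the pickled lexica loaded above:

from germalemma import GermaLemma

lemmatizer = GermaLemma()
print(lemmatizer.find_lemma('schöne', 'ADJ'))  # -> 'schön'
print(lemmatizer.find_lemma('Wetter', 'N'))    # -> 'Wetter' (already a base form)
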
Example #2
    def __init__(self, path: str = "src/data/", windowSize=5) -> None:
        self.path = path
        self.windowSize = windowSize

        self.df_aspect_tokens = None
        self.df_preprocessed = None
        self.df_lexicon = None

        self.lemmatizer = GermaLemma()
Example #3
    def annotate_stw(self, t, clf_class, majority_classes=None):
        """
        Method for annotating a segment with one of the classes speech, thought or writing given
        the STWR classification clf_class.

        :param t: The text of the segment.
        :param clf_class: One of direct, indirect, free_indirect, reported. The predicted class for t.
        :param majority_classes: A dictionary containing the majority classes (one of speech, thought or writing)
                                for each STWR class.
        :return: One of speech, thought or writing; the annotation for t.
        """
        # Get the stored majority classes if no other are given
        if not majority_classes:
            majority_classes = self.majority_classes

        # Direct and free_indirect should always be classified by majority classes as reporting words are more
        # likely to appear outside of segments of these classes.
        if clf_class in ['direct', 'free_indirect']:
            return majority_classes[clf_class]

        # For the other types check for reporting words with unambiguous type else use majority class
        doc = NLP(t)
        # Get lemmata with germalemma as spacy is not good at this
        lemmatizer = GermaLemma()

        lemmata = []
        for token in doc:
            if token.pos_ == "VERB":
                lemmata.append(lemmatizer.find_lemma(token.text, 'V'))

            elif token.pos_ == "NOUN":
                lemmata.append(lemmatizer.find_lemma(token.text, 'N'))

        if len(lemmata) > 0:
            stw_words_t = pd.concat([
                self.stw_words[self.stw_words["Word"].str.contains(
                    r'\b{}\b'.format(re.escape(lemma)))] for lemma in lemmata
            ],
                                    axis=0,
                                    ignore_index=True)
        else:
            stw_words_t = []

        if len(stw_words_t) == 1:
            if stw_words_t["Type"][0] in ["speech", "thought", "writing"]:
                return stw_words_t["Type"][0]
            else:
                return majority_classes[clf_class]

        else:
            return majority_classes[clf_class]
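
The word-list lookup in the middle of the method reduces to a word-boundary regex match per lemma against the 'Word' column; a small self-contained sketch with a toy stand-in for the reporting-word list held in self.stw_words:

import re
import pandas as pd

stw_words = pd.DataFrame({'Word': ['sagen', 'denken', 'schreiben'],
                          'Type': ['speech', 'thought', 'writing']})
lemmata = ['sagen']

hits = pd.concat(
    [stw_words[stw_words['Word'].str.contains(r'\b{}\b'.format(re.escape(lemma)))]
     for lemma in lemmata],
    axis=0, ignore_index=True)
print(hits)  # single row: Word='sagen', Type='speech'
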
Example #4
def lemmatize_tokens(tokens):
    lemmatizer = GermaLemma()
    new_tokens = {}
    for doc_label, tok_pos in tokens.items():
        lemmata_pos = []
        for t, pos in tok_pos:
            try:
                l = lemmatizer.find_lemma(t, pos)
            except ValueError:
                l = t
            lemmata_pos.append((l, pos))
        new_tokens[doc_label] = lemmata_pos

    return new_tokens
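
A minimal usage sketch for this helper; the document labels and (token, POS) pairs below are made up for illustration:

from germalemma import GermaLemma  # required at module level by lemmatize_tokens

tokens = {
    'doc1': [('Häuser', 'N'), ('schöne', 'ADJ')],
    'doc2': [('ging', 'V')],
}
print(lemmatize_tokens(tokens))
# roughly {'doc1': [('Haus', 'N'), ('schön', 'ADJ')], 'doc2': [('gehen', 'V')]}
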
Example #5
def custom_extensions(doc):

    lemmatizer = GermaLemma()
    negation_words = set(["nie", "keinsterweise", "keinerweise", "niemals", "nichts", "kaum", "keinesfalls", "ebensowenig", "nicht", "kein", "keine", "weder"])
    negation_cconj = set(['aber', 'jedoch', 'doch', 'sondern'])

    def lemma_getter(token):
        # if " " in token.text:
        #     return token.lemma_.lower()
        try:
            return lemmatizer.find_lemma(token.text, token.tag_).lower()
        except:
            return token.lemma_.lower()

    def is_negation_getter(token):
        if token._.lemma in negation_words:
            return True
        else:
            return False

    def is_sentence_break_getter(token):
        if token._.lemma in negation_cconj:
            return True
        else:
            return False

    Token.set_extension("lemma", getter=lemma_getter, force=True)
    Token.set_extension("is_negation", getter=is_negation_getter, force=True)
    Token.set_extension("is_sentence_break", getter=is_sentence_break_getter, force=True)
    return doc
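
A hedged sketch of wiring this component into a pipeline; it assumes the de_core_news_md model is installed and uses the spaCy v2-style add_pipe call (spaCy v3 registers components by name via @Language.component instead):

import spacy
from spacy.tokens import Token
from germalemma import GermaLemma

nlp = spacy.load("de_core_news_md")
nlp.add_pipe(custom_extensions, last=True)

doc = nlp("Das ist kein gutes Wetter.")
for token in doc:
    print(token.text, token._.lemma, token._.is_negation)
# 'kein' comes back with is_negation == True
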
Example #6
    def __init__(self, language="english"):
        self.language = language
        self.tagger = Tagger()
        self.stopwords = stopwords.words(language)
        if self.language == "german":
            self.lemmatizer = GermaLemma()
            self.stopwords.append('dass')
        else:
            self.lemmatizer = WordNetLemmatizer()
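
Note that the two lemmatizers picked here do not share an interface, so calling code has to branch on the language. A brief illustration (the NLTK wordnet data must be downloaded for the English case):

from germalemma import GermaLemma
from nltk.stem import WordNetLemmatizer

print(GermaLemma().find_lemma('Wände', 'N'))   # German: token plus POS tag -> 'Wand'
print(WordNetLemmatizer().lemmatize('walls'))  # English: token only -> 'wall'
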
Example #7
class GermaLemma(PipelineModule):
    def __init__(self, pos_prereq):
        self.pos_prereq = pos_prereq
        # NOTE: this class shadows the imported germalemma.GermaLemma; the call
        # below has to resolve to the library class (e.g. via an aliased or
        # module-qualified import), otherwise it recurses into this wrapper.
        self.lemmatizer = GermaLemma(
            tiger_corpus=
            'resources/tiger_release_aug07.corrected.16012013.conll09')

    def targets(self):
        return {'lemma-germalemma'}

    def prerequisites(self):
        return {'token', self.pos_prereq}

    def make(self, prerequisite_data):
        tokens = prerequisite_data['token']
        pos = prerequisite_data[self.pos_prereq]

        pattern1 = re.compile("^[NV]")
        pattern2 = re.compile("^(ADJ|ADV)")

        def lemmatize_token(t, postag):
            try:
                if pattern1.match(postag):
                    return self.lemmatizer.find_lemma(t, postag)
                elif pattern2.match(postag):
                    return self.lemmatizer.find_lemma(t, postag[:3])
                else:
                    return 0
            except Exception as e:
                sys.stderr.write(
                    f"Lemmatizing {t} ({postag}) raised exception: {e}\n")
                return 0

        return {
            'lemma-germalemma':
            list(map(lambda x: lemmatize_token(x[0], x[1]), zip(tokens, pos)))
        }
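
For reference, germalemma can either use its bundled lemmata data (default constructor) or parse a TIGER corpus conll09 export as done above; a minimal sketch, with the library class aliased to avoid the name clash with the wrapper class defined above:

from germalemma import GermaLemma as GermaLemmaLib

lemmatizer = GermaLemmaLib()   # bundled data
# lemmatizer = GermaLemmaLib(
#     tiger_corpus='resources/tiger_release_aug07.corrected.16012013.conll09')
print(lemmatizer.find_lemma('Staaten', 'N'))   # -> 'Staat'
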
Example #8
def create_dictionary(doc_labels, filename):

    def pipe(label):
        doc = nlp(gendocs(label))
        res = []

        for i, sent in enumerate(doc.sents):
            for j, token in enumerate(sent):
                Token.set_extension('lemma', getter=lemma_getter, force=True)
                if not token.is_punct and not token.is_digit and not token.is_space:
                    tok = token._.lemma.lower()
                    tok = tok.replace('.', '')
                    res.append(tok)

        return res

    if os.path.isfile(filename):
        print('File already exists!')
        return

    # create gensim dict & BoW
    lemmatizer = GermaLemma()

    from src.d01_ana.analysis import load_data, gendocs
    def lemma_getter(token):
        try:
            return lemmatizer.find_lemma(token.text, token.tag_).lower()
        except:
            return token.lemma_.lower()

    # doc_labels = random.sample(doc_labels, 100)

    nlp = spacy.load("de_core_news_lg")

    docs = (pipe(label) for label in doc_labels)
    # tokens = [(token for token in doc) for doc in docs]
    tokens = ((token for token in doc) for doc in docs)
    dictionary = corpora.Dictionary()

    BoW_corpus = [dictionary.doc2bow(token, allow_update=True) for token in tokens]

    dictionary.save(filename)

    return dictionary
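
Note that the gensim Dictionary above starts out empty and is only populated as a side effect of doc2bow(..., allow_update=True); a small self-contained sketch of that mechanism:

from gensim import corpora

dictionary = corpora.Dictionary()
docs = [['wetter', 'schön'], ['wetter', 'regen']]
bow = [dictionary.doc2bow(doc, allow_update=True) for doc in docs]
print(dictionary.token2id)  # ids assigned during doc2bow
print(bow)                  # e.g. [[(0, 1), (1, 1)], [(1, 1), (2, 1)]]
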
Example #9
"""
Tests for germalemma module.

Markus Konrad <*****@*****.**>, Wissenschaftszentrum Berlin für Sozialforschung
January 2019
"""

import pytest

from germalemma import GermaLemma

lemmatizer = GermaLemma()

test_table = (
    # known nouns
    (('US-Präsident', 'N'), 'US-Präsident'),
    (('US-Präsidenten', 'N'), 'US-Präsident'),
    (('EG-Staaten', 'N'), 'EG-Staat'),
    (('EG-Staaten', 'NP'), 'EG-Staat'),
    # unknown nouns
    (('US-Präsidentenhaus', 'N'), 'US-Präsidentenhaus'),
    (('US-Präsidentenhäuser', 'N'), 'US-Präsidentenhaus'),
    (('EU-Neu-Delegierte', 'N'), 'EU-Neu-Delegierter'),
    (('Feinstaubbelastungen', 'N'), 'Feinstaubbelastung'),
    # known adjectives
    (('fies', 'ADJ'), 'fies'),
    (('besser', 'ADJ'), 'gut'),
    (('schöne', 'ADJ'), 'schön'),
    # unknown adjectives
    (('unbeschreibliches', 'ADJ'), 'unbeschreiblich'),
    (('klagloser', 'ADJ'), 'klaglos'),
)
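
A sketch of how such a table is typically consumed; the original test function is not part of this excerpt, so the parametrized test below is an illustration:

@pytest.mark.parametrize('token_pos, expected_lemma', test_table)
def test_find_lemma(token_pos, expected_lemma):
    token, pos = token_pos
    assert lemmatizer.find_lemma(token, pos) == expected_lemma
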
Example #10
    def _task_lemmatize(self,
                        pos_tagset,
                        use_dict=False,
                        use_patternlib=False,
                        use_germalemma=None):
        tmp_lemmata = defaultdict(list)

        if use_germalemma is None and self.language == 'german':
            use_germalemma = True

        if use_germalemma:
            if not self.germalemma:
                self.germalemma = GermaLemma()

            for dl, tok_tags in self._tokens.items():
                for t, pos in tok_tags:
                    try:
                        l = self.germalemma.find_lemma(t, pos)
                    except ValueError:
                        l = t
                    tmp_lemmata[dl].append(l)
        else:
            if use_dict and self.lemmata_dict:
                for dl, tok_tags in self._tokens.items():
                    for t, pos in tok_tags:
                        pos = simplified_pos(pos, tagset=pos_tagset)

                        if pos:
                            l = self.lemmata_dict.get(pos, {}).get(t, None)
                            if l == '-' or l == '':
                                l = None
                        else:
                            l = None
                        tmp_lemmata[dl].append(l)

            if use_patternlib:
                if not self.pattern_module:
                    if self.language not in PATTERN_SUBMODULES:
                        raise ValueError(
                            "no CLiPS pattern module for this language:",
                            self.language)

                    modname = 'pattern.%s' % PATTERN_SUBMODULES[self.language]
                    self.pattern_module = import_module(modname)

                for dl, tok_tags in self._tokens.items():
                    tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags))

                    lemmata_final = []
                    for (t, pos), t_found in zip(tok_tags, tok_lemmata):
                        l = t_found

                        if l is None:
                            if pos.startswith('NP'):  # singularize noun
                                l = self.pattern_module.singularize(t)
                            elif pos.startswith('V'):  # get infinitive of verb
                                l = self.pattern_module.conjugate(
                                    t, self.pattern_module.INFINITIVE)
                            elif pos.startswith('ADJ') or pos.startswith(
                                    'ADV'
                            ):  # get baseform of adjective or adverb
                                l = self.pattern_module.predicative(t)

                        lemmata_final.append(l)

                    tmp_lemmata[dl] = lemmata_final

        if len(tmp_lemmata) == 0:
            if not self.wordnet_lemmatizer:
                self.wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

            for dl, tok_tags in self._tokens.items():
                for t, pos in tok_tags:
                    wn_pos = pos_tag_convert_penn_to_wn(pos)
                    if wn_pos:
                        l = self.wordnet_lemmatizer.lemmatize(t, wn_pos)
                    else:
                        l = t
                    tmp_lemmata[dl].append(l)

        # merge
        lemmatized_tokens = {}
        for dl, tok_tags in self._tokens.items():
            tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags))
            new_tok_tags = [(l or t, pos)
                            for (t, pos), l in zip(tok_tags, tok_lemmata)]
            assert len(new_tok_tags) == len(tok_tags)
            lemmatized_tokens[dl] = new_tok_tags

        assert len(lemmatized_tokens) == len(self._tokens)
        self._tokens = lemmatized_tokens
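
The merge step at the end keeps the original token wherever no lemma was found (l is None); a tiny illustration with made-up data:

tok_tags = [('Häuser', 'NN'), ('und', 'KON')]
tok_lemmata = ['Haus', None]
merged = [(l or t, pos) for (t, pos), l in zip(tok_tags, tok_lemmata)]
print(merged)  # [('Haus', 'NN'), ('und', 'KON')]
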
Example #11
class _PreprocWorker(mp.Process):
    def __init__(self,
                 worker_id,
                 docs,
                 language,
                 tasks_queue,
                 results_queue,
                 tokenizer,
                 stemmer,
                 lemmata_dict,
                 pos_tagger,
                 group=None,
                 target=None,
                 name=None,
                 args=(),
                 kwargs=None):
        super(_PreprocWorker, self).__init__(group, target, name, args, kwargs
                                             or {})
        logger.debug('worker `%s`: init with worker ID %d' % (name, worker_id))
        logger.debug('worker `%s`: docs = %s' % (name, str(set(docs.keys()))))
        self.worker_id = worker_id
        self.docs = docs
        self.language = language
        self.tasks_queue = tasks_queue
        self.results_queue = results_queue

        # set a tokenizer
        self.tokenizer = tokenizer  # tokenizer instance (must have a callable attribute `tokenize` with a document
        # text as argument)

        # set a stemmer
        self.stemmer = stemmer  # stemmer instance (must have a callable attribute `stem`)

        # set a POS tagger
        self.pos_tagger = pos_tagger  # POS tagger instance (must have a callable attribute `tag`)

        self.lemmata_dict = lemmata_dict
        self.pattern_module = None  # dynamically loaded CLiPS pattern library module
        self.germalemma = None  # GermaLemma instance
        self.wordnet_lemmatizer = None  # nltk.stem.WordNetLemmatizer instance

        self._tokens = {
        }  # tokens for this worker at the current processing stage. dict with document label -> tokens list
        self._ngrams = {}  # generated ngrams

        #self._filtered = False
        self._orig_tokens = None  # original (unfiltered) tokens, when filtering is currently applied

    def run(self):
        logger.debug('worker `%s`: run' % self.name)

        for next_task, task_kwargs in iter(self.tasks_queue.get, None):
            logger.debug('worker `%s`: received task `%s`' %
                         (self.name, next_task))

            exec_task_fn = getattr(self, '_task_' + next_task)
            if exec_task_fn:
                exec_task_fn(**task_kwargs)
            else:
                raise NotImplementedError("Task not implemented: `%s`" %
                                          next_task)

            self.tasks_queue.task_done()

        logger.debug('worker `%s`: shutting down' % self.name)
        self.tasks_queue.task_done()

    def _put_items_in_results_queue(self, container):
        if container:
            logger.debug('worker `%s`: putting %d results in queue' %
                         (self.name, len(container)))
            for pair in container.items():
                self.results_queue.put(pair)
        else:
            # we *have* to put something in the result queue -> signal that we return "nothing"
            logger.debug('worker `%s`: putting None in results queue' %
                         self.name)
            self.results_queue.put(None)

    def _task_get_tokens(self):
        self._put_items_in_results_queue(self._tokens)

    def _task_get_tokens_with_worker_id(self):
        self.results_queue.put((self.worker_id, self._tokens))

    def _task_get_ngrams(self):
        self._put_items_in_results_queue(self._ngrams)

    def _task_get_ngrams_with_worker_id(self):
        self.results_queue.put((self.worker_id, self._ngrams))

    def _task_get_vocab_doc_freq(self):
        counts = Counter()
        for dt in self._tokens.values():
            counts.update(set(ith_column(dt)))
        self.results_queue.put(counts)

    def _task_get_state(self):
        logger.debug('worker `%s`: getting state' % self.name)

        state_attrs = ('docs', 'language', '_tokens', '_ngrams',
                       '_orig_tokens')

        state = {attr: getattr(self, attr) for attr in state_attrs}
        logger.debug('worker `%s`: got state with %d items' %
                     (self.name, len(state)))
        self.results_queue.put(state)

    def _task_set_tokens(self, tokens):
        logger.debug('worker `%s`: setting tokens' % self.name)
        self._tokens = tokens

    def _task_set_ngrams(self, ngrams):
        logger.debug('worker `%s`: setting ngrams' % self.name)
        self._ngrams = ngrams

    def _task_set_state(self, **state):
        logger.debug('worker `%s`: setting state' % self.name)

        for attr, val in state.items():
            setattr(self, attr, val)

    def _task_tokenize(self):
        self._tokens = {
            dl: tuplize(self.tokenizer.tokenize(txt))
            for dl, txt in self.docs.items()
        }

    def _task_generate_ngrams(self, n, join=True, join_str=' '):
        self._ngrams = {
            dl: create_ngrams(ith_column(dt),
                              n=n,
                              join=join,
                              join_str=join_str)
            for dl, dt in self._tokens.items()
        }

    def _task_use_ngrams_as_tokens(self, join=False, join_str=' '):
        if join:
            new_tok = {
                dl: tuplize([join_str.join(g_tuple) for g_tuple in dg])
                for dl, dg in self._ngrams.items()
            }
        else:
            new_tok = {dl: tuplize(dg) for dl, dg in self._ngrams.items()}

        self._tokens = new_tok

    def _task_transform_tokens(self, transform_fn):
        self._tokens = {
            dl: apply_to_mat_column(dt, 0, transform_fn) if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_stem(self):
        self._tokens = {
            dl: apply_to_mat_column(dt, 0, lambda t: self.stemmer.stem(t))
            if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_pos_tag(self):
        self._tokens = {
            dl: apply_to_mat_column(
                dt, 0, self.pos_tagger.tag, map_func=False, expand=True)
            if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_lemmatize(self,
                        pos_tagset,
                        use_dict=False,
                        use_patternlib=False,
                        use_germalemma=None):
        tmp_lemmata = defaultdict(list)

        if use_germalemma is None and self.language == 'german':
            use_germalemma = True

        if use_germalemma:
            if not self.germalemma:
                self.germalemma = GermaLemma()

            for dl, tok_tags in self._tokens.items():
                for t, pos in tok_tags:
                    try:
                        l = self.germalemma.find_lemma(t, pos)
                    except ValueError:
                        l = t
                    tmp_lemmata[dl].append(l)
        else:
            if use_dict and self.lemmata_dict:
                for dl, tok_tags in self._tokens.items():
                    for t, pos in tok_tags:
                        pos = simplified_pos(pos, tagset=pos_tagset)

                        if pos:
                            l = self.lemmata_dict.get(pos, {}).get(t, None)
                            if l == '-' or l == '':
                                l = None
                        else:
                            l = None
                        tmp_lemmata[dl].append(l)

            if use_patternlib:
                if not self.pattern_module:
                    if self.language not in PATTERN_SUBMODULES:
                        raise ValueError(
                            "no CLiPS pattern module for this language:",
                            self.language)

                    modname = 'pattern.%s' % PATTERN_SUBMODULES[self.language]
                    self.pattern_module = import_module(modname)

                for dl, tok_tags in self._tokens.items():
                    tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags))

                    lemmata_final = []
                    for (t, pos), t_found in zip(tok_tags, tok_lemmata):
                        l = t_found

                        if l is None:
                            if pos.startswith('NP'):  # singularize noun
                                l = self.pattern_module.singularize(t)
                            elif pos.startswith('V'):  # get infinitive of verb
                                l = self.pattern_module.conjugate(
                                    t, self.pattern_module.INFINITIVE)
                            elif pos.startswith('ADJ') or pos.startswith(
                                    'ADV'
                            ):  # get baseform of adjective or adverb
                                l = self.pattern_module.predicative(t)

                        lemmata_final.append(l)

                    tmp_lemmata[dl] = lemmata_final

        if len(tmp_lemmata) == 0:
            if not self.wordnet_lemmatizer:
                self.wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

            for dl, tok_tags in self._tokens.items():
                for t, pos in tok_tags:
                    wn_pos = pos_tag_convert_penn_to_wn(pos)
                    if wn_pos:
                        l = self.wordnet_lemmatizer.lemmatize(t, wn_pos)
                    else:
                        l = t
                    tmp_lemmata[dl].append(l)

        # merge
        lemmatized_tokens = {}
        for dl, tok_tags in self._tokens.items():
            tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags))
            new_tok_tags = [(l or t, pos)
                            for (t, pos), l in zip(tok_tags, tok_lemmata)]
            assert len(new_tok_tags) == len(tok_tags)
            lemmatized_tokens[dl] = new_tok_tags

        assert len(lemmatized_tokens) == len(self._tokens)
        self._tokens = lemmatized_tokens

    def _task_expand_compound_tokens(self,
                                     split_chars=('-', ),
                                     split_on_len=2,
                                     split_on_casechange=False):
        tmp_tokens = {}
        for dl, dt in self._tokens.items():
            nested = [
                expand_compound_token(tup[0], split_chars, split_on_len,
                                      split_on_casechange) for tup in dt
            ]
            tmp_tokens[dl] = tuplize(flatten_list(nested))

        self._tokens = tmp_tokens

    def _task_remove_special_chars_in_tokens(self, special_chars):
        self._tokens = {
            dl: apply_to_mat_column(
                dt,
                0,
                lambda x: remove_special_chars_in_tokens(x, special_chars),
                map_func=False) if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_clean_tokens(self,
                           tokens_to_remove,
                           save_orig_tokens=False,
                           remove_shorter_than=None,
                           remove_longer_than=None,
                           remove_numbers=False):
        if save_orig_tokens:
            self._save_orig_tokens()

        if remove_shorter_than is not None:
            self._tokens = {
                dl: [t for t in dt if len(t[0]) >= remove_shorter_than]
                for dl, dt in self._tokens.items()
            }

        if remove_longer_than is not None:
            self._tokens = {
                dl: [t for t in dt if len(t[0]) <= remove_longer_than]
                for dl, dt in self._tokens.items()
            }

        if remove_numbers:
            self._tokens = {
                dl: [t for t in dt if not t[0].isnumeric()]
                for dl, dt in self._tokens.items()
            }

        if type(
                tokens_to_remove
        ) is not set:  # using a set is much faster than other sequence types for "in" tests
            tokens_to_remove = set(tokens_to_remove)

        self._tokens = {
            dl: [t for t in dt if t[0] not in tokens_to_remove]
            for dl, dt in self._tokens.items()
        }

    def _task_filter_for_token(self,
                               search_token,
                               match_type='exact',
                               ignore_case=False,
                               glob_method='match',
                               remove_found_token=False):
        self._save_orig_tokens()

        self._tokens = filter_for_token(self._tokens,
                                        search_token,
                                        match_type=match_type,
                                        ignore_case=ignore_case,
                                        glob_method=glob_method,
                                        remove_found_token=remove_found_token,
                                        remove_empty_docs=False)

    def _task_filter_for_pos(self,
                             required_pos,
                             pos_tagset,
                             simplify_pos=True):
        self._save_orig_tokens()
        self._tokens = filter_for_pos(self._tokens,
                                      required_pos,
                                      simplify_pos=simplify_pos,
                                      simplify_pos_tagset=pos_tagset)

    def _task_reset_filter(self):
        self._tokens = self._orig_tokens
        self._orig_tokens = None

    def _save_orig_tokens(self):
        if self._orig_tokens is None:  # initial filtering -> save a copy of the original tokens
            self._orig_tokens = deepcopy(self._tokens)
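
The run() loop expects (task_name, kwargs) tuples on the tasks queue and a final None as a poison pill; a hedged sketch of the calling side (the queue setup and the pos_tagset value are assumptions):

import multiprocessing as mp

tasks_queue = mp.JoinableQueue()
results_queue = mp.Queue()
# ... create and start a _PreprocWorker with these queues, then:
tasks_queue.put(('pos_tag', {}))                        # dispatches to _task_pos_tag()
tasks_queue.put(('lemmatize', {'pos_tagset': 'stts'}))  # dispatches to _task_lemmatize(...)
tasks_queue.put(('get_tokens', {}))                     # worker answers on results_queue
tasks_queue.put(None)                                   # shut the worker down
tasks_queue.join()
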
Example #12
# POS tagging (time-consuming!)
#TODO: maybe use faster POS-tagging, e.g. NLTK tagger or ClassifierBasedGermanTagger using TIGER corpus, but spacy has higher accuracy
nlp = spacy.load('de_core_news_md', disable=['ner', 'parser'])
df_articles['Article_POS'] = df_articles['Article'].apply(lambda x: nlp(x))

# Create new column including only nouns (all noun types from STTS tagset)
df_articles['Nouns'] = df_articles['Article_POS'].apply(
    lambda x: [token for token in x if token.tag_.startswith('NN')])

# remove words with length==1
df_articles['Nouns'] = df_articles['Nouns'].apply(
    lambda x: [word for word in x if len(word) > 1])
# df_articles['Nounverbs'] = df_articles['Nounverbs'].apply(lambda x: [word for word in x if len(x)>1])

# Lemmatization
lemmatizer = GermaLemma()

# Lemmatization of Nouns
noun_list = df_articles['Nouns'].tolist()

global noun_lemma_list
noun_lemma_list = []
for doc in noun_list:
    noun_lemma_list.append([])
    for token in doc:
        token_lemma = lemmatizer.find_lemma(token.text, token.tag_)
        token_lemma = token_lemma.lower()
        noun_lemma_list[-1].append(token_lemma)

# Save to help df
df_help_noun_lemma_list = pandas.DataFrame({'x': noun_lemma_list})
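
A compact, self-contained version of the same noun-lemmatization pipeline on a toy DataFrame (column names mirror the snippet above; the sentence is made up):

import pandas
import spacy
from germalemma import GermaLemma

nlp = spacy.load('de_core_news_md', disable=['ner', 'parser'])
lemmatizer = GermaLemma()

df = pandas.DataFrame({'Article': ['Die Häuser stehen an der Straße.']})
df['Nouns'] = df['Article'].apply(
    lambda txt: [tok for tok in nlp(txt) if tok.tag_.startswith('NN') and len(tok) > 1])
df['Noun_Lemmata'] = df['Nouns'].apply(
    lambda toks: [lemmatizer.find_lemma(tok.text, tok.tag_).lower() for tok in toks])
print(df['Noun_Lemmata'].iloc[0])  # e.g. ['haus', 'straße']
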
Example #13
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from germalemma import GermaLemma
import pickle
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger

lemmatizer = GermaLemma()

# passing the word and the POS tag ("N" for noun)
with open('data/pos.pickle', 'rb') as f:
    tagger = pickle.load(f)

pos = tagger.tag(['Jungen', u'Wände', u'Wänden'])
print(pos)
for item in pos:
    w, p = item
    print(lemmatizer.find_lemma(w, p))
#lemma = lemmatizer.find_lemma(u'Jungen', u'N')
#print(lemma)
Example #14
def analysis(doc_labels):

    def gendocs(label):
        with open('data/corpus_clean/{}.txt'.format(label), "r") as text_file:
            return text_file.read()

    # %%

    # %%
    nlp = spacy.load("de_core_news_lg")

    # %%

    lemmatizer = GermaLemma()

    def lemma_getter(token):
        try:
            return lemmatizer.find_lemma(token.text, token.tag_)
        except:
            return token.lemma_

    def is_neg_elite(token):
        global found

        if token._.is_elite_noneg:
            found.append((token.text, None))
            return True

        elif token._.is_elite:
            check = list(token.children)
            # if token.head:
            #     check.append(token.head)
            node = token
            while node.head:
                seen = node
                if seen == node.head:
                    break
                else:
                    check.append(node)
                    node = seen.head
            attr_neg = [child for child in check if child._.lemma.lower() in negativ]
            if attr_neg:
                found.append((token.text, attr_neg))
                return True
            else:
                return False
            # return any([True for child in check if child._.lemma.lower() in negativ])
        else:
            return False


    def is_volk(token):
        global found
        # if token.pos_ == 'NOUN' or token.pos_ == 'PROPN':
            # print(token._.lemma)

        check = list(token.children)

        if token._.lemma.lower() in people:
            found.append((token.text, None))
            return True

        elif token._.lemma.lower() in people_ordinary:
            attr_ppl = [child for child in check if child._.lemma.lower() in attribut_ordinary]
            if attr_ppl:
                found.append((token.text, attr_ppl))
                # print('found attr_ppl')
                return True
            else:
                return False

        elif token._.lemma.lower() in people_ger:
            attr_ger = [child for child in check if child._.lemma.lower() in attribut_ger]
            if attr_ger:
                found.append((token.text, attr_ger))
                # print('found ppl_ger')
                return True
            else:
                return False

        else:
            return False


    people = set(people)
    people_ordinary = set(people_ordinary)
    people_ger = set(people_ger)
    attr_ger = set(attribut_ger)
    elite = [*elite_pol, *elite_eco, *elite_experten, *elite_medien]
    elite = set(elite)
    elite_noneg = set(elite_noneg)

    negativ = set(neg_dict.keys())
    positiv = set(pos_dict.keys())

    dfs = []
    all_sents = []
    res = []

    # doc_labels = doc_labels[1000:1500]
    # doc_labels = random.sample(doc_labels, 100)

    for label in tqdm(doc_labels):

        res_dict = {'doc': None, 'len': None, 'pop': False, 'volk': 0, 'elite': 0, 'sents': None, 'volk_': None, 'elite_': None, 'lemma_pop': None}

        found = []
        doc = nlp(gendocs(label))
        hits = {'volk': [], 'elite': []}
        for i, sent in enumerate(doc.sents):
            # print(sent)
            for j, token in enumerate(sent):
                # is_volk_getter = lambda token: token._.lemma.lower() in volk
                is_elite_getter = lambda token: token._.lemma.lower() in elite
                is_elite_noneg_getter = lambda token: token._.lemma.lower() in elite_noneg
                is_neg_getter = lambda token: token._.lemma.lower() in negativ
                is_pos_getter = lambda token: token._.lemma.lower() in positiv

                Token.set_extension('is_neg', getter=is_neg_getter, force=True)
                Token.set_extension('is_pos', getter=is_pos_getter, force=True)
                Token.set_extension('is_elite', getter=is_elite_getter, force=True)
                Token.set_extension('is_elite_noneg', getter=is_elite_noneg_getter, force=True)
                Token.set_extension('lemma', getter=lemma_getter, force=True)

                is_volk_getter = lambda token: is_volk(token)
                is_neg_elite_getter = lambda token: is_neg_elite(token)

                Token.set_extension('is_volk', getter = is_volk_getter, force=True)
                Token.set_extension('is_neg_elite', getter = is_neg_elite_getter, force=True)

                if token._.is_volk:
                    hits['volk'].append(token._.lemma)

                if token._.is_neg_elite:
                    hits['elite'].append(token._.lemma)
                    all_sents.append(sent)

                # Token.set_extension('is_pos_volk', getter=is_pos_volk_getter_func, force=True)

                # print(token.text, token.lemma_, token._.lemma, token.pos_)
                # print(list(token.children))
        # print(found)

        matcher = Matcher(nlp.vocab)
        pattern = [{'_': {'is_neg_elite': True}}]
        matcher.add('text', None, pattern)
        matches = matcher(doc)
        has_pop = []
        tokens_pop = []
        for match_id, start, end in matches:
            span = doc[max(0, start - 280):end + 280]  # clamp the window so early matches do not wrap around

            for token in span:
                if token._.is_volk:

                    tokens_pop.append(doc[start]._.lemma)
                    tokens_pop.append(token._.lemma)
                    sentence_start = span[0].sent.start
                    sentence_end = span[-1].sent.end
                    has_pop.append(doc[sentence_start : sentence_end].text)

        c_volk = Counter(([token._.is_volk for token in doc]))
        c_neg_elite = Counter(([token._.is_neg_elite for token in doc]))
        tokens_pop_counter = Counter(tokens_pop)

        if has_pop:
            res_dict['pop'] = True
        res_dict['doc'] = label
        res_dict['sents'] = has_pop
        res_dict['elite'] = c_neg_elite[True]
        res_dict['volk'] = c_volk[True]
        res_dict['len'] = len(doc)
        res_dict['volk_'] = hits['volk']
        res_dict['elite_'] = hits['elite']
        res_dict['volk_counter'] = Counter(hits['volk'])
        res_dict['elite_counter'] = Counter(hits['elite'])
        res_dict['hits'] = found
        res_dict['lemma_pop'] = tokens_pop_counter
        res.append(res_dict)
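
The Matcher pattern above keys on a custom extension attribute via the '_' field; a small self-contained sketch of that mechanism (the attribute name and text are made up):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token

nlp = spacy.blank('de')
Token.set_extension('is_flagged', default=False, force=True)

doc = nlp("Die Elite versagt")
doc[1]._.is_flagged = True

matcher = Matcher(nlp.vocab)
matcher.add('flagged', [[{'_': {'is_flagged': True}}]])  # spaCy v3 signature; v2: matcher.add('flagged', None, pattern)
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # -> 'Elite'
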
Example #15
print("running 10 randomized evaluations")
pct_success_all_trials = []
incorrect_lemmata = []
known_incorrect_lemmata_tokens = set()
for _ in range(10):
    shuffle(all_tokens)

    n_split = int(len(all_tokens) * 0.9)
    tokens_a, tokens_b = all_tokens[:n_split], all_tokens[n_split:]

    # build lemmatizer with tokens_a

    lemmata = defaultdict(dict)
    lemmata_lower = defaultdict(dict)
    for token, lemma, pos in tokens_a:
        GermaLemma.add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma,
                                        pos)

    lemmatizer = GermaLemma(lemmata=lemmata, lemmata_lower=lemmata_lower)

    # test lemmatizer with tokens_b

    n_success = 0
    for token, true_lemma, pos in tokens_b:
        found_lemma = lemmatizer.find_lemma(token, pos)
        if found_lemma == true_lemma:
            n_success += 1
        elif found_lemma != token and token not in known_incorrect_lemmata_tokens:
            incorrect_lemmata.append((token, found_lemma, true_lemma))
            known_incorrect_lemmata_tokens |= {token}

    n_all = len(tokens_b)
Example #16
def main():
    # train
    if os.path.exists('./resources/nltk_german_classifier_data.pickle'):
        with open('./resources/nltk_german_classifier_data.pickle', 'rb') as f:
            print('./resources/nltk_german_classifier_data.pickle found')
            tagger = pickle.load(f)
    else:
        print(
            'could not find ./resources/nltk_german_classifier_data.pickle: training: IN PROGRESS'
        )
        tagger = train()
        with open('./resources/nltk_german_classifier_data.pickle', 'wb') as f:
            pickle.dump(tagger, f, protocol=2)
        print('training FINISHED')

    # tokenize
    if os.path.exists('./data/1.pickle'):
        with open('./data/1.pickle', 'rb') as f:
            print('1.pickle found')
            words = pickle.load(f)
    else:
        print('could not find 1.pickle: tokenizing: IN PROGRESS')
        document = open('./resources/logik-band-eins.txt').read()
        tok = Tokenizer()
        tokens = tok.tokenize(document)

        words = []
        i = 0
        for token in tokens:
            if i < 10000:
                v = token.value
                if len(v) > 1 and (not str.isdigit(v)) or True:  # NB: the trailing 'or True' makes this filter a no-op
                    words.append(v)
                # i = i + 1
            else:
                break
        with open('./data/1.pickle', 'wb') as f:
            pickle.dump(words, f, protocol=2)
        print('tokenizing FINISHED')

    # tag
    if os.path.exists('./data/2.pickle'):
        with open('./data/2.pickle', 'rb') as f:
            print('2.pickle found')
            tagged_words = pickle.load(f)
    else:
        print('could not find 2.pickle: tagging: IN PROGRESS')
        tagged_words = tagger.tag(words)
        with open('./data/2.pickle', 'wb') as f:
            pickle.dump(tagged_words, f, protocol=2)

    # filter-in As, Ns, and Vs
    if os.path.exists('./data/3.pickle'):
        with open('./data/3.pickle', 'rb') as f:
            print('3.pickle found')
            filtered_words = pickle.load(f)
    else:
        print('could not find 3.pickle: filtering: IN PROGRESS')
        parts_of_speech = [
            'ADJA',
            'ADJD',
            'NN',
            'NN',
        ]
        filtered_words = list(
            filter(
                lambda word: word[1][0] == 'V' or any(
                    pos == word[1] for pos in parts_of_speech), tagged_words))
        with open('./data/3.pickle', 'wb') as f:
            pickle.dump(filtered_words, f, protocol=2)

    # lemmatize
    if os.path.exists('./data/4.pickle'):
        with open('./data/4.pickle', 'rb') as f:
            print('4.pickle found')
            lemmatized_words = pickle.load(f)
    else:
        print('could not find 4.pickle: lematization: IN PROGRESS')
        lemmatizer = GermaLemma()
        lemmatized_words = []
        for word in filtered_words:
            try:
                lemmatized_words.append(lemmatizer.find_lemma(
                    word[0], word[1]))
            except:
                w = word[0]
                l = word[1]
                print(f"EXCEPT: {w} {l}")
                continue
        with open('./data/4.pickle', 'wb') as f:
            pickle.dump(lemmatized_words, f, protocol=2)

    # filter-out modals
    f = open('./resources/modal-words.txt', 'r')
    modal_words = f.read().splitlines()[:1000]
    non_modals = [item for item in lemmatized_words if item not in modal_words]
    # non_modals = list(filter(lambda word: not any(modal == word for modal in modals), lemmatized_words))
    # modals = []
    # line = f.readline()
    # modals.append(line)
    # while line:
    #     line = f.readline()
    #     modals.append(line)

    for pair in Counter(non_modals).most_common(30):
        print(pair[0] + " " + str(pair[1]))
Example #17
class STWRFeatureExtractor(object):
    """
    Feature extractor for classifiying STWR.
    """

    def __init__(self, sequence_features=True):
        """
        :param sequence_features: If true, use the sequence features (trained on gold labels).
        """

        # Number of features
        self.num_features = 243
        # Names of features - needed for feature inspection
        self.feature_names = ["perc_pos_NNE", "perc_pos_TRUNC", "perc_pos_APPO", "perc_pos_VVPP", "perc_pos_FM",
                              "perc_pos_KOUI", "perc_pos_ITJ", "perc_pos_PTKANT", "perc_pos_$.", "perc_pos_ADJA",
                              "perc_pos_ADJD", "perc_pos_PTKNEG", "perc_pos_PWS", "perc_pos_PRF", "perc_pos_KOUS",
                              "perc_pos_PDS", "perc_pos_VMINF", "perc_pos_VVIZU", "perc_pos_PPOSS", "perc_pos_VVFIN",
                              "perc_pos_VMFIN", "perc_pos_PROAV", "perc_pos_PRELS", "perc_pos_APPR", "perc_pos_PPOSAT",
                              "perc_pos_APZR", "perc_pos_$,", "perc_pos_PIAT", "perc_pos_VMPP", "perc_pos_NE",
                              "perc_pos__SP", "perc_pos_VAPP", "perc_pos_VAIMP", "perc_pos_CARD", "perc_pos_APPRART",
                              "perc_pos_NN", "perc_pos_KOKOM", "perc_pos_PWAT", "perc_pos_PPER", "perc_pos_XY",
                              "perc_pos_ART", "perc_pos_PWAV", "perc_pos_KON", "perc_pos_PTKA", "perc_pos_VVINF",
                              "perc_pos_$(", "perc_pos_PDAT", "perc_pos_PTKZU", "perc_pos_PRELAT", "perc_pos_PIS",
                              "perc_pos_PTKVZ", "perc_pos_VAINF", "perc_pos_ADV", "perc_pos_VAFIN", "perc_pos_VVIMP",
                              "perc_pos_", "perc_pos_SCONJ", "perc_pos_SYM", "perc_pos_VERB", "perc_pos_X", "perc_pos_EOL",
                              "perc_pos_SPACE", "perc_pos_PUNCT", "perc_pos_ADJ", "perc_pos_ADP", "perc_pos_ADV",
                              "perc_pos_AUX", "perc_pos_CONJ", "perc_pos_CCONJ", "perc_pos_DET", "perc_pos_INTJ",
                              "perc_pos_NOUN", "perc_pos_NUM", "perc_pos_PART", "perc_pos_PRON", "perc_pos_PROPN",
                              "num_ents", "num_PER", "num_LOC", "num_ORG", "num_MISC", "colon", "colon_prev", "comma_end",
                              "perc_emph", "question", "open_quote", "close_quote", "in_quotes", "num_prev_in_quotes",
                              "punct_close_quote", "close_quote_comma", "perc_per1", "perc_per2", "perc_per12", "perc_per3",
                              "only_3_prev_5", "only_1_prev_5", "3_1_prev_5", "has_ind", "has_subj", "no_subj", "no_ind",
                              "has_pres", "has_past", "no_past", "no_pres", "embedded", "wuerden_inf", "wuerden",
                              "has_prep_noun_comp", "has_claus_inf_comp", "subj_cand_speaker", "num_cand_speaker",
                              "prev_subj_cand_speaker", "prev_num_cand_speaker", "has_rep_word_0", "has_rep_word_1",
                              "has_rep_word_2", "has_rep_word_3", "has_rep_word_4", "has_rep_word_5", "has_rep_word_le_1",
                              "has_rep_word_le_2", "has_rep_word_le_3", "has_rep_word_le_4", "has_rep_word_le_5", "has_rep_word_noun",
                              "has_rep_word_verb", "has_spec_rep_word_0", "has_spec_rep_word_1", "has_spec_rep_word_2",
                              "has_spec_rep_word_3", "has_spec_rep_word_4", "has_spec_rep_word_5", "has_spec_rep_word_le_1",
                              "has_spec_rep_word_le_2", "has_spec_rep_word_le_3", "has_spec_rep_word_le_4", "has_spec_rep_word_le_5",
                              "num_rep_word_0", "num_rep_word_1", "num_rep_word_2", "num_rep_word_3", "num_rep_word_4", "num_rep_word_5",
                              "num_rep_word_le_1", "num_rep_word_le_2", "num_rep_word_le_3", "num_rep_word_le_4", "num_rep_word_le_5",
                              "num_rep_word_noun", "num_rep_word_verb", "num_spec_rep_word_0", "num_spec_rep_word_1",
                              "num_spec_rep_word_2", "num_spec_rep_word_3", "num_spec_rep_word_4", "num_spec_rep_word_5",
                              "num_spec_rep_word_le_1", "num_spec_rep_word_le_2", "num_spec_rep_word_le_3", "num_spec_rep_word_le_4",
                              "num_spec_rep_word_le_5", "prev_has_rep_word_0", "prev_has_rep_word_1", "prev_has_rep_word_2",
                              "prev_has_rep_word_3", "prev_has_rep_word_4", "prev_has_rep_word_5", "prev_has_rep_word_le_1",
                              "prev_has_rep_word_le_2", "prev_has_rep_word_le_3", "prev_has_rep_word_le_4", "prev_has_rep_word_le_5",
                              "prev_has_rep_word_noun", "prev_has_rep_word_verb", "prev_has_spec_rep_word_0", "prev_has_spec_rep_word_1",
                              "prev_has_spec_rep_word_2", "prev_has_spec_rep_word_3", "prev_has_spec_rep_word_4", "prev_has_spec_rep_word_5",
                              "prev_has_spec_rep_word_le_1", "prev_has_spec_rep_word_le_2", "prev_has_spec_rep_word_le_3",
                              "prev_has_spec_rep_word_le_4", "prev_has_spec_rep_word_le_5", "prev_num_rep_word_0", "prev_num_rep_word_1",
                              "prev_num_rep_word_2", "prev_num_rep_word_3", "prev_num_rep_word_4", "prev_num_rep_word_5",
                              "prev_num_rep_word_le_1", "prev_num_rep_word_le_2", "prev_num_rep_word_le_3", "prev_num_rep_word_le_4",
                              "prev_num_rep_word_le_5", "prev_num_rep_word_noun", "prev_num_rep_word_verb", "prev_num_spec_rep_word_0",
                              "prev_num_spec_rep_word_1", "prev_num_spec_rep_word_2", "prev_num_spec_rep_word_3", "prev_num_spec_rep_word_4",
                              "prev_num_spec_rep_word_5", "prev_num_spec_rep_word_le_1", "prev_num_spec_rep_word_le_2",
                              "prev_num_spec_rep_word_le_3", "prev_num_spec_rep_word_le_4", "prev_num_spec_rep_word_le_5",
                              "max_sim", "max_sim_rep", "perc_deictic", "spec_conjunct", "perc_modal", "perc_neg",
                              "has_facial", "has_gesture", "has_voice", "repetition", "last_direct", "last_indirect", "last_free_indirect",
                              "last_reported", "last_5_direct", "last_5_indirect", "last_5_free_indirect", "last_5_reported",
                              "last_10_direct", "last_10_indirect", "last_10_free_indirect", "last_10_reported", "num_last_10_reported",
                              "len_tokens", "len_chars", "prev_len_tokens", "prev_len_chars", "sum_len_tokens", "sum_len_chars",
                              "paragraph", "prev_paragraph"]

        # Switch to turn off sequence features
        self.sequence_features = sequence_features
        if not self.sequence_features:
            self.feature_names = self.feature_names[:-21] + self.feature_names[-8:]

        # Get all possible tags
        self.tag_map = sorted(NLP.vocab.morphology.tag_map.keys())
        self.pos_map = sorted(spacy.parts_of_speech.NAMES.values())
        # Set up lemmatizer
        self.lemmatizer = GermaLemma()
        # Set up RFTagger
        call(["make"], cwd="RFTagger/src")
        # Load word vectors
        print("Loading word-vectors. This may take a while ...")
        self.wordvecs = KeyedVectors.load_word2vec_format("data/word_vecs/kolimo.model", binary=True)
        print("Done.\n")

    def transform(self, text, original_text = None, backlog=[]):
        """
        Method that transforms the given segments into their feature representation.
        Expects dataframe with column ["text"] or list of spacy tokens along with the original text or string.

        :param text: dataframe with column ["text"] that contains the string segments or list of spacy tokens.
        :param original_text: the original text as string is passed in test mode.
        :param backlog: For test mode, the backlog stores info and labels of former segments and
                        therefore has to be passed back and forth between classifier and feature extractor.
        :return: The transformed segments as pandas Dataframe or list, depending on the type of 'text'
        """

        # If the backlog has not been initialized, initialize it
        if len(backlog) == 0:
            backlog = ["" for i in range(10)] + [0 for i in range(64)]

        # If spacy tokenization and quote annotation has not been performed, do it now
        if type(text) == list:
            tokens = text

        elif type(text) == pd.DataFrame:
            # Get full text for better results in spacy parsing
            full_text = " ".join(text['text'].values)

            doc = NLP(full_text)
            # Exchange tags for quotation marks for special tokens: #OPEN_QUOTE#, #CLOSE_QUOTE#
            doc = annotate_quotes(doc)
            tokens_full_text = [token for token in doc]

        # Transform individual segments
        if type(text) == list:
            return self.transform_segment(tokens, backlog, original_text)

        else:
            output = pd.DataFrame()
            print("Extracting features...")
            for ind, row in text.iterrows():
                # print progress bar
                sys.stdout.write('\r')
                # the exact output you're looking for:
                sys.stdout.write("[%-20s] %d%%" % ('=' * round(ind/(len(text)/20)), round(ind/(len(text)/100))))
                sys.stdout.flush()

                # Get the tokens corresponding to the segment:
                tokens_text = string_tokenize(row['text'])
                tokens = tokens_full_text[:len(tokens_text)]

                # Check that this is correct
                assert tokens_text[-1] == tokens[-1].text
                tokens_full_text = tokens_full_text[len(tokens_text):]

                transformed, backlog = self.transform_segment(tokens, backlog, row['text'])
                output = output.append(pd.Series(transformed), ignore_index = True)

                # Adapt backlog: backlog stores last ten classifications in the first ten positions
                backlog[0:10] = backlog[1:10] + [row['labels_spans']]

            return output, backlog

    def transform_segment(self, tokens, backlog, original_text):
        """
        Transforms an individual segment of tokens, given the information in the backlog,
        into a feature representation.

        :param tokens: list of spacy tokens
        :param backlog: list containing information about the labels and other features of previous segments
        :param original_text: The original text as string
        :return: the feature representation and the updated backlog
        """

        # --- Preprocessing ---
        transformed = []
        token_strings = [token.text for token in tokens]
        # Get lemmata with germalemma as spacy is not good at this, only possible for pos tags N, V, ADJ, ADV
        token_lemmata = []
        for token in tokens:
            if token.pos_ == "VERB":
                token_lemmata.append(self.lemmatizer.find_lemma(token.text, 'V'))
            elif token.pos_ == "NOUN":
                token_lemmata.append(self.lemmatizer.find_lemma(token.text, 'N'))
            elif token.pos_ in ["ADJ", "ADV"]:
                token_lemmata.append(self.lemmatizer.find_lemma(token.text, token.pos_))
            else:
                token_lemmata.append(token.text)

        # Load reporting word list
        stw_words_orig = pd.read_excel("data/stw_words/stw_words_brunner2015.xls")
        # Some words are only usable for reported class
        stw_words_rep = stw_words_orig[stw_words_orig['Marker'] == 'rep']
        stw_words = stw_words_orig[stw_words_orig['Marker'] != 'rep']

        # Do deeper morphological analysis with RFTagger
        file = open("RFTagger/temp.txt", "w")
        file.write("\n".join(token_strings))
        file.close()
        morph_tagged = check_output(["src/rft-annotate", "lib/german.par", "temp.txt"], cwd="RFTagger", stderr=FNULL).decode(
            "utf-8").split("\n")
        # Split morph tags into attributes
        morph_tagged = [morph_tag.split("\t")[1].split(".") if morph_tag != "" else morph_tag for morph_tag in morph_tagged]

        # --- Pos tag features ---
        tags = [token.tag_ for token in tokens]
        pos = [token.pos_ for token in tokens]
        transformed += [(tags.count(tag)/len(tags)) if tag in tags else 0 for tag in self.tag_map]
        transformed += [(pos.count(p) / len(pos)) if p in pos else 0 for p in self.pos_map]

        # --- NE features ---
        doc = NLP(original_text)
        transformed.append(len(doc.ents))
        for ne_type in NE_TYPES:
            transformed.append(int(len([ent for ent in doc.ents if ent.label_ == ne_type]) > 0))

        # --- Special token features ---
        # Colon in this or in previous segment?
        colon_this = int(":" in token_strings)
        transformed.append(colon_this)
        transformed.append(backlog[10])
        # Comma at the end of this segment means that the next segment is an embedded sentence if it has a verb
        comma_end = int(tags[-1] == '$,')
        transformed.append(comma_end)

        # Percentage of 'emphatic' punctuation marks: ?,!,-,–
        transformed.append((token_strings.count('?') + token_strings.count('!') + token_strings.count('-') + token_strings.count('–'))/len(token_strings))
        # Question?
        transformed.append(int((token_strings.count('?') > 0)))

        # Quotes features
        # Opening Quotes in this segment?
        open_quote = len([tag for tag in tags if tag == "#OPEN_QUOTE#"])
        # Closing Quotes in this segment?
        close_quote = len([tag for tag in tags if tag == "#CLOSE_QUOTE#"])
        # In quotes?
        in_quotes = int(backlog[11] > 0 or open_quote > 0)
        transformed.append(open_quote)
        transformed.append(close_quote)
        transformed.append(in_quotes)
        # How many contiguous prev. segments have been in quotes so far? This is meant to tackle errors bc of missing closing quotes
        # as well as marking sequences of embedded narration
        transformed.append(backlog[49])

        # Special combinations direct - full quoted sentence (sent. ending punct. before closing quotes),
        # comma after closing quotes (prob. frame of direct speech)
        transformed.append(int(len([tag for i, tag in enumerate(tags) if tag == "#CLOSE_QUOTE#" and i > 0 and tags[i-1] == "$."]) > 0))
        transformed.append(int((backlog[12] == 1 and token_strings[0] == ",") or (len([tag for i, tag in enumerate(tags) if tag == "#CLOSE_QUOTE#" and i < len(token_strings)-1 and token_strings[i+1] == ","]) > 0)))

        # --- Morphological Features ---
        # percentage of first and second person pronouns (personal, possessive, reflexive)
        per1 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and
                 morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '1']
        per2 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and
                 morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '2']
        per12 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and
                 morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] in ['1', '2']]
        transformed.append(len(per1) / len(token_strings))
        # Second person might be a better feature than 1. and 2. together as it is seldom the perspective of a narrative
        transformed.append(len(per2) / len(token_strings))
        transformed.append(len(per12)/len(token_strings))
        # percentage of third person pronouns (personal, possessive, reflexive)
        per3 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and
                 morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '3']
        transformed.append(len(per3) / len(token_strings))

        # Note changes in the usage of person; this might help to distinguish between third and first person perspective narratives
        # Only third person in prev. five segments?
        transformed.append(int(len([b for b in backlog[43:48] if b == '3']) > 0 and len([b for b in backlog[43:48] if b in ['1', '1_3']]) == 0))
        # Only first person in prev. five segments?
        transformed.append(int(len([b for b in backlog[43:48] if b == '1']) > 0 and len([b for b in backlog[43:48] if b in ['3', '1_3']]) == 0))
        # Mixed first and third person in prev. five segments
        transformed.append(int(len([b for b in backlog[43:48] if b == '3_1']) > 0 or (len([b for b in backlog[43:48] if b == '3']) > 0 and len([b for b in backlog[43:48] if b == '1']) > 0)))

        # tempus and modus features
        has_ind = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and
                           morph_tag[5] == 'Ind']) > 0)
        has_subj = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and
                           morph_tag[5] == 'Subj']) > 0)
        no_subj = int(not any([morph_tag[5] == 'Subj' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']))
        no_ind = int(not any([morph_tag[5] == 'Ind' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']))
        has_pres = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and
                           morph_tag[4] == 'Pres']) > 0)
        has_past = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and
                           morph_tag[4] == 'Past']) > 0)
        no_past = int(not any([morph_tag[4] == 'Past' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']))
        no_pres = int(not any([morph_tag[4] == 'Pres' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']))
        for feature in [has_ind, has_subj, no_subj, no_ind, has_pres, has_past, no_past, no_pres]:
            transformed.append(feature)

        # --- Grammatical features ---
        # Comma at the end of the prev. segment means that this segment is an embedded sentence if it has a verb
        if backlog[13] and any([tag in ['VFIN', 'VAFIN'] for tag in tags]):
            transformed.append(1)
        else:
            transformed.append(0)
        # A form of verb 'würden' + infinitive can be a pointer towards free indirect
        transformed.append(int(any([lemma == 'würden' for lemma in token_lemmata])
                               and any(
            [(tag in ['VAINF', 'VMINF', 'VVINF', 'VVIZU'] and token_lemmata[i] != 'würden') for i, tag in
             enumerate(tags)])))
        transformed.append(int(any([lemma == 'würden' for lemma in token_lemmata])))

        # Noun/prepositional complements of a rep. word point toward reported STW,
        # sentence/infinitive complements point towards indirect STW
        all_stw_words = [token for i,token in enumerate(tokens) if any(stw_words_orig["Word"].str.contains(r'\b{}\b'.format(re.escape(token_lemmata[i]))))]
        has_prep_noun_comp = int(len([rep_word for rep_word in all_stw_words if len([child for child in rep_word.children if child.pos_ in ['ADP', 'PROPN', 'NOUN'] and child.dep_.startswith('o')]) > 0]) > 0)
        has_claus_inf_comp = int(len([rep_word for rep_word in all_stw_words if len([child for child in rep_word.children if child.dep_ == 'oc']) > 0]) > 0)
        transformed.append(has_prep_noun_comp)
        transformed.append(has_claus_inf_comp)

        # --- Possible speaker features ---
        # Is subject a pronoun, a person NE or a "Person" head noun -> possible speaker
        cand_speakers = [tokens[i] for i, tag in enumerate(tags) if (tag in ['PPER', 'PIS', 'PDS'] or (tag in ['NE', 'NNE'] and 'PER' in [ent.label_ for ent in doc.ents if ent.start <= tokens[i].i < ent.end]))]

        # Check whether any noun phrase has a head that is a synset of "Person" in Germanet
        person = []
        with open('data/person.txt', 'r', encoding='utf-8') as f:
            for l in f:
                # strip the trailing newline so the membership test below matches plain tokens
                person.append(l.strip())

        for np in doc.noun_chunks:
            if np.root.text in person:
                cand_speakers.append(np.root)

        subj_cand_speaker = [token for token in cand_speakers if token.dep_ == 'sb']
        # How many possible speakers/addressees are there in relation to the segment length?
        num_cand_speaker = len(cand_speakers)/len(tokens)
        transformed.append(int(len(subj_cand_speaker) > 0))
        transformed.append(num_cand_speaker)
        # Append prev. segments candidate speaker features
        transformed.append(backlog[38])
        transformed.append(backlog[39])

        # --- Reporting word features ---
        # Appearance of reporting word by penalty
        has_rep_word_0 = int(any([stw_words[stw_words["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_1 = int(any([stw_words[stw_words["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_2 = int(any([stw_words[stw_words["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_3 = int(any([stw_words[stw_words["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_4 = int(any([stw_words[stw_words["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_5 = int(any([stw_words[stw_words["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))

        # Appearance of a reporting word with penalty lower than or equal to a given value
        has_rep_word_le_1 = int(any([stw_words[stw_words["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_le_2 = int(any([stw_words[stw_words["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_le_3 = int(any([stw_words[stw_words["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_le_4 = int(any([stw_words[stw_words["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_le_5 = int(any([stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        # Appearance of a noun/verb reporting word -> this might help to differentiate 'reported' from 'direct'/'indirect'
        has_rep_word_noun = int(any([(len(stw_words[(stw_words["Penalty"] <= 5) & stw_words["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))) & stw_words["Word"].str.istitle()]) > 0) for lemma in token_lemmata]))
        has_rep_word_verb = int(any([(len(stw_words[(stw_words["Penalty"] <= 5) & stw_words["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))) & stw_words["Word"].str.islower()]) > 0) for lemma in token_lemmata]))
        for feature in [has_rep_word_0, has_rep_word_1, has_rep_word_2, has_rep_word_3, has_rep_word_4, has_rep_word_5,
                        has_rep_word_le_1, has_rep_word_le_2, has_rep_word_le_3, has_rep_word_le_4, has_rep_word_le_5,
                        has_rep_word_noun, has_rep_word_verb]:
            transformed.append(feature)

        # Appearance of special reporting words for reported class by penalty
        has_spec_rep_word_0 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_1 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_2 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_3 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_4 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_5 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))

        # Appearance of special reporting words with penalty lower than or equal to a given value
        has_spec_rep_word_le_1 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_le_2 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_le_3 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_le_4 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_le_5 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        
        for feature in [has_spec_rep_word_0, has_spec_rep_word_1, has_spec_rep_word_2, has_spec_rep_word_3, has_spec_rep_word_4,
                        has_spec_rep_word_5,
                        has_spec_rep_word_le_1, has_spec_rep_word_le_2, has_spec_rep_word_le_3, has_spec_rep_word_le_4,
                        has_spec_rep_word_le_5]:
            transformed.append(feature)

        # Number of reporting words by penalty
        num_rep_word_0 = sum([stw_words[stw_words["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_1 = sum([stw_words[stw_words["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_2 = sum([stw_words[stw_words["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_3 = sum([stw_words[stw_words["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_4 = sum([stw_words[stw_words["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_5 = sum([stw_words[stw_words["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])

        # Number of reporting words with penalty lower than or equal to a given value
        num_rep_word_le_1 = sum([stw_words[stw_words["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_le_2 = sum([stw_words[stw_words["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_le_3 = sum([stw_words[stw_words["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_le_4 = sum([stw_words[stw_words["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_le_5 = sum([stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        # Number of noun/verb reporting words -> this might help to differentiate 'reported' from 'direct'/'indirect'
        num_rep_word_noun = sum([(len(stw_words[(stw_words["Penalty"] <= 5) & stw_words["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))) & stw_words["Word"].str.istitle()]) > 0) for lemma in token_lemmata])
        num_rep_word_verb = sum([(len(stw_words[(stw_words["Penalty"] <= 5) & stw_words["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))) & stw_words["Word"].str.islower()]) > 0) for lemma in token_lemmata])
        for feature in [num_rep_word_0, num_rep_word_1, num_rep_word_2, num_rep_word_3, num_rep_word_4,
                        num_rep_word_5,
                        num_rep_word_le_1, num_rep_word_le_2, num_rep_word_le_3, num_rep_word_le_4,
                        num_rep_word_le_5,
                        num_rep_word_noun, num_rep_word_verb]:
            transformed.append(feature)

        # Number of special reporting words for reported class by penalty
        num_spec_rep_word_0 = sum([stw_words_rep[stw_words_rep["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_1 = sum([stw_words_rep[stw_words_rep["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_2 = sum([stw_words_rep[stw_words_rep["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_3 = sum([stw_words_rep[stw_words_rep["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_4 = sum([stw_words_rep[stw_words_rep["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_5 = sum([stw_words_rep[stw_words_rep["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])

        # Number of special reporting words with penalty lower than or equal to a given value
        num_spec_rep_word_le_1 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_le_2 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_le_3 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_le_4 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_le_5 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])

        for feature in [num_spec_rep_word_0, num_spec_rep_word_1, num_spec_rep_word_2, num_spec_rep_word_3,
                        num_spec_rep_word_4,
                        num_spec_rep_word_5,
                        num_spec_rep_word_le_1, num_spec_rep_word_le_2, num_spec_rep_word_le_3,
                        num_spec_rep_word_le_4,
                        num_spec_rep_word_le_5]:
            transformed.append(feature)
            
        # Reporting word features prev. segment
        for feature in backlog[14:38]:
            transformed.append(feature)
        for feature in backlog[50:74]:
            transformed.append(feature)

        # Word vectors
        # Get prototypical word vector for reporting words (iterate over the "Word" column, not the DataFrame itself)
        proto_rep_vec = numpy.average([self.wordvecs[word] for word in stw_words[stw_words["Penalty"] == 0]["Word"] if word in self.wordvecs], axis=0)
        # Get prototypical word vector for reported class
        proto_rep_vec_reporting = numpy.average([self.wordvecs[word] for word in stw_words_rep[stw_words_rep["Penalty"] == 0]["Word"] if word in self.wordvecs], axis=0)
        # Append highest similarity values to proto word vectors within the segment
        max_sim = .0
        max_sim_rep = .0
        for lemma in token_lemmata:
            if lemma in self.wordvecs:
                lemma_vec = self.wordvecs[lemma]
                # cosine similarity = 1 - cosine distance
                sim = 1 - distance.cosine(lemma_vec, proto_rep_vec)
                sim_rep = 1 - distance.cosine(lemma_vec, proto_rep_vec_reporting)

                if sim > max_sim:
                    max_sim = sim
                if sim_rep > max_sim_rep:
                    max_sim_rep = sim_rep

        transformed.append(max_sim)
        transformed.append(max_sim_rep)

        # --- Other word features ---
        # Usage of deictic words can point to character speech - percentage of deictic words
        transformed.append(len([t for t in token_strings if t in DEICTIC])/len(token_strings))
        # Usage of special conjunction at the beginning of the segment can point to indirect
        transformed.append(int(token_strings[0] in CONJUNCT))
        # Usage of modal particles can point towards character speech
        transformed.append(len([t for t in token_strings if t in MODAL_PART])/len(token_strings))
        # Negation?
        transformed.append(len([lemma for lemma in token_lemmata if lemma in NEG])/len(token_strings))

        # Words describing facial expressions, gestures, voice might hint towards STWR
        transformed.append(int(len([lemma for lemma in token_lemmata if lemma in FACIAL]) > 0))
        transformed.append(int(len([lemma for lemma in token_lemmata if lemma in GESTURE]) > 0))
        transformed.append(int(len([lemma for lemma in token_lemmata if lemma in VOICE]) > 0))

        # The repetition of words can hint towards figural speech
        transformed.append(int(any([count >= 2 for count in [token_lemmata.count(el) for el in token_lemmata]])))

        # --- Sequential features ---

        if self.sequence_features:

            # Labels of prev. segment
            labels_last = [l for i,l in enumerate(backlog[9].split(",")) if i%3==0]
            transformed.append(int(any([l.startswith('direct') for l in labels_last])))
            transformed.append(int(any([l.startswith('indirect') for l in labels_last])))
            transformed.append(int(any([l.startswith('free_indirect') for l in labels_last])))
            transformed.append(int(any([l.startswith('reported') for l in labels_last])))
            # Label appears in 5 prev. segments
            labels_last_5 = [fin_l for ls in [[l for i, l in enumerate(label.split(",")) if i % 3 == 0] for label in backlog[5:10]] for fin_l in ls]
            transformed.append(int(any([l.startswith('direct') for l in labels_last_5])))
            transformed.append(int(any([l.startswith('indirect') for l in labels_last_5])))
            transformed.append(int(any([l.startswith('free_indirect') for l in labels_last_5])))
            transformed.append(int(any([l.startswith('reported') for l in labels_last_5])))
            # How many labels for each class and overall within the last 10 segments
            labels_last_10 = [fin_l for ls in [[l for i, l in enumerate(label.split(",")) if i % 3 == 0] for label in backlog[0:10]] for fin_l in ls if fin_l != ""]
            transformed.append(len([l for l in labels_last_10 if l.startswith('direct')]))
            transformed.append(len([l for l in labels_last_10 if l.startswith('indirect')]))
            transformed.append(len([l for l in labels_last_10 if l.startswith('free_indirect')]))
            transformed.append(len([l for l in labels_last_10 if l.startswith('reported')]))
            transformed.append(len(labels_last_10))

        # --- Other features ---
        # Segment and character lengths
        transformed.append(len(token_strings))
        transformed.append(len(original_text))
        # Segment and character lengths of prev. segment
        transformed.append(backlog[40])
        transformed.append(backlog[41])
        # Segment and character lengths of this + prev. segment
        transformed.append(len(token_strings) + backlog[40])
        transformed.append(len(original_text) + backlog[41])
        # Is this segment at the start or end of a paragraph?
        paragraph_end = int("<p>" in original_text)
        transformed.append(paragraph_end)
        transformed.append(backlog[42])

        # --- Update Backlog ---
        # [0:10] encode labels of previous ten segments -> updated elsewhere
        # 10: Colon in prev. segment
        backlog[10] = colon_this
        # 11: How many open quotes
        backlog[11] += open_quote
        if backlog[11] - close_quote >= 0:
            backlog[11] -= close_quote
        else:
            backlog[11] = 0
        # 12: Prev. segment ends with close_quote
        backlog[12] = int(tags[-1] == "#CLOSE_QUOTE#")
        # 13: Comma at the end of this segment
        backlog[13] = comma_end
        # [14:38] reporting word appearance features of the prev. segment
        for i, feature in enumerate([has_rep_word_0, has_rep_word_1, has_rep_word_2, has_rep_word_3, has_rep_word_4,
                                     has_rep_word_5,
                                     has_rep_word_le_1, has_rep_word_le_2, has_rep_word_le_3, has_rep_word_le_4,
                                     has_rep_word_le_5,
                                     has_rep_word_noun, has_rep_word_verb,
                                     has_spec_rep_word_0, has_spec_rep_word_1, has_spec_rep_word_2, has_spec_rep_word_3,
                                     has_spec_rep_word_4, has_spec_rep_word_5,
                                     has_spec_rep_word_le_1, has_spec_rep_word_le_2, has_spec_rep_word_le_3,
                                     has_spec_rep_word_le_4, has_spec_rep_word_le_5
            ]):
            backlog[14 + i] = feature
        # 38: Candidate speakers as subject
        backlog[38] = int(len(subj_cand_speaker) > 0)
        # 39: Percentage of candidate speakers
        backlog[39] = num_cand_speaker
        # 40, 41: lengths of prev. segment
        backlog[40] = len(token_strings)
        backlog[41] = len(original_text)
        # 42: paragraph end
        backlog[42] = paragraph_end

        # [43:48]: keep track of pronoun person appearances in the 5 prev. segments
        backlog[43:47] = backlog[44:48]
        if per3:
            if per1:
                backlog[48] = '3_1'
            else:
                backlog[48] = '3'
        elif per1:
            backlog[48] = '1'
        else:
            backlog[48] = '-'

        # 49: How many contiguous prev. segments have been in quotes?
        if in_quotes:
            backlog[49] += 1
        else:
            backlog[49] = 0

        # [50:74] reporting word count features of the prev. segment
        for i, feature in enumerate([num_rep_word_0, num_rep_word_1, num_rep_word_2, num_rep_word_3, num_rep_word_4,
                                     num_rep_word_5,
                                     num_rep_word_le_1, num_rep_word_le_2, num_rep_word_le_3, num_rep_word_le_4,
                                     num_rep_word_le_5,
                                     num_rep_word_noun, num_rep_word_verb,
                                     num_spec_rep_word_0, num_spec_rep_word_1, num_spec_rep_word_2,
                                     num_spec_rep_word_3,
                                     num_spec_rep_word_4, num_spec_rep_word_5,
                                     num_spec_rep_word_le_1, num_spec_rep_word_le_2, num_spec_rep_word_le_3,
                                     num_spec_rep_word_le_4, num_spec_rep_word_le_5
                                     ]):
            backlog[50 + i] = feature

        return transformed, backlog
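The transform method above returns both the feature vector and the updated backlog, so the caller has to thread the backlog through consecutive segments. Below is a minimal sketch of that driving loop; the toy transform, the segment list and the initialisation are assumptions for illustration, only the 74-slot backlog size is taken from the code above.

def toy_transform(segment_tokens, backlog):
    # one illustrative feature: was the previous segment longer than this one?
    features = [int(backlog[0] > len(segment_tokens))]
    backlog[0] = len(segment_tokens)  # update the state used by the next segment
    return features, backlog

segments = [["Er", "sagte", ":"], ["Komm", "her"], ["Sie", "ging", "schnell", "."]]
backlog = [0] * 74                    # same number of slots as the backlog used above
feature_matrix = []
for seg in segments:
    feats, backlog = toy_transform(seg, backlog)
    feature_matrix.append(feats)
print(feature_matrix)                 # [[0], [1], [0]]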
def get_mean_and_ci(series):
    # mean success rate plus a 95% normal-approximation confidence interval
    match_mean = series.mean()
    match_sd = series.std()
    match_se = match_sd / sqrt(len(eval_df))
    ci_upper = match_mean + 1.96 * match_se
    ci_lower = match_mean - 1.96 * match_se

    return match_mean * 100, ci_lower * 100, ci_upper * 100


print("loading data...")

eval_df = pd.read_csv('eval_table/eval_table_lemmata.csv')
eval_df = eval_df.loc[~eval_df.lemma.isna(), :]

print('loaded %d rows' % len(eval_df))

lemmatizer = GermaLemma()

eval_df['germalemma'] = eval_df.apply(lambda row: lemmatizer.find_lemma(row[3], row[2]), axis=1)

eval_df['match'] = eval_df.lemma == eval_df.germalemma
eval_df.head()

print('wrong lemmata:')
print(eval_df.loc[~eval_df.match, ['token', 'pos', 'lemma', 'germalemma']])

match_mean, ci_lower, ci_upper = get_mean_and_ci(eval_df.match)

print('Success rate for germalemma: %.2f%% (95%% CI: [%.2f%%, %.2f%%])' % (match_mean, ci_lower, ci_upper))

eval_df['pattern'] = eval_df.apply(lambda row: lemma_via_patternlib(row[3], row[2]), axis=1)
eval_df['match_pattern'] = eval_df.lemma == eval_df.pattern
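A plausible continuation (not shown in the snippet above) is to report the success rate of the pattern-based lemmatizer in the same way as for germalemma; the variable names here are illustrative.

match_mean_pat, ci_lower_pat, ci_upper_pat = get_mean_and_ci(eval_df.match_pattern)

print('Success rate for pattern: %.2f%% (95%% CI: [%.2f%%, %.2f%%])' % (match_mean_pat, ci_lower_pat, ci_upper_pat))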
Exemple #19
0
def comment_to_topic(comment):

    # load and define stuff
    lemmatizer = GermaLemma()
    lemmas = []
    remove = [
        line.rstrip('\n')
        for line in open('reviews/add-stopwords.txt', encoding="utf-8")
    ]
    stop = stopwords.words('german')
    exclude_words = remove + stop
    exclude = {
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
        '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~'
    }

    with open('reviews/nltk_german_classifier_data.pickle', 'rb') as f:
        tagger = pickle.load(f)

    # sentence splitting
    comment = nltk.sent_tokenize(comment)

    lemmas = []

    for j in range(len(comment)):
        # tokenization
        comment[j] = nltk.word_tokenize(comment[j])

        # punctuation removal
        comment[j] = [
            token for token in comment[j]
            if token not in exclude and token.isalpha()
        ]

        # POS taging
        comment[j] = tagger.tag(comment[j])

        # lemmatization

        for k in range(len(comment[j])):
            try:
                lemmas.append(
                    lemmatizer.find_lemma(comment[j][k][0], comment[j][k][1]))
            except ValueError:
                pass

    # lower
    lemmas = [word.lower() for word in lemmas]

    # stopword removal
    topics = [word for word in lemmas if word not in exclude_words]

    # make topics html-safe
    topics_safe = [
        t.replace('ä', 'ae').replace('ü',
                                     'ue').replace('ö',
                                                   'oe').replace('ß', 'ss')
        for t in topics
    ]

    return topics, topics_safe
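A hedged usage sketch for comment_to_topic: it assumes the resource files referenced above (reviews/add-stopwords.txt and reviews/nltk_german_classifier_data.pickle) as well as NLTK's 'punkt' and 'stopwords' data are available; the review text is made up.

if __name__ == "__main__":
    topics, topics_safe = comment_to_topic(
        "Das Essen war sehr lecker, aber die Bedienung war leider ziemlich langsam.")
    print(topics)        # lemmatized, lower-cased content words
    print(topics_safe)   # the same topics with umlauts and ß transliterated for HTML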
Exemple #20
0
"""

from processor import TextRank as tcf
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from germalemma import GermaLemma

# Further packages for preprocessing: https://github.com/jfilter/german-preprocessing
# GermaLemma: https://github.com/WZBSocialScienceCenter/germalemma

#------------------------------ Parameters and naming ---------------------
gerLem = GermaLemma()
lem = WordNetLemmatizer()
stem = PorterStemmer()
#--------------------------------- Functions -------------------------------------------


def PrePross(ListofSentences,_comma=False, Fuzzy=False, FuzzyRank=False,
             _reversed = False, Remove_specCar = False, IgnoreWord_list = [None],
             stem=False, stopwords=[]):
    '''
    Function to prepare the text. Expects a DataFrame and the name of the column
    in which the texts are located.

    Args:
        ListofSentences (): list of text data
        _comma (): bool - should
class SentimentDetector:
    def __init__(self, path: str = "src/data/", windowSize=5) -> None:
        self.path = path
        self.windowSize = windowSize

        self.df_aspect_tokens = None
        self.df_preprocessed = None
        self.df_lexicon = None

        self.lemmatizer = GermaLemma()

    def downloadLexicon(
        self,
        filename: str = "sentiment_lexicon.csv",
        url:
        str = "https://raw.githubusercontent.com/sebastiansauer/pradadata/master/data-raw/germanlex.csv",
        chunk_size: int = 1024,
    ) -> None:
        """
        Download sentiment lexicon.

        Args:
            filename (str, optional): Name of the file to save to. Defaults to "sentiment_lexicon.csv".
            url (str, optional): Defaults to "https://raw.githubusercontent.com/sebastiansauer/pradadata/master/data-raw/germanlex.csv".
            chunk_size (int, optional): Chunk size for downloading larger files. Defaults to 1024.
        """
        r = requests.get(url, stream=True)

        file_size = int(r.headers.get("Content-Length", None))
        num_bars = NP.ceil(file_size / (chunk_size))

        downloadProgress = tqdm(total=num_bars,
                                desc="Downloading Lexicon...",
                                unit="B",
                                unit_scale=True)

        with open(self.path + filename, "wb") as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                downloadProgress.update(len(chunk))
                fd.write(chunk)
        downloadProgress.close()

    def loadCSVs(
        self,
        tokenFilename: str = "data_aspects_tokens.csv",
        preprocessedFilename: str = "data_preprocessed.csv",
        lexiconFilename: str = "sentiment_lexicon.csv",
    ) -> bool:
        """
        Load all CSV files required to run the detector and set indices as appropriate.

        Args:
            tokenFilename (str, optional): Defaults to "data_aspects_tokens.csv".
            preprocessedFilename (str, optional): Defaults to "data_preprocessed.csv".
            lexiconFilename (str, optional): Defaults to "sentiment_lexicon.csv".

        Returns:
            bool: successful execution

        """
        try:
            if self.df_aspect_tokens is None or self.df_aspect_tokens.empty:
                self.df_aspect_tokens = PD.read_csv(self.path + tokenFilename)

                self.df_aspect_tokens["polarity_strength"] = PD.NaT
                self.df_aspect_tokens["polarity_strength"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["sentiment_words"] = PD.NaT
                self.df_aspect_tokens["sentiment_words"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["intensifier_words"] = PD.NaT
                self.df_aspect_tokens["intensifier_words"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["word_found"] = self.df_aspect_tokens[
                    "word_found"].str.replace(r"[^\w]*", "", regex=True)

                # TODO remove after debugging
                # self.df_aspect_tokens = self.df_aspect_tokens[:100]

            if self.df_preprocessed is None or self.df_preprocessed.empty:
                self.df_preprocessed = PD.read_csv(self.path +
                                                   preprocessedFilename)

                # pandas read_csv does not read arrays correctly so we need to adjust those
                tqdm.pandas(desc="Applying Datatype Transformations....")
                self.df_preprocessed["tokens"] = self.df_preprocessed[
                    "tokens"].progress_apply(lambda x: json.loads(x))

            if self.df_lexicon is None or self.df_lexicon.empty:
                if not os.path.exists(self.path + lexiconFilename):
                    self.downloadLexicon()

                self.df_lexicon = PD.read_csv(self.path + lexiconFilename)
                self.df_lexicon.drop_duplicates(subset=["word", "qualifier"],
                                                inplace=True)
                self.df_lexicon.set_index("word", inplace=True)
                self.df_lexicon.drop("%%")

            return True
        except IOError as e:
            print(e)
            return False

    def loadSpacyModel(
        self,
        model: str = "de_core_news_lg",
        disableList: list[str] = ["ner", "textcat"],
    ) -> bool:
        """
        Load the spaCy model with the required pipeline components.

        Args:
            model (str, optional): name of the model. Defaults to "de_core_news_lg".
            disableList (list[str], optional): list of pipeline components to disable. Defaults to ["ner", "textcat"].
        """
        try:
            self.nlp = spacy.load(model, disable=disableList)
            return True
        except OSError:
            print("Model not found. Attempting to download..")
            try:
                spacy.cli.download(model)
            except Exception as e:
                print(e)
                return False
            self.nlp = spacy.load(model, disable=disableList)
            return True

    def checkValidChild(self, child, childType: ChildType) -> bool:
        if childType == ChildType.DESCRIPTOR:
            if (child.tag_ == "ADJA"
                    and child.pos_ == "ADJ") or (child.pos_ == "ADV"
                                                 and child.tag_ == "ADJD"):
                return True
            return False
        elif childType == ChildType.INTENSIFIER:
            if child.pos_ == "ADJ" or child.pos_ == "ADV":
                return True
            return False
        else:
            print("Wrong childType.")
            return False

    def checkPolarityAdjective(self, child, rowIdx) -> float:
        """
        check if the given word has an entry in the sentiment lexicon and return given polarity strength

        Args:
            child (spacy.Token): tokenized word with tagged 'pos_' and 'text'

        Returns:
            pol_strength (float): polarity_strength of given word found in sentiment lexicon
        """

        child_normalized = re.sub(r"[^\w]*", "", child.text)  # strip non-word characters; str.replace would treat the pattern literally (assumes "import re" at module level)

        lexEntry = self.checkLexicon(child_normalized)

        if lexEntry is None:
            lexEntry = self.checkLexicon(child_normalized.lower())

        if lexEntry is None:
            lemma = self.lemmatizer.find_lemma(child_normalized, child.pos_)
            lexEntry = self.checkLexicon(lemma)

        if lexEntry is None:
            return 1

        if type(lexEntry["qualifier"]) == str:
            pol_strength = lexEntry["polarity_strength"]
            if lexEntry["qualifier"] == "NEG":
                return -pol_strength
            return pol_strength
        else:
            for i, qualifier in enumerate(lexEntry["qualifier"].values):
                if qualifier == "POS":
                    return lexEntry["polarity_strength"][i]
                if qualifier == "NEG":
                    return -lexEntry["polarity_strength"][i]
            return 0

    def checkLexicon(self, word) -> PD.Series:
        """
        Check for valid lexicon entries return None if not found

        Args:
            word (str): word to be use as key

        Returns:
            PD.Series: Series that is found for the given key or None
        """
        try:
            return self.df_lexicon.loc[word]
        except KeyError:
            return None

    def checkForIntensifier(self, child, rowIdx) -> float:
        """
        For a given spacy.Token (child) check if any of the children is an intensifier and if so, return their polarity_strength

        Args:
            child (spacy.Token): tokenized word with tagged 'pos_' and 'text'

        Returns:
            polarity_multiplier (float): polarity_multiplier of found intensifier word
        """
        child_normalized = re.sub(r"[^\w]*", "", child.text)  # strip non-word characters; str.replace would treat the pattern literally (assumes "import re" at module level)
        # catch words that are not in the sentiment lexicon

        lexEntry = self.checkLexicon(child_normalized)

        if lexEntry is None:
            lexEntry = self.checkLexicon(child_normalized.lower())

        if lexEntry is None:
            lemma = self.lemmatizer.find_lemma(child_normalized, child.pos_)
            lexEntry = self.checkLexicon(lemma)

        if lexEntry is None:
            return 1

        if type(lexEntry["qualifier"]) == str:
            if lexEntry["qualifier"] == "INT":

                self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                    child.text)
                return lexEntry["polarity_strength"]
            elif lexEntry["qualifier"] == "SHI":
                self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                    child.text)
                return -1
            else:
                return 1

        else:
            for i, qualifier in enumerate(lexEntry["qualifier"].values):
                # TODO currently the first qualifier found is taken, without considering which the most fitting one is
                if qualifier == "INT":

                    self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                        child.text)
                    return lexEntry["polarity_strength"][i]
                elif qualifier == "SHI":
                    self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                        child.text)
                    return -1
            return 1

    def calcTotalPolarityStrength(self, child, rowIdx) -> float:
        """
        Calculate the total polarity for a given word

        Args:
            child (spacy.Token): the tokenized word with tagged 'pos_' and 'text'

        Returns:
            polarity_strength (float): the calculated polarity for the given word (child)
        """
        # lemma = self.lemmatizer.find_lemma(child.text, child.pos_)
        polarity_strength = self.checkPolarityAdjective(child, rowIdx)

        # find intensifier in children and multiply their strength to the polarity
        for c in child.children:
            if self.checkValidChild(c, ChildType.INTENSIFIER):
                polarity_strength *= self.checkForIntensifier(c, rowIdx)
        return polarity_strength

    def detectSentiment(self, rowDF: PD.Series) -> None:
        """
        Run the relevant detection functions for a single DataFrame row.

        Args:
            rowDF (PD.Series): row of the Dataframe
        """
        doc = self.nlp(" ".join(self.df_preprocessed.iloc[
            rowDF["reviewnumber"]]["tokens"][rowDF["sent_idx"]]))

        for child in doc[rowDF["word_idx"]].children:
            # if child.tag_ == "ADJA":
            if self.checkValidChild(child, ChildType.DESCRIPTOR):
                pol_strength = self.calcTotalPolarityStrength(
                    child, rowDF.name)

                self.df_aspect_tokens["polarity_strength"][rowDF.name].append(
                    pol_strength)

                self.df_aspect_tokens["sentiment_words"][rowDF.name].append(
                    child.text)
                return

        for token in doc[rowDF["word_idx"]].ancestors:
            if token.pos_ == "AUX" or token.pos_ == "VERB":
                for child in token.children:
                    if self.checkValidChild(child, ChildType.DESCRIPTOR):
                        pol_strength = self.calcTotalPolarityStrength(
                            child, rowDF.name)

                        self.df_aspect_tokens["polarity_strength"][
                            rowDF.name].append(pol_strength)

                        self.df_aspect_tokens["sentiment_words"][
                            rowDF.name].append(child.text)
                        return

    def convert_polarity(self, qualifier, polarity):
        sentiment_polarity = []
        for i, elem in enumerate(qualifier):
            if elem == "NEG":
                sentiment_polarity.append(polarity[i] * -1)
            else:
                sentiment_polarity.append(polarity[i])
        sentiment_polarity = NP.mean(NP.array(sentiment_polarity))
        return sentiment_polarity

    def createReadableOutput(self, rowDF):
        appenddict = {
            "review_number":
            rowDF["reviewnumber"],
            "sentiment":
            self.convert_polarity(rowDF["qualifier"],
                                  rowDF["polarity_strength"]),
        }

        self.overall_sentiment = self.overall_sentiment.append(
            appenddict, ignore_index=True)

    def returnSentimentsforReviews(self) -> PD.DataFrame:
        self.overall_sentiment = PD.DataFrame(
            columns=["review_text", "sentiment"])
        tqdm.pandas(desc="Calculating Sentiments")
        self.df_aspect_tokens.progress_apply(
            lambda x: self.createReadableOutput(x), axis=1)

        self.overall_sentiment = (self.overall_sentiment.groupby(
            "review_number").mean().reset_index())
        # print(self.overall_sentiment)
        self.overall_sentiment["review_text"] = self.df_preprocessed[
            "text_normalized"][self.overall_sentiment["review_number"].astype(
                int).tolist()].tolist()

        return self.overall_sentiment

    def run(self) -> bool:
        """
        run all basic functions of the detector

        Returns:
            bool: successful execution of command
        """
        if not self.loadCSVs():
            print("Couldn't load CSV's.")
            return False

        if not self.loadSpacyModel():
            return False

        true_labels = list()
        for index, row in self.df_aspect_tokens.iterrows():
            true_labels.append(self.df_preprocessed.iloc[row["reviewnumber"]][
                self.df_aspect_tokens.iloc[index]["aspect"]])
        self.df_aspect_tokens["true_label"] = true_labels

        tqdm.pandas(desc="Looking up Sentiments...")
        self.df_aspect_tokens.progress_apply(lambda x: self.detectSentiment(x),
                                             axis=1)

    def saveCSV(self, filename: str = "data_aspects_tokens.csv"):
        self.df_aspect_tokens["sentiment_words"] = self.df_aspect_tokens[
            "sentiment_words"].apply(lambda x: json.dumps(x))
        self.df_aspect_tokens.to_csv(self.path + filename, index=False)
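A minimal usage sketch for the class above, assuming the default CSV files (data_aspects_tokens.csv, data_preprocessed.csv) already exist under src/data/ and that the sentiment lexicon can be downloaded if it is missing:

detector = SentimentDetector(path="src/data/")
detector.run()                                   # load CSVs and the spaCy model, then look up sentiments
overall = detector.returnSentimentsforReviews()  # aggregate polarity per review
print(overall.head())
detector.saveCSV()                               # persist the enriched aspect-token table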
def postprocess_spans(row, cl=None):
    """
    Method for better span detection as a postprocessing step after STWR classification.

    :param row: Each row consists of a label (format:"direct_speech,2,10") and a text.
    :param cl: label of the positive class instances.
    :return: The updated label
    """
    label = row.values[0]
    # Only do postprocessing for detected instances
    if label == "":
        return label

    text = row.values[1]
    doc = NLP(text)
    tokens = [token for token in doc]
    # Get lemmata with germalemma as spacy is not good at this, only possible for pos tags N, V, ADJ, ADV
    token_lemmata = []
    lemmatizer = GermaLemma()

    for token in tokens:
        if token.pos_ == "VERB":
            token_lemmata.append(lemmatizer.find_lemma(token.text, 'V'))
        elif token.pos_ == "NOUN":
            token_lemmata.append(lemmatizer.find_lemma(token.text, 'N'))
        elif token.pos_ in ["ADJ", "ADV"]:
            token_lemmata.append(lemmatizer.find_lemma(token.text, token.pos_))
        else:
            token_lemmata.append(token.text)

    # Prepare information

    only_opening_quotes = [
        qu for qu in QUOTATION_MARKS.keys() if qu != QUOTATION_MARKS[qu]
    ]
    only_closing_quotes = [
        QUOTATION_MARKS[qu] for qu in QUOTATION_MARKS.keys()
        if qu != QUOTATION_MARKS[qu]
    ]
    # Do not treat apostrophes as possible quotation marks -> too risky
    both_quotes = [
        qu for qu in QUOTATION_MARKS.keys()
        if qu == QUOTATION_MARKS[qu] and qu != '\u0027'
    ]

    # Find quotation marks that can either be an opening or a closing quote but that don't have the same form as their counterpart
    both = [qu for qu in only_opening_quotes if qu in only_closing_quotes]
    only_opening_quotes = [qu for qu in only_opening_quotes if qu not in both]
    only_closing_quotes = [qu for qu in only_closing_quotes if qu not in both]
    both_quotes = both_quotes + both

    # Load reporting word list
    stw_words_all = pd.read_excel("data/stw_words/stw_words_brunner2015.xls")
    # Only use words with a penalty value of up to 3
    stw_words_all = stw_words_all[stw_words_all['Penalty'] <= 3]
    # Some words are only usable for reported class
    stw_words = stw_words_all[stw_words_all['Marker'] != 'rep']

    spans = []
    if cl == 'direct':

        # Search for quotation marks and try to decide whether they signify quoted STWR. Use conservative heuristics.
        for token in tokens:
            # Mark different candidates for quotation marks
            if token.text in only_opening_quotes:
                token.tag_ = "ONLY_OPENING_QUOTE"
            elif token.text in only_closing_quotes:
                token.tag_ = "ONLY_CLOSING_QUOTE"
            elif token.text in both_quotes:
                token.tag_ = "BOTH_QUOTES"

        stack = []
        for idx, token in enumerate(tokens):
            if token.tag_ == "ONLY_OPENING_QUOTE":
                stack.append((idx, token.text, token.tag_))

            elif token.tag_ in ["ONLY_CLOSING_QUOTE", "BOTH_QUOTES"]:
                # Check whether there is a matching opening quote on the stack
                found = False
                for i in range(len(stack) - 1, -1, -1):
                    top = stack[i]
                    if QUOTATION_MARKS[top[1]] == token.text:
                        found = True
                        # Closing quotes are usually preceded by sentence ending punctuation
                        if tokens[idx - 1].tag_ == '$.':
                            spans.append((top[0], idx))
                        stack = stack[:i]
                        break
                if not found:
                    # If no opening quotes were found and clear closing quotes are preceded by sentence ending punctuation,
                    # assume everything before is quoted
                    if token.tag_ == "ONLY_CLOSING_QUOTE" and idx > 0 and tokens[
                            idx - 1].tag_ == '$.':
                        spans.append((0, idx))
                    # If ambiguous quotation mark is found, decide whether it's opening or closing
                    elif token.tag_ == "BOTH_QUOTES":
                        if idx > 0 and tokens[idx - 1].tag_ == '$.':
                            spans.append((0, idx))
                        else:
                            stack.append((idx, token.text, token.tag_))

        # Check for open quotes in the stack
        if len(stack) > 0:
            # Choose first open quote in stack
            # Opening quotes are usually followed by capital letters (except continuing quotations, these are ignored here)
            opening = stack[0]
            if opening[0] < len(tokens) - 2:
                if tokens[opening[0] + 1].text.istitle():
                    spans.append((opening[0], len(tokens) - 1))

        # In case no quotation marks are there, look for colon
        if len(spans) == 0:
            for idx, token in enumerate(tokens):
                if ":" == token.text:
                    spans.append((idx, len(tokens) - 1))

    elif cl == 'indirect':

        # Following A. B.'s directions for annotating indirect representations
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)

        # Pattern 1: verbal framing phrase + dependent clause - assume max. one of these patterns per segment
        stw_verb_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if not lemma.istitle() and any(stw_words["Word"].str.contains(
                r'\b{}\b'.format(re.escape(lemma))))
        ]
        # Only use this pattern if there is a clear candidate
        if len(stw_verb_segment) == 1:
            verb = stw_verb_segment[0]
            dependent_clause = get_children(verb, exception=['sb'])

            start = None
            end = None

            for i, token in enumerate(tokens):
                if token == verb:
                    start = i
                elif token in dependent_clause:
                    if start != None:
                        end = i

            if start != None and end != None:
                spans.append((start, end))

        # Pattern 2: nominal phrase incl. modifiers + dependent clause - several of these patterns per segment are possible
        stw_noun_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if lemma.istitle() and any(stw_words["Word"].str.contains(
                r'\b{}\b'.format(re.escape(lemma))))
        ]

        for noun in stw_noun_segment:
            dependent_clause_modif = get_children(noun, exception=[])
            all_tokens = dependent_clause_modif + [noun]

            start = None
            end = None

            for i, token in enumerate(tokens):
                if token in all_tokens:
                    if start == None:
                        start = i
                    else:
                        end = i

            if start != None and end != None:
                spans.append((start, end))

        # Merge spans
        merged_spans = []
        if len(spans) > 1:
            for i, span in enumerate(spans):
                for other in spans:
                    if other == span:
                        continue
                    else:
                        if span[0] >= other[0] and span[1] <= other[1]:
                            break
                        else:
                            merged_spans.append(span)

            spans = merged_spans

    elif cl == 'free_indirect':
        # Free indirect instances are almost always complete sentences -> leave as is
        pass

    elif cl == 'reported':
        # "In principle, for reported representation the aim is to mark the whole sentence or clause that renders a speech, thought or writing act.
        # - If several distinct spoken, written or thought acts can be identified, each of them is marked separately.
        # - If a noun phrase is used together with a verb such that the whole forms a speech, thought or writing act,
        # the entire verb phrase should be marked, as with indirect representation (i.e. 'Pläne entwerfen', not just 'Pläne')."
        # Following A. B.'s directions for annotating reported representations, try to annotate the whole clause for reported instances
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)

        stw_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if any(stw_words_all["Word"].str.contains(r'\b{}\b'.format(
                re.escape(lemma))))
        ]

        for word in stw_segment:
            dependent_clause = get_children(word, exception=[])
            all_tokens = dependent_clause + [word]

            start = None
            end = None

            for i, token in enumerate(tokens):
                if token in all_tokens:
                    if start == None:
                        start = i
                    else:
                        end = i

            if start != None and end != None:
                spans.append((start, end))
        # Don't merge spans, as several different reported instances should be labeled separately following A. B.'s directions for annotating reported representations
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)

    # Get character based spans
    if len(spans) > 0:
        labels = []
        for span in spans:
            labels.append("{},{},{}".format(
                cl, tokens[span[0]].idx,
                (tokens[span[1]].idx + len(tokens[span[1]].text))))
        label = ",".join(labels)

    return label
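A usage sketch for postprocess_spans, assuming the module-level NLP pipeline, QUOTATION_MARKS, get_children and the stw_words Excel file referenced above are available. The two-column frame (label first, text second) mirrors the row layout expected by row.values; the label content is illustrative.

df = pd.DataFrame({
    "label": ["direct,0,21", ""],                       # predicted label / empty string = no instance
    "text": ['"Komm her!", rief er.', "Es regnete den ganzen Tag."],
})
df["label"] = df.apply(lambda row: postprocess_spans(row, cl="direct"), axis=1)
print(df["label"].tolist())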
Exemple #23
0
 def __init__(self, pos_prereq):
     self.pos_prereq = pos_prereq
     self.lemmatizer = GermaLemma(
         tiger_corpus=
         'resources/tiger_release_aug07.corrected.16012013.conll09')
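For reference, a small sketch of how such a GermaLemma instance is typically queried; find_lemma is used with the coarse POS codes 'N', 'V', 'ADJ' and 'ADV' throughout these examples, and building the lookup table from the raw TIGER corpus file shown above can take a while. The expected outputs in the comments are indicative only.

lemmatizer = GermaLemma(
    tiger_corpus='resources/tiger_release_aug07.corrected.16012013.conll09')
print(lemmatizer.find_lemma('Häuser', 'N'))   # e.g. 'Haus'
print(lemmatizer.find_lemma('gesagt', 'V'))   # e.g. 'sagen'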
Exemple #24
0
    def __init__(self, sequence_features=True):
        """
        :param sequence_features: If true, use the sequence features (trained on gold labels).
        """

        # Number of features
        self.num_features = 243
        # Names of features - needed for feature inspection
        self.feature_names = ["perc_pos_NNE", "perc_pos_TRUNC", "perc_pos_APPO", "perc_pos_VVPP", "perc_pos_FM",
                              "perc_pos_KOUI", "perc_pos_ITJ", "perc_pos_PTKANT", "perc_pos_$.", "perc_pos_ADJA",
                              "perc_pos_ADJD", "perc_pos_PTKNEG", "perc_pos_PWS", "perc_pos_PRF", "perc_pos_KOUS",
                              "perc_pos_PDS", "perc_pos_VMINF", "perc_pos_VVIZU", "perc_pos_PPOSS", "perc_pos_VVFIN",
                              "perc_pos_VMFIN", "perc_pos_PROAV", "perc_pos_PRELS", "perc_pos_APPR", "perc_pos_PPOSAT",
                              "perc_pos_APZR", "perc_pos_$,", "perc_pos_PIAT", "perc_pos_VMPP", "perc_pos_NE",
                              "perc_pos__SP", "perc_pos_VAPP", "perc_pos_VAIMP", "perc_pos_CARD", "perc_pos_APPRART",
                              "perc_pos_NN", "perc_pos_KOKOM", "perc_pos_PWAT", "perc_pos_PPER", "perc_pos_XY",
                              "perc_pos_ART", "perc_pos_PWAV", "perc_pos_KON", "perc_pos_PTKA", "perc_pos_VVINF",
                              "perc_pos_$(", "perc_pos_PDAT", "perc_pos_PTKZU", "perc_pos_PRELAT", "perc_pos_PIS",
                              "perc_pos_PTKVZ", "perc_pos_VAINF", "perc_pos_ADV", "perc_pos_VAFIN", "perc_pos_VVIMP",
                              "perc_pos_", "perc_pos_SCONJ", "perc_pos_SYM", "perc_pos_VERB", "perc_pos_X", "perc_pos_EOL",
                              "perc_pos_SPACE", "perc_pos_PUNCT", "perc_pos_ADJ", "perc_pos_ADP", "perc_pos_ADV",
                              "perc_pos_AUX", "perc_pos_CONJ", "perc_pos_CCONJ", "perc_pos_DET", "perc_pos_INTJ",
                              "perc_pos_NOUN", "perc_pos_NUM", "perc_pos_PART", "perc_pos_PRON", "perc_pos_PROPN",
                              "num_ents", "num_PER", "num_LOC", "num_ORG", "num_MISC", "colon", "colon_prev", "comma_end",
                              "perc_emph", "question", "open_quote", "close_quote", "in_quotes", "num_prev_in_quotes",
                              "punct_close_quote", "close_quote_comma", "perc_per1", "perc_per2", "perc_per12", "perc_per3",
                              "only_3_prev_5", "only_1_prev_5", "3_1_prev_5", "has_ind", "has_subj", "no_subj", "no_ind",
                              "has_pres", "has_past", "no_past", "no_pres", "embedded", "wuerden_inf", "wuerden",
                              "has_prep_noun_comp", "has_claus_inf_comp", "subj_cand_speaker", "num_cand_speaker",
                              "prev_subj_cand_speaker", "prev_num_cand_speaker", "has_rep_word_0", "has_rep_word_1",
                              "has_rep_word_2", "has_rep_word_3", "has_rep_word_4", "has_rep_word_5", "has_rep_word_le_1",
                              "has_rep_word_le_2", "has_rep_word_le_3", "has_rep_word_le_4", "has_rep_word_le_5", "has_rep_word_noun",
                              "has_rep_word_verb", "has_spec_rep_word_0", "has_spec_rep_word_1", "has_spec_rep_word_2",
                              "has_spec_rep_word_3", "has_spec_rep_word_4", "has_spec_rep_word_5", "has_spec_rep_word_le_1",
                              "has_spec_rep_word_le_2", "has_spec_rep_word_le_3", "has_spec_rep_word_le_4", "has_spec_rep_word_le_5",
                              "num_rep_word_0", "num_rep_word_1", "num_rep_word_2", "num_rep_word_3", "num_rep_word_4", "num_rep_word_5",
                              "num_rep_word_le_1", "num_rep_word_le_2", "num_rep_word_le_3", "num_rep_word_le_4", "num_rep_word_le_5",
                              "num_rep_word_noun", "num_rep_word_verb", "num_spec_rep_word_0", "num_spec_rep_word_1",
                              "num_spec_rep_word_2", "num_spec_rep_word_3", "num_spec_rep_word_4", "num_spec_rep_word_5",
                              "num_spec_rep_word_le_1", "num_spec_rep_word_le_2", "num_spec_rep_word_le_3", "num_spec_rep_word_le_4",
                              "num_spec_rep_word_le_5", "prev_has_rep_word_0", "prev_has_rep_word_1", "prev_has_rep_word_2",
                              "prev_has_rep_word_3", "prev_has_rep_word_4", "prev_has_rep_word_5", "prev_has_rep_word_le_1",
                              "prev_has_rep_word_le_2", "prev_has_rep_word_le_3", "prev_has_rep_word_le_4", "prev_has_rep_word_le_5",
                              "prev_has_rep_word_noun", "prev_has_rep_word_verb", "prev_has_spec_rep_word_0", "prev_has_spec_rep_word_1",
                              "prev_has_spec_rep_word_2", "prev_has_spec_rep_word_3", "prev_has_spec_rep_word_4", "prev_has_spec_rep_word_5",
                              "prev_has_spec_rep_word_le_1", "prev_has_spec_rep_word_le_2", "prev_has_spec_rep_word_le_3",
                              "prev_has_spec_rep_word_le_4", "prev_has_spec_rep_word_le_5", "prev_num_rep_word_0", "prev_num_rep_word_1",
                              "prev_num_rep_word_2", "prev_num_rep_word_3", "prev_num_rep_word_4", "prev_num_rep_word_5",
                              "prev_num_rep_word_le_1", "prev_num_rep_word_le_2", "prev_num_rep_word_le_3", "prev_num_rep_word_le_4",
                              "prev_num_rep_word_le_5", "prev_num_rep_word_noun", "prev_num_rep_word_verb", "prev_num_spec_rep_word_0",
                              "prev_num_spec_rep_word_1", "prev_num_spec_rep_word_2", "prev_num_spec_rep_word_3", "prev_num_spec_rep_word_4",
                              "prev_num_spec_rep_word_5", "prev_num_spec_rep_word_le_1", "prev_num_spec_rep_word_le_2",
                              "prev_num_spec_rep_word_le_3", "prev_num_spec_rep_word_le_4", "prev_num_spec_rep_word_le_5",
                              "max_sim", "max_sim_rep", "perc_deictic", "spec_conjunct", "perc_modal", "perc_neg",
                              "has_facial", "has_gesture", "has_voice", "repetition", "last_direct", "last_indirect", "last_free_indirect",
                              "last_reported", "last_5_direct", "last_5_indirect", "last_5_free_indirect", "last_5_reported",
                              "last_10_direct", "last_10_indirect", "last_10_free_indirect", "last_10_reported", "num_last_10_reported",
                              "len_tokens", "len_chars", "prev_len_tokens", "prev_len_chars", "sum_len_tokens", "sum_len_chars",
                              "paragraph", "prev_paragraph"]

        # Switch to turn off sequence features: the slice below drops the 13
        # 'last_*'/'num_last_10_reported' features while keeping the final 8 length/paragraph features
        self.sequence_features = sequence_features
        if not self.sequence_features:
            self.feature_names = self.feature_names[:-21] + self.feature_names[-8:]

        # Get all possible tags
        self.tag_map = sorted(NLP.vocab.morphology.tag_map.keys())
        self.pos_map = sorted(spacy.parts_of_speech.NAMES.values())
        # Set up lemmatizer
        self.lemmatizer = GermaLemma()
        # Set up RFTagger
        call(["make"], cwd="RFTagger/src")
        # Load word vectors
        print("Loading word-vectors. This may take a while ...")
        self.wordvecs = KeyedVectors.load_word2vec_format("data/word_vecs/kolimo.model", binary=True)
        print("Done.\n")
Exemple #25
0
class SentiDep:
    def __init__(self, **kwargs):
        """
            Sentiment-Analyzer for german texts.
            Get the polarity values of words depending on
            polarity values of associated descriptive words
            e.g. 'das schöne Wetter' -> polarity of 'Wetter' == polarity of 'schöne'

            Purpose: find out in which sentiment context your keywords appear in a text.
            Note: Works with spacy, nltk and germalemma
        """
        sentiws_path = kwargs.get(
            'sentiws_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/sentiws.pickle"))
        polarity_mod_path = kwargs.get(
            'polarity_modifiers_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/polarity_modifiers.pickle"))
        negations_path = kwargs.get(
            'negations_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/negationen_lexicon.pickle"))
        stts_path = kwargs.get(
            'stts_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/stts.pickle"))
        self.sentiws = pickle.load(open(sentiws_path, 'rb'))
        self.polarity_modifications = pickle.load(open(polarity_mod_path,
                                                       'rb'))
        self.negations = pickle.load(open(negations_path, 'rb'))
        self.nlp = spacy.load("de_core_news_md")
        self.germalemmatizer = GermaLemma()
        self.stts = pickle.load(open(stts_path, 'rb'))
        self.german_stops = stopwords.words('german')

    def tokenize(self, text):
        """
        Run the full spacy pipeline on a string (tokenization included).
        Input: text/string
        Output: spacy Doc
        """
        return self.nlp(text)

    def sentiws_spacy_tag_mapper(self, pos_tag, **kwargs):
        """
        Map SentiWS POS-tags to spacy POS-tags and vice versa.
        Input: pos_tag, optional: direction
               -> values: 1 (sentiws to spacy), -1 (spacy to sentiws)
               -> default: 1
        Output: python str
        """
        direction = kwargs.get('direction', 1)
        senti_map = {
            "ADJX": "ADJ",
            "ADV": "ADV",
            "NN": "NOUN",
            "VVINF": "VERB"
        }
        if direction > 0:
            return senti_map[pos_tag]
        elif direction < 0:
            return {value: key for key, value in senti_map.items()}[pos_tag]
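
    # Illustrative usage (commented out) on a SentiDep instance sd (hypothetical name);
    # tag values follow senti_map above:
    # >>> sd.sentiws_spacy_tag_mapper("ADJX")                  # -> "ADJ"
    # >>> sd.sentiws_spacy_tag_mapper("NOUN", direction=-1)    # -> "NN"
    # Note: with direction == 0 neither branch matches and None is returned.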

    def get_polarity(self, word, pos_tag):
        """
        Getter for retrieving the SentiWS polarity value of a word with a given POS-tag.
        Input: word, pos_tag
        Output: tuple(word, polarity-value, sentiws_pos_tag) or None if the word is not in SentiWS
        """
        senti_words = list(
            filter(
                lambda x: x[0] == word and self.sentiws_spacy_tag_mapper(x[2])
                == pos_tag, self.sentiws))
        if senti_words:
            senti_words = sorted(senti_words,
                                 key=lambda y: y[1]**2,
                                 reverse=True)[0]
            return senti_words
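
    # Illustrative usage (commented out) on a SentiDep instance sd; assumes self.sentiws
    # holds tuples (word, polarity, sentiws_pos_tag), e.g. ("schön", 0.2, "ADJX"):
    # >>> sd.get_polarity("schön", "ADJ")
    # ('schön', 0.2, 'ADJX')    # entry with the largest squared polarity; None if absent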

    def modify_polarity(self, child, polarity):
        """
        Apply polarity enhancers and reducers to an existing polarity tuple.
        Input: child (text of a dependent token), polarity (tuple(word, polarity-value, pos_tag))
        Output: tuple(word, modified polarity-value, pos_tag) or None if no modifier applies
        """
        senti_word = polarity
        if senti_word:
            if child in self.polarity_modifications["polarity_enhancer"]:
                return (senti_word[0], senti_word[1] * 1.5, senti_word[2])
            elif child in self.polarity_modifications["polarity_reducer"]:
                return (senti_word[0], senti_word[1] * 0.5, senti_word[2])

    def easy_switch(self, word):
        """
        Simple check whether a word matches one of the negation patterns (ignores sentence context).
        Input: token/word (str)
        Output: True/False
        """
        neg_search = [
            re.search(r'%s' % (n), word)
            for n in self.negations["negation_regex"]
        ]
        neg_search = list(filter(lambda z: z is not None, neg_search))
        return bool(neg_search)

    def add_polarities(self, list_of_polarity_tuples):
        """
        Summing up a list of polarity-tuples
        :param list_of_polarity_tuples:
        :return: polarity value -> float
        """
        all_pols = [lpt[1] for lpt in list_of_polarity_tuples]
        return sum(all_pols)

    def calc_parent_polarity(self, spacy_token, token_polarity,
                             children_polarities):
        """
        Calculating the parent polarity value depending on the children polarities
        :param spacy_token:
        :param token_polarity:
        :param children_polarities:
        :return: parent_polarity -> tuple(word, polarity, POS-tag)
        """
        if token_polarity and children_polarities:
            added_children_polarities = self.add_polarities(
                children_polarities)
            if added_children_polarities > 0:
                token_polarity = (spacy_token.text, token_polarity[1] +
                                  added_children_polarities, spacy_token.pos_)
            elif added_children_polarities < 0:
                token_polarity = (spacy_token.text,
                                  (token_polarity[1] +
                                   (-1 * added_children_polarities)) * (-1),
                                  spacy_token.pos_)
        elif not token_polarity and children_polarities:
            token_polarity = (spacy_token.text,
                              self.add_polarities(children_polarities),
                              spacy_token.pos_)
        return token_polarity
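
    # Worked example (values assumed): for token ("Wetter", 0.3, "NOUN") and children
    # polarities summing to +0.4 the result is ("Wetter", 0.7, "NOUN"); if the children
    # sum to -0.4 the magnitudes are added and the sign is flipped: (0.3 + 0.4) * (-1) = -0.7.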

    def switch_polarity(self, polarity, spacy_doc_sent):
        """
        Switching polarity value depending on negation context of whole sentence.
        Classic negations (kein, nicht, ...) are recognized, as well as
        negation stops (aber, obwohl, ...).
        :param polarity:
        :param spacy_doc_sent:
        :return: tuple(word, polarity, POS-tag, "negation: <True/False>")
        """
        negation_trigger = False
        for i, token in enumerate(spacy_doc_sent):
            for negex in self.negations['negation_regex']:
                regex = r'%s' % (negex)
                negation_search = re.search(regex, token.text, re.I)
                if negation_search:
                    negation_trigger = not negation_trigger
            if token.lower_ in self.negations['polarity_switches']:
                if token.text == '.':
                    if token.pos_ == 'PUNCT':
                        negation_trigger = not negation_trigger
                    else:
                        continue
                else:
                    negation_trigger = not negation_trigger
            if token.text == polarity[0]:
                if negation_trigger:
                    negated_polarity = (polarity[0], -polarity[1], polarity[2],
                                        "negation: " + str(negation_trigger))
                else:
                    negated_polarity = (polarity[0], polarity[1], polarity[2],
                                        "negation: " + str(negation_trigger))
                return negated_polarity
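
    # Illustrative trace (lexicon entries assumed): for "Das Essen war nicht gut" and
    # polarity ("gut", 0.4, "ADJ"), a regex hit on "nicht" toggles negation_trigger, so
    # ("gut", -0.4, "ADJ", "negation: True") is returned; a polarity switch such as
    # "aber" later in the sentence would toggle the flag back.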

    def get_depending_polarities(self, text, keywords):
        """
        Get keyword-associated polarity values for German texts.
        Polarity analysis including polarity reducers/enhancers and negations.
        :param text:
        :param keywords:
        :return: Context-polarity value of keywords -> list of tuples
        """
        spacy_doc = self.nlp(text, disable=['ner', 'textcat'])
        parent_polarities = []
        keywords = [k.lower() for k in keywords]
        for sent in spacy_doc.sents:
            for i, token in enumerate(sent):
                token_polarity = self.get_polarity(token.text, token.pos_)
                children_polarities = []
                if token.lower_ in keywords:
                    children = list(token.children)  # materialize: token.children is a generator and is iterated twice below
                    if children:
                        for child in children:
                            child_polarity = self.get_polarity(
                                child.text, child.pos_)
                            if child_polarity:
                                children_polarities.append(child_polarity)
                    parent_polarity = self.calc_parent_polarity(
                        token, token_polarity, children_polarities)
                    if parent_polarity:
                        modified_parent_polarities = []
                        for child in children:
                            modified = self.modify_polarity(child, parent_polarity)
                            # skip children that are neither enhancer nor reducer (modify_polarity returns None)
                            if modified:
                                modified_parent_polarities.append(modified)
                        added_modified_parent_polarity = None
                        if modified_parent_polarities:
                            added_modified_parent_polarity = self.add_polarities(
                                modified_parent_polarities)
                        if added_modified_parent_polarity:
                            added_modified_parent_polarity = (
                                token.text, added_modified_parent_polarity,
                                token.pos_ + "_modified")
                            parent_polarities.append(
                                self.switch_polarity(
                                    added_modified_parent_polarity, sent))
                        else:
                            parent_polarities.append(
                                self.switch_polarity(parent_polarity, sent))
        parent_polarities = [(term.lower(), t_pol, t_pos, neg)
                             for term, t_pol, t_pos, neg in parent_polarities]
        return parent_polarities
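
    # Minimal usage sketch (commented out); assumes the pickled lexica and the spacy
    # model from __init__ are available:
    # >>> sd = SentiDep()
    # >>> sd.get_depending_polarities("Das schöne Wetter hielt nicht an.", ["Wetter"])
    # [('wetter', <polarity>, 'NOUN', 'negation: ...')]    # illustrative shape only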

    def lemmatize(self, spacy_token):
        """
        Lemmatize a spacy token with GermaLemma, based on its STTS tag.
        Input: spacy token (German model)
        Output: python str (lemma, or the original text for unsupported tags)
        """
        tag = spacy_token.tag_
        if tag.startswith(('N', 'V', 'ADJ', 'ADV')) and tag in self.stts:
            return self.germalemmatizer.find_lemma(spacy_token.text, tag)
        else:
            return spacy_token.text

    def generate_topics(self, texts, num_topics=10):
        """
        Extract the most relevant noun lemmata per text via tf-idf.
        Input: texts -> list of strings, each len(text) <= 50000
        Output: dict mapping the num_topics highest-scoring terms to their tf-idf scores
        """
        tokens = [[token for token in self.tokenize(text)] for text in texts]
        tokens = [[self.lemmatize(t) for t in token if t.pos_ == 'NOUN'\
                  and t.lower_ not in self.german_stops] for token in tokens]
        docs = [" ".join(t) for t in tokens]
        cv = CountVectorizer(max_df=0.85, max_features=10000)
        word_count_vector = cv.fit_transform(docs)
        tf = TfidfTransformer(smooth_idf=True, use_idf=True)
        tf.fit(word_count_vector)
        feature_names = cv.get_feature_names()
        tf_idf_scores = []
        for doc in docs:
            cv_vector = cv.transform([doc])
            tf_idf_vector = tf.transform(cv_vector)
            sorted_items = self.sort_coo(tf_idf_vector.tocoo())
            keywords, scores = self.extract_topn_from_vector(
                feature_names, sorted_items, 10)
            tf_idf_scores += list(zip(keywords, scores))

        tfidf_topics = sorted(tf_idf_scores, key=lambda x: x[1], reverse=True)  # highest scores first
        return dict(tfidf_topics[:num_topics])
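
    # Illustrative usage (commented out; texts and scores are assumptions):
    # >>> sd.generate_topics(["Das Essen war gut.", "Die Ärzte waren freundlich."], num_topics=5)
    # {'essen': 0.707, 'arzt': 0.707}    # noun lemma -> tf-idf score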

    def sort_coo(self, coo_matrix):
        tuples = zip(coo_matrix.col, coo_matrix.data)
        return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

    def extract_topn_from_vector(self, feature_names, sorted_items, topn=10):
        sorted_items = sorted_items[:topn]
        score_vals = []
        feature_vals = []

        for idx, score in sorted_items:
            score_vals.append(round(score, 3))
            feature_vals.append(feature_names[idx])

        results = {}
        for idx in range(len(feature_vals)):
            results[feature_vals[idx]] = score_vals[idx]
        return results, score_vals

    def create_clinic_polarity_dict(self, key_list, topics):
        """
        Compute polarity scores document-wise
        :param key_list: list of polarity-scores and document-key
                         -> form: [[polarity-scores_1, document-key_1] ...]
                         -> hint: simple pandas dump with
                            df[[polarity-values, document]].values.tolist()
        :param topics: list of keywords associated with a certain topic
        :return: polarities_dict in form:
                 {document_key_1: polarities_1, ...}
        """
        polarities = {}
        clinic_counter = {}
        for rl in tqdm(key_list):
            if rl[1] not in clinic_counter:
                clinic_counter[rl[1]] = 1
            key = f'{rl[1]}_{clinic_counter[rl[1]]}'
            polarities[key] = self.get_depending_polarities(rl[0], topics)
            clinic_counter[rl[1]] += 1
        return polarities
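
    # Illustrative usage (commented out, rows assumed): each row pairs a review text
    # with a clinic key; the result keys are numbered per clinic:
    # >>> key_list = [["Das Essen war gut.", "Klinik_A"], ["Sehr laut.", "Klinik_A"]]
    # >>> polarities = sd.create_clinic_polarity_dict(key_list, ["essen", "lärm"])
    # {'Klinik_A_1': [...], 'Klinik_A_2': [...]}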

    def create_polarity_df(self, polarities, topics):
        """
        Transforms polarity-scores from 'create_clinic_polarity_dict' output
        to a formatted pandas dataframe
        :param polarities: polarities-dict (output from 'create_clinic_polarity_dict')
        :param topics: list of keywords associated with a certain topic
        :return: polarity_df (formatted pandas dataframe) of form:
                 columns: keywords/topics
                 rows: document-keys
                 values: float(polarity-scores) or np.nan
        """
        filtered_polarities = [(clinic, polarity)
                               for clinic, polarity in polarities.items()
                               if polarity]
        columns = {t: [] for t in topics}
        ids = {"Klinik": []}
        for clinic, polarity in tqdm(filtered_polarities):
            ids["Klinik"].append(clinic)
            row = {t: [] for t in topics}
            for pol in polarity:
                row[pol[0].lower()] = pol[1]
            for word, p in row.items():
                if not p:
                    columns[word].append(np.nan)
                else:
                    columns[word].append(p)
        for key, value in columns.items():
            if len(value) != len(ids["Klinik"]):
                raise ValueError("Values in dict must have same length!")

        polarity_df = pd.DataFrame(data=columns, index=ids["Klinik"])
        return polarity_df
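
    # Illustrative usage (commented out), continuing the sketch above; values assumed:
    # >>> pol_df = sd.create_polarity_df(polarities, ["essen", "lärm"])
    # >>> pol_df
    #                 essen  lärm
    # Klinik_A_1        0.4   NaN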

    '''