Example #1
    def annotate_stw(self, t, clf_class, majority_classes=None):
        """
        Method for annotating a segment with one of the classes speech, thought or writing given
        the STWR classification clf_class.

        :param t: The text of the segment.
        :param clf_class: One of direct, indirect, free_indirect, reported. The predicted class for t.
        :param majority_classes: A dictionary containing the majority classes (one of speech, thought or writing)
                                for each STWR class.
        :return: One of speech, thought or writing; the annotation for t.
        """
        # Get the stored majority classes if no other are given
        if not majority_classes:
            majority_classes = self.majority_classes

        # Direct and free_indirect should always be classified by majority classes as reporting words are more
        # likely to appear outside of segments of these classes.
        if clf_class in ['direct', 'free_indirect']:
            return majority_classes[clf_class]

        # For the other types check for reporting words with unambiguous type else use majority class
        doc = NLP(t)
        # Get lemmata with germalemma as spacy is not good at this
        lemmatizer = GermaLemma()

        lemmata = []
        for token in doc:
            if token.pos_ == "VERB":
                lemmata.append(lemmatizer.find_lemma(token.text, 'V'))

            elif token.pos_ == "NOUN":
                lemmata.append(lemmatizer.find_lemma(token.text, 'N'))

        if len(lemmata) > 0:
            stw_words_t = pd.concat([
                self.stw_words[self.stw_words["Word"].str.contains(
                    r'\b{}\b'.format(re.escape(lemma)))] for lemma in lemmata
            ],
                                    axis=0,
                                    ignore_index=True)
        else:
            stw_words_t = []

        if len(stw_words_t) == 1:
            if stw_words_t["Type"][0] in ["speech", "thought", "writing"]:
                return stw_words_t["Type"][0]
            else:
                return majority_classes[clf_class]

        else:
            return majority_classes[clf_class]
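A minimal standalone sketch of the lemma lookup that annotate_stw builds on. The spaCy model name, the toy stw_words table and the lookup_stw_type helper are illustrative assumptions; the GermaLemma calls mirror the method above.

import re
import pandas as pd
import spacy
from germalemma import GermaLemma

NLP = spacy.load("de_core_news_sm")  # assumption: any German spaCy model works here
lemmatizer = GermaLemma()

# Toy stand-in for the reporting-word table used by annotate_stw
stw_words = pd.DataFrame({"Word": ["sagen", "denken"], "Type": ["speech", "thought"]})

def lookup_stw_type(segment):
    # Lemmatize verbs and nouns with GermaLemma, then match them against the word list
    lemmata = []
    for token in NLP(segment):
        if token.pos_ == "VERB":
            lemmata.append(lemmatizer.find_lemma(token.text, "V"))
        elif token.pos_ == "NOUN":
            lemmata.append(lemmatizer.find_lemma(token.text, "N"))
    if not lemmata:
        return None
    hits = pd.concat(
        [stw_words[stw_words["Word"].str.contains(r"\b{}\b".format(re.escape(lemma)))]
         for lemma in lemmata],
        ignore_index=True)
    # Only return a type if the match is unambiguous, as in annotate_stw
    return hits["Type"].iloc[0] if len(hits) == 1 else None

print(lookup_stw_type("Er dachte lange nach."))  # likely "thought", if "dachte" is lemmatized to "denken"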
Example #2
def lemmatize_tokens(tokens):
    lemmatizer = GermaLemma()
    new_tokens = {}
    for doc_label, tok_pos in tokens.items():
        lemmata_pos = []
        for t, pos in tok_pos:
            try:
                l = lemmatizer.find_lemma(t, pos)
            except ValueError:
                l = t
            lemmata_pos.append((l, pos))
        new_tokens[doc_label] = lemmata_pos

    return new_tokens
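A small usage sketch for lemmatize_tokens, assuming the input is a dict mapping document labels to lists of (token, POS) pairs; POS tags GermaLemma cannot handle fall back to the raw token via the ValueError branch.

from germalemma import GermaLemma

tokens = {
    "doc1": [("Häuser", "N"), ("liefen", "V"), ("schnell", "ADV")],
    "doc2": [("und", "KON")],  # unsupported POS -> ValueError -> token kept as is
}
print(lemmatize_tokens(tokens))
# e.g. {'doc1': [('Haus', 'N'), ('laufen', 'V'), ('schnell', 'ADV')], 'doc2': [('und', 'KON')]}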
Example #3
class GermaLemma(PipelineModule):
    def __init__(self, pos_prereq):
        self.pos_prereq = pos_prereq
        # Assumption: the library is available as "import germalemma"; a qualified name
        # is used because this pipeline class shadows the library name GermaLemma and
        # would otherwise try to instantiate itself.
        self.lemmatizer = germalemma.GermaLemma(
            tiger_corpus='resources/tiger_release_aug07.corrected.16012013.conll09')

    def targets(self):
        return {'lemma-germalemma'}

    def prerequisites(self):
        return {'token', self.pos_prereq}

    def make(self, prerequisite_data):
        tokens = prerequisite_data['token']
        pos = prerequisite_data[self.pos_prereq]

        pattern1 = re.compile("^[NV]")
        pattern2 = re.compile("^(ADJ|ADV)")

        def lemmatize_token(t, postag):
            try:
                if pattern1.match(postag):
                    return self.lemmatizer.find_lemma(t, postag)
                elif pattern2.match(postag):
                    return self.lemmatizer.find_lemma(t, postag[:3])
                else:
                    return 0
            except Exception as e:
                sys.stderr.write(
                    f"Lemmatizing {t} ({postag}) raised exception: {e}\n")
                return 0

        return {
            'lemma-germalemma':
            list(map(lambda x: lemmatize_token(x[0], x[1]), zip(tokens, pos)))
        }
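A hypothetical driver for the pipeline module above, assuming the TIGER corpus file referenced in __init__ is available and that the 'pos' prerequisite delivers STTS tags; the prerequisite_data dict is a made-up stand-in.

# Hypothetical usage of the pipeline module defined above.
module = GermaLemma(pos_prereq='pos')  # builds the lemmatizer from the TIGER corpus file

prerequisite_data = {
    'token': ['Die', 'Kinder', 'spielten', 'draußen'],
    'pos':   ['ART', 'NN', 'VVFIN', 'ADV'],
}
print(module.make(prerequisite_data)['lemma-germalemma'])
# e.g. [0, 'Kind', 'spielen', 'draußen'] - tags outside N/V/ADJ/ADV map to 0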
Example #4
class STWRFeatureExtractor(object):
    """
    Feature extractor for classifying STWR.
    """

    def __init__(self, sequence_features=True):
        """
        :param sequence_features: If true, use the sequence features (trained on gold labels).
        """

        # Number of features
        self.num_features = 243
        # Names of features - needed for feature inspection
        self.feature_names = ["perc_pos_NNE", "perc_pos_TRUNC", "perc_pos_APPO", "perc_pos_VVPP", "perc_pos_FM",
                              "perc_pos_KOUI", "perc_pos_ITJ", "perc_pos_PTKANT", "perc_pos_$.", "perc_pos_ADJA",
                              "perc_pos_ADJD", "perc_pos_PTKNEG", "perc_pos_PWS", "perc_pos_PRF", "perc_pos_KOUS",
                              "perc_pos_PDS", "perc_pos_VMINF", "perc_pos_VVIZU", "perc_pos_PPOSS", "perc_pos_VVFIN",
                              "perc_pos_VMFIN", "perc_pos_PROAV", "perc_pos_PRELS", "perc_pos_APPR", "perc_pos_PPOSAT",
                              "perc_pos_APZR", "perc_pos_$,", "perc_pos_PIAT", "perc_pos_VMPP", "perc_pos_NE",
                              "perc_pos__SP", "perc_pos_VAPP", "perc_pos_VAIMP", "perc_pos_CARD", "perc_pos_APPRART",
                              "perc_pos_NN", "perc_pos_KOKOM", "perc_pos_PWAT", "perc_pos_PPER", "perc_pos_XY",
                              "perc_pos_ART", "perc_pos_PWAV", "perc_pos_KON", "perc_pos_PTKA", "perc_pos_VVINF",
                              "perc_pos_$(", "perc_pos_PDAT", "perc_pos_PTKZU", "perc_pos_PRELAT", "perc_pos_PIS",
                              "perc_pos_PTKVZ", "perc_pos_VAINF", "perc_pos_ADV", "perc_pos_VAFIN", "perc_pos_VVIMP",
                              "perc_pos_", "perc_pos_SCONJ", "perc_pos_SYM", "perc_pos_VERB", "perc_pos_X", "perc_pos_EOL",
                              "perc_pos_SPACE", "perc_pos_PUNCT", "perc_pos_ADJ", "perc_pos_ADP", "perc_pos_ADV",
                              "perc_pos_AUX", "perc_pos_CONJ", "perc_pos_CCONJ", "perc_pos_DET", "perc_pos_INTJ",
                              "perc_pos_NOUN", "perc_pos_NUM", "perc_pos_PART", "perc_pos_PRON", "perc_pos_PROPN",
                              "num_ents", "num_PER", "num_LOC", "num_ORG", "num_MISC", "colon", "colon_prev", "comma_end",
                              "perc_emph", "question", "open_quote", "close_quote", "in_quotes", "num_prev_in_quotes",
                              "punct_close_quote", "close_quote_comma", "perc_per1", "perc_per2", "perc_per12", "perc_per3",
                              "only_3_prev_5", "only_1_prev_5", "3_1_prev_5", "has_ind", "has_subj", "no_subj", "no_ind",
                              "has_pres", "has_past", "no_past", "no_pres", "embedded", "wuerden_inf", "wuerden",
                              "has_prep_noun_comp", "has_claus_inf_comp", "subj_cand_speaker", "num_cand_speaker",
                              "prev_subj_cand_speaker", "prev_num_cand_speaker", "has_rep_word_0", "has_rep_word_1",
                              "has_rep_word_2", "has_rep_word_3", "has_rep_word_4", "has_rep_word_5", "has_rep_word_le_1",
                              "has_rep_word_le_2", "has_rep_word_le_3", "has_rep_word_le_4", "has_rep_word_le_5", "has_rep_word_noun",
                              "has_rep_word_verb", "has_spec_rep_word_0", "has_spec_rep_word_1", "has_spec_rep_word_2",
                              "has_spec_rep_word_3", "has_spec_rep_word_4", "has_spec_rep_word_5", "has_spec_rep_word_le_1",
                              "has_spec_rep_word_le_2", "has_spec_rep_word_le_3", "has_spec_rep_word_le_4", "has_spec_rep_word_le_5",
                              "num_rep_word_0", "num_rep_word_1", "num_rep_word_2", "num_rep_word_3", "num_rep_word_4", "num_rep_word_5",
                              "num_rep_word_le_1", "num_rep_word_le_2", "num_rep_word_le_3", "num_rep_word_le_4", "num_rep_word_le_5",
                              "num_rep_word_noun", "num_rep_word_verb", "num_spec_rep_word_0", "num_spec_rep_word_1",
                              "num_spec_rep_word_2", "num_spec_rep_word_3", "num_spec_rep_word_4", "num_spec_rep_word_5",
                              "num_spec_rep_word_le_1", "num_spec_rep_word_le_2", "num_spec_rep_word_le_3", "num_spec_rep_word_le_4",
                              "num_spec_rep_word_le_5", "prev_has_rep_word_0", "prev_has_rep_word_1", "prev_has_rep_word_2",
                              "prev_has_rep_word_3", "prev_has_rep_word_4", "prev_has_rep_word_5", "prev_has_rep_word_le_1",
                              "prev_has_rep_word_le_2", "prev_has_rep_word_le_3", "prev_has_rep_word_le_4", "prev_has_rep_word_le_5",
                              "prev_has_rep_word_noun", "prev_has_rep_word_verb", "prev_has_spec_rep_word_0", "prev_has_spec_rep_word_1",
                              "prev_has_spec_rep_word_2", "prev_has_spec_rep_word_3", "prev_has_spec_rep_word_4", "prev_has_spec_rep_word_5",
                              "prev_has_spec_rep_word_le_1", "prev_has_spec_rep_word_le_2", "prev_has_spec_rep_word_le_3",
                              "prev_has_spec_rep_word_le_4", "prev_has_spec_rep_word_le_5", "prev_num_rep_word_0", "prev_num_rep_word_1",
                              "prev_num_rep_word_2", "prev_num_rep_word_3", "prev_num_rep_word_4", "prev_num_rep_word_5",
                              "prev_num_rep_word_le_1", "prev_num_rep_word_le_2", "prev_num_rep_word_le_3", "prev_num_rep_word_le_4",
                              "prev_num_rep_word_le_5", "prev_num_rep_word_noun", "prev_num_rep_word_verb", "prev_num_spec_rep_word_0",
                              "prev_num_spec_rep_word_1", "prev_num_spec_rep_word_2", "prev_num_spec_rep_word_3", "prev_num_spec_rep_word_4",
                              "prev_num_spec_rep_word_5", "prev_num_spec_rep_word_le_1", "prev_num_spec_rep_word_le_2",
                              "prev_num_spec_rep_word_le_3", "prev_num_spec_rep_word_le_4", "prev_num_spec_rep_word_le_5",
                              "max_sim", "max_sim_rep", "perc_deictic", "spec_conjunct", "perc_modal", "perc_neg",
                              "has_facial", "has_gesture", "has_voice", "repetition", "last_direct", "last_indirect", "last_free_indirect",
                              "last_reported", "last_5_direct", "last_5_indirect", "last_5_free_indirect", "last_5_reported",
                              "last_10_direct", "last_10_indirect", "last_10_free_indirect", "last_10_reported", "num_last_10_reported",
                              "len_tokens", "len_chars", "prev_len_tokens", "prev_len_chars", "sum_len_tokens", "sum_len_chars",
                              "paragraph", "prev_paragraph"]

        # Switch to turn off sequence features
        self.sequence_features = sequence_features
        if not self.sequence_features:
            self.feature_names = self.feature_names[:-21] + self.feature_names[-8:]

        # Get all possible tags
        self.tag_map = sorted(NLP.vocab.morphology.tag_map.keys())
        self.pos_map = sorted(spacy.parts_of_speech.NAMES.values())
        # Set up lemmatizer
        self.lemmatizer = GermaLemma()
        # Set up RFTagger
        call(["make"], cwd="RFTagger/src")
        # Load word vectors
        print("Loading word-vectors. This may take a while ...")
        self.wordvecs = KeyedVectors.load_word2vec_format("data/word_vecs/kolimo.model", binary=True)
        print("Done.\n")

    def transform(self, text, original_text=None, backlog=None):
        """
        Method that transforms the given segments into their feature representation.
        Expects a dataframe with a ["text"] column, or a list of spaCy tokens together with the original text as a string.

        :param text: dataframe with column ["text"] that contains the string segments or list of spacy tokens.
        :param original_text: the original text as string is passed in test mode.
        :param backlog: For test mode, the backlog stores info and labels of former segments and
                        therefore has to be passed back and forth between classifier and feature extractor.
        :return: The transformed segments as pandas Dataframe or list, depending on the type of 'text'
        """

        # If the backlog has not been initialized, initialize it
        if not backlog:
            backlog = ["" for i in range(10)] + [0 for i in range(64)]

        # If spacy tokenization and quote annotation has not been performed, do it now
        if type(text) == list:
            tokens = text

        elif type(text) == pd.DataFrame:
            # Get full text for better results in spacy parsing
            full_text = " ".join(text['text'].values)

            doc = NLP(full_text)
            # Exchange tags for quotation marks for special tokens: #OPEN_QUOTE#, #CLOSE_QUOTE#
            doc = annotate_quotes(doc)
            tokens_full_text = [token for token in doc]

        # Transform individual segments
        if type(text) == list:
            return self.transform_segment(tokens, backlog, original_text)

        else:
            output = pd.DataFrame()
            print("Extracting features...")
            for ind, row in text.iterrows():
                # print progress bar
                sys.stdout.write('\r')
                # draw a 20-step progress bar with the current percentage
                sys.stdout.write("[%-20s] %d%%" % ('=' * round(ind/(len(text)/20)), round(ind/(len(text)/100))))
                sys.stdout.flush()

                # Get the tokens corresponding to the segment:
                tokens_text = string_tokenize(row['text'])
                tokens = tokens_full_text[:len(tokens_text)]

                # Check that this is correct
                assert tokens_text[-1] == tokens[-1].text
                tokens_full_text = tokens_full_text[len(tokens_text):]

                transformed, backlog = self.transform_segment(tokens, backlog, row['text'])
                output = output.append(pd.Series(transformed), ignore_index = True)

                # Adapt backlog: backlog stores last ten classifications in the first ten positions
                backlog[0:10] = backlog[1:10] + [row['labels_spans']]

            return output, backlog

    def transform_segment(self, tokens, backlog, original_text):
        """
        Transforms an individual segment of tokens, given the information in the backlog,
        into a feature representation.

        :param tokens: list of spacy tokens
        :param backlog: list containing information about the labels and other features of previous segments
        :param original_text: The original text as string
        :return: the feature representation and the updated backlog
        """

        # --- Preprocessing ---
        transformed = []
        token_strings = [token.text for token in tokens]
        # Get lemmata with germalemma as spacy is not good at this, only possible for pos tags N, V, ADJ, ADV
        token_lemmata = []
        for token in tokens:
            if token.pos_ == "VERB":
                token_lemmata.append(self.lemmatizer.find_lemma(token.text, 'V'))
            elif token.pos_ == "NOUN":
                token_lemmata.append(self.lemmatizer.find_lemma(token.text, 'N'))
            elif token.pos_ in ["ADJ", "ADV"]:
                token_lemmata.append(self.lemmatizer.find_lemma(token.text, token.pos_))
            else:
                token_lemmata.append(token.text)

        # Load reporting word list
        stw_words_orig = pd.read_excel("data/stw_words/stw_words_brunner2015.xls")
        # Some words are only usable for reported class
        stw_words_rep = stw_words_orig[stw_words_orig['Marker'] == 'rep']
        stw_words = stw_words_orig[stw_words_orig['Marker'] != 'rep']

        # Do deeper morphological analysis with RFTagger
        file = open("RFTagger/temp.txt", "w")
        file.write("\n".join(token_strings))
        file.close()
        morph_tagged = check_output(["src/rft-annotate", "lib/german.par", "temp.txt"], cwd="RFTagger", stderr=FNULL).decode(
            "utf-8").split("\n")
        # Split morph tags into attributes
        morph_tagged = [morph_tag.split("\t")[1].split(".") if morph_tag != "" else morph_tag for morph_tag in morph_tagged]

        # --- Pos tag features ---
        tags = [token.tag_ for token in tokens]
        pos = [token.pos_ for token in tokens]
        transformed += [(tags.count(tag)/len(tags)) if tag in tags else 0 for tag in self.tag_map]
        transformed += [(pos.count(p) / len(pos)) if p in pos else 0 for p in self.pos_map]

        # --- NE features ---
        doc = NLP(original_text)
        transformed.append(len(doc.ents))
        for ne_type in NE_TYPES:
            transformed.append(int(len([ent for ent in doc.ents if ent.label_ == ne_type]) > 0))

        # --- Special token features ---
        # Colon in this or in previous segment?
        colon_this = int(":" in token_strings)
        transformed.append(colon_this)
        transformed.append(backlog[10])
        # Comma at the end of this segment means that the next segment is an embedded sentence if it has a verb
        comma_end = int(tags[-1] == '$,')
        transformed.append(comma_end)

        # Percentage of 'emphatic' punctuation marks: ?,!,-,–
        transformed.append((token_strings.count('?') + token_strings.count('!') + token_strings.count('-') + token_strings.count('–'))/len(token_strings))
        # Question?
        transformed.append(int((token_strings.count('?') > 0)))

        # Quotes features
        # Opening Quotes in this segment?
        open_quote = len([tag for tag in tags if tag == "#OPEN_QUOTE#"])
        # Closing Quotes in this segment?
        close_quote = len([tag for tag in tags if tag == "#CLOSE_QUOTE#"])
        # In quotes?
        in_quotes = int(backlog[11] > 0 or open_quote > 0)
        transformed.append(open_quote)
        transformed.append(close_quote)
        transformed.append(in_quotes)
        # How many contiguous prev. segments have been in quotes so far? This is meant to tackle errors bc of missing closing quotes
        # as well as marking sequences of embedded narration
        transformed.append(backlog[49])

        # Special combinations direct - full quoted sentence (sent. ending punct. before closing quotes),
        # comma after closing quotes (prob. frame of direct speech)
        transformed.append(int(len([tag for i, tag in enumerate(tags) if tag == "#CLOSE_QUOTE#" and i > 0 and tags[i-1] == "$."]) > 0))
        transformed.append(int((backlog[12] == 1 and token_strings[0] == ",") or (len([tag for i, tag in enumerate(tags) if tag == "#CLOSE_QUOTE#" and i < len(token_strings)-1 and token_strings[i+1] == ","]) > 0)))

        # --- Morphological Features ---
        # percentage of first and second person pronouns (personal, possessive, reflexive)
        per1 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and
                 morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '1']
        per2 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and
                 morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '2']
        per12 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and
                 morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] in ['1', '2']]
        transformed.append(len(per1) / len(token_strings))
        # Second person might be a better feature than 1. and 2. together as it is seldom the perspective of a narrative
        transformed.append(len(per2) / len(token_strings))
        transformed.append(len(per12)/len(token_strings))
        # percentage of third person pronouns (personal, possessive, reflexive)
        per3 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and
                 morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '3']
        transformed.append(len(per3) / len(token_strings))

        # Note changes in the usage of person; this might help to distinguish between third and first person perspective narratives
        # Only third person in prev. five segments?
        transformed.append(int(len([b for b in backlog[43:48] if b == '3']) > 0 and len([b for b in backlog[43:48] if b in ['1', '3_1']]) == 0))
        # Only first person in prev. five segments?
        transformed.append(int(len([b for b in backlog[43:48] if b == '1']) > 0 and len([b for b in backlog[43:48] if b in ['3', '3_1']]) == 0))
        # Mixed first and third person in prev. five segments
        transformed.append(int(len([b for b in backlog[43:48] if b == '3_1']) > 0 or (len([b for b in backlog[43:48] if b == '3']) > 0 and len([b for b in backlog[43:48] if b == '1']) > 0)))

        # tempus and modus features
        has_ind = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and
                           morph_tag[5] == 'Ind']) > 0)
        has_subj = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and
                           morph_tag[5] == 'Subj']) > 0)
        no_subj = int(not any([morph_tag[5] == 'Subj' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']))
        no_ind = int(not any([morph_tag[5] == 'Ind' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']))
        has_pres = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and
                           morph_tag[4] == 'Pres']) > 0)
        has_past = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and
                           morph_tag[4] == 'Past']) > 0)
        no_past = int(not any([morph_tag[4] == 'Past' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']))
        no_pres = int(not any([morph_tag[4] == 'Pres' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']))
        for feature in [has_ind, has_subj, no_subj, no_ind, has_pres, has_past, no_past, no_pres]:
            transformed.append(feature)

        # --- Grammatical features ---
        # Comma at the end of the prev. segment means that this segment is an embedded sentence if it has a verb
        if backlog[13] and any([tag in ['VVFIN', 'VAFIN', 'VMFIN'] for tag in tags]):
            transformed.append(1)
        else:
            transformed.append(0)
        # A form of verb 'würden' + infinitive can be a pointer towards free indirect
        transformed.append(int(any([lemma == 'würden' for lemma in token_lemmata])
                               and any(
            [(tag in ['VAINF', 'VMINF', 'VVINF', 'VVIZU'] and token_lemmata[i] != 'würden') for i, tag in
             enumerate(tags)])))
        transformed.append(int(any([lemma == 'würden' for lemma in token_lemmata])))

        # Noun/prepositional complements of a rep. word point toward reported STW,
        # sentence/infinitive complements point towards indirect STW
        all_stw_words = [token for i,token in enumerate(tokens) if any(stw_words_orig["Word"].str.contains(r'\b{}\b'.format(re.escape(token_lemmata[i]))))]
        has_prep_noun_comp = int(len([rep_word for rep_word in all_stw_words if len([child for child in rep_word.children if child.pos_ in ['ADP', 'PROPN', 'NOUN'] and child.dep_.startswith('o')]) > 0]) > 0)
        has_claus_inf_comp = int(len([rep_word for rep_word in all_stw_words if len([child for child in rep_word.children if child.dep_ == 'oc']) > 0]) > 0)
        transformed.append(has_prep_noun_comp)
        transformed.append(has_claus_inf_comp)

        # --- Possible speaker features ---
        # Is subject a pronoun, a person NE or a "Person" head noun -> possible speaker
        cand_speakers = [tokens[i] for i, tag in enumerate(tags) if (tag in ['PPER', 'PIS', 'PDS'] or (tag in ['NE', 'NNE'] and 'PER' in [ent.label_ for ent in doc.ents if ent.start_char <= tokens[i].idx < ent.end_char]))]

        # Check whether any noun phrase has a head that is a synset of "Person" in Germanet
        person = []
        with open('data/person.txt', 'r', encoding='utf-8') as f:
            for l in f:
                person.append(l.strip())

        for np in doc.noun_chunks:
            if np.root.text in person:
                cand_speakers.append(np.root)

        subj_cand_speaker = [token for token in cand_speakers if token.dep_ == 'sb']
        # How many possible speakers/addressees are there in relation to the segment length?
        num_cand_speaker = len(cand_speakers)/len(tokens)
        transformed.append(int(len(subj_cand_speaker) > 0))
        transformed.append(num_cand_speaker)
        # Append prev. segments candidate speaker features
        transformed.append(backlog[38])
        transformed.append(backlog[39])

        # --- Reporting word features ---
        # Appearance of reporting word by penalty
        has_rep_word_0 = int(any([stw_words[stw_words["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_1 = int(any([stw_words[stw_words["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_2 = int(any([stw_words[stw_words["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_3 = int(any([stw_words[stw_words["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_4 = int(any([stw_words[stw_words["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_5 = int(any([stw_words[stw_words["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))

        # Appearance of reporting word lower or equal a certain penalty
        has_rep_word_le_1 = int(any([stw_words[stw_words["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_le_2 = int(any([stw_words[stw_words["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_le_3 = int(any([stw_words[stw_words["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_le_4 = int(any([stw_words[stw_words["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_rep_word_le_5 = int(any([stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        # Appearance of noun/verb reporting word -> this might be interesting to differentiate 'reported' from 'direct/'indirect'
        has_rep_word_noun = int(any([(len(stw_words[(stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma)))) & (stw_words[stw_words["Penalty"] <= 5]["Word"].str.istitle())]) > 0) for lemma in token_lemmata]))
        has_rep_word_verb = int(any([(len(stw_words[(stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma)))) & (stw_words[stw_words["Penalty"] <= 5]["Word"].str.islower())]) > 0) for lemma in token_lemmata]))
        for feature in [has_rep_word_0, has_rep_word_1, has_rep_word_2, has_rep_word_3, has_rep_word_4, has_rep_word_5,
                        has_rep_word_le_1, has_rep_word_le_2, has_rep_word_le_3, has_rep_word_le_4, has_rep_word_le_5,
                        has_rep_word_noun, has_rep_word_verb]:
            transformed.append(feature)

        # Appearance of special reporting words for reported class by penalty
        has_spec_rep_word_0 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_1 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_2 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_3 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_4 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_5 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))

        # Appearance of special reporting words lower or equal a certain penalty
        has_spec_rep_word_le_1 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_le_2 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_le_3 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_le_4 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        has_spec_rep_word_le_5 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]))
        
        for feature in [has_spec_rep_word_0, has_spec_rep_word_1, has_spec_rep_word_2, has_spec_rep_word_3, has_spec_rep_word_4,
                        has_spec_rep_word_5,
                        has_spec_rep_word_le_1, has_spec_rep_word_le_2, has_spec_rep_word_le_3, has_spec_rep_word_le_4,
                        has_spec_rep_word_le_5]:
            transformed.append(feature)

        # Number of reporting word by penalty
        num_rep_word_0 = sum([stw_words[stw_words["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_1 = sum([stw_words[stw_words["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_2 = sum([stw_words[stw_words["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_3 = sum([stw_words[stw_words["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_4 = sum([stw_words[stw_words["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_5 = sum([stw_words[stw_words["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])

        # Number of reporting word lower or equal a certain penalty
        num_rep_word_le_1 = sum([stw_words[stw_words["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_le_2 = sum([stw_words[stw_words["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_le_3 = sum([stw_words[stw_words["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_le_4 = sum([stw_words[stw_words["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_rep_word_le_5 = sum([stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        # Number of noun/verb reporting word -> this might be interesting to differentiate 'reported' from 'direct/'indirect'
        num_rep_word_noun = sum([(len(stw_words[(stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma)))) & (stw_words[stw_words["Penalty"] <= 5]["Word"].str.istitle())]) > 0) for lemma in token_lemmata])
        num_rep_word_verb = sum([(len(stw_words[(stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma)))) & (stw_words[stw_words["Penalty"] <= 5]["Word"].str.islower())]) > 0) for lemma in token_lemmata])
        for feature in [num_rep_word_0, num_rep_word_1, num_rep_word_2, num_rep_word_3, num_rep_word_4,
                        num_rep_word_5,
                        num_rep_word_le_1, num_rep_word_le_2, num_rep_word_le_3, num_rep_word_le_4,
                        num_rep_word_le_5,
                        num_rep_word_noun, num_rep_word_verb]:
            transformed.append(feature)

        # Number of special reporting words for reported class by penalty
        num_spec_rep_word_0 = sum([stw_words_rep[stw_words_rep["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_1 = sum([stw_words_rep[stw_words_rep["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_2 = sum([stw_words_rep[stw_words_rep["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_3 = sum([stw_words_rep[stw_words_rep["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_4 = sum([stw_words_rep[stw_words_rep["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_5 = sum([stw_words_rep[stw_words_rep["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])

        # Number of special reporting words lower or equal a certain penalty
        num_spec_rep_word_le_1 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_le_2 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_le_3 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_le_4 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])
        num_spec_rep_word_le_5 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])

        for feature in [num_spec_rep_word_0, num_spec_rep_word_1, num_spec_rep_word_2, num_spec_rep_word_3,
                        num_spec_rep_word_4,
                        num_spec_rep_word_5,
                        num_spec_rep_word_le_1, num_spec_rep_word_le_2, num_spec_rep_word_le_3,
                        num_spec_rep_word_le_4,
                        num_spec_rep_word_le_5]:
            transformed.append(feature)
            
        # Reporting word features prev. segment
        for feature in backlog[14:38]:
            transformed.append(feature)
        for feature in backlog[50:74]:
            transformed.append(feature)

        # Word vectors
        # Get prototypical word vector for reporting words
        proto_rep_vec = numpy.average([self.wordvecs[word] for word in stw_words[stw_words["Penalty"] == 0]["Word"] if word in self.wordvecs], axis=0)
        # Get prototypical word vector for reported class
        proto_rep_vec_reporting = numpy.average([self.wordvecs[word] for word in stw_words_rep[stw_words_rep["Penalty"] == 0]["Word"] if word in self.wordvecs], axis=0)
        # Append highest similarity values to proto word vectors within the segment
        max_sim = .0
        max_sim_rep = .0
        for lemma in token_lemmata:
            if lemma in self.wordvecs:
                lemma_vec = self.wordvecs[lemma]
                # cosine similarity = 1 - cosine distance
                sim = 1 - distance.cosine(lemma_vec, proto_rep_vec)
                sim_rep = 1 - distance.cosine(lemma_vec, proto_rep_vec_reporting)

                if sim > max_sim:
                    max_sim = sim
                if sim_rep > max_sim_rep:
                    max_sim_rep = sim_rep

        transformed.append(max_sim)
        transformed.append(max_sim_rep)

        # --- Other word features ---
        # Usage of deictic words can point to character speech - percentage of deictic words
        transformed.append(len([t for t in token_strings if t in DEICTIC])/len(token_strings))
        # Usage of special conjunction at the beginning of the segment can point to indirect
        transformed.append(int(token_strings[0] in CONJUNCT))
        # Usage of modal particles can point towards character speech
        transformed.append(len([t for t in token_strings if t in MODAL_PART])/len(token_strings))
        # Negation?
        transformed.append(len([lemma for lemma in token_lemmata if lemma in NEG])/len(token_strings))

        # Words describing facial expressions, gestures, voice might hint towards STWR
        transformed.append(int(len([lemma for lemma in token_lemmata if lemma in FACIAL]) > 0))
        transformed.append(int(len([lemma for lemma in token_lemmata if lemma in GESTURE]) > 0))
        transformed.append(int(len([lemma for lemma in token_lemmata if lemma in VOICE]) > 0))

        # The repetition of words can hint towards figural speech
        transformed.append(int(any([count >= 2 for count in [token_lemmata.count(el) for el in token_lemmata]])))

        # --- Sequential features ---

        if self.sequence_features:

            # Labels of prev. segment
            labels_last = [l for i,l in enumerate(backlog[9].split(",")) if i%3==0]
            transformed.append(int(any([l.startswith('direct') for l in labels_last])))
            transformed.append(int(any([l.startswith('indirect') for l in labels_last])))
            transformed.append(int(any([l.startswith('free_indirect') for l in labels_last])))
            transformed.append(int(any([l.startswith('reported') for l in labels_last])))
            # Label appears in 5 prev. segments
            labels_last_5 = [fin_l for ls in [[l for i, l in enumerate(label.split(",")) if i % 3 == 0] for label in backlog[5:10]] for fin_l in ls]
            transformed.append(int(any([l.startswith('direct') for l in labels_last_5])))
            transformed.append(int(any([l.startswith('indirect') for l in labels_last_5])))
            transformed.append(int(any([l.startswith('free_indirect') for l in labels_last_5])))
            transformed.append(int(any([l.startswith('reported') for l in labels_last_5])))
            # How many labels for each class and overall within the last 10 segments
            labels_last_10 = [fin_l for ls in [[l for i, l in enumerate(label.split(",")) if i % 3 == 0] for label in backlog[0:10]] for fin_l in ls if fin_l != ""]
            transformed.append(len([l for l in labels_last_10 if l.startswith('direct')]))
            transformed.append(len([l for l in labels_last_10 if l.startswith('indirect')]))
            transformed.append(len([l for l in labels_last_10 if l.startswith('free_indirect')]))
            transformed.append(len([l for l in labels_last_10 if l.startswith('reported')]))
            transformed.append(len(labels_last_10))

        # --- Other features ---
        # Segment and character lengths
        transformed.append(len(token_strings))
        transformed.append(len(original_text))
        # Segment and character lengths of prev. segment
        transformed.append(backlog[40])
        transformed.append(backlog[41])
        # Segment and character lengths of this + prev. segment
        transformed.append(len(token_strings) + backlog[40])
        transformed.append(len(original_text) + backlog[41])
        # Is this segment at the start or end of a paragraph?
        paragraph_end = int("<p>" in original_text)
        transformed.append(paragraph_end)
        transformed.append(backlog[42])

        # --- Update Backlog ---
        # [0:10] encode labels of previous ten segments -> updated elsewhere
        # 10: Colon in prev. segment
        backlog[10] = colon_this
        # 11: How many open quotes
        backlog[11] += open_quote
        if backlog[11] - close_quote >= 0:
            backlog[11] -= close_quote
        else:
            backlog[11] = 0
        # 12: Prev. segment ends with close_quote
        backlog[12] = int(tags[-1] == "#CLOSE_QUOTE#")
        # 13: Comma at the end of this segment
        backlog[13] = comma_end
        # [14:38] reporting word appearance features prev. segment
        for i, feature in enumerate([has_rep_word_0, has_rep_word_1, has_rep_word_2, has_rep_word_3, has_rep_word_4,
                                     has_rep_word_5,
                                     has_rep_word_le_1, has_rep_word_le_2, has_rep_word_le_3, has_rep_word_le_4,
                                     has_rep_word_le_5,
                                     has_rep_word_noun, has_rep_word_verb,
                                     has_spec_rep_word_0, has_spec_rep_word_1, has_spec_rep_word_2, has_spec_rep_word_3,
                                     has_spec_rep_word_4, has_spec_rep_word_5,
                                     has_spec_rep_word_le_1, has_spec_rep_word_le_2, has_spec_rep_word_le_3,
                                     has_spec_rep_word_le_4, has_spec_rep_word_le_5
            ]):
            backlog[14 + i] = feature
        # 38: Candidate speakers as subject
        backlog[38] = int(len(subj_cand_speaker) > 0)
        # 39: Percentage of candidate speakers
        backlog[39] = num_cand_speaker
        # 40, 41: lengths of prev. segment
        backlog[40] = len(token_strings)
        backlog[41] = len(original_text)
        # 42: paragraph end
        backlog[42] = paragraph_end

        # [43:48]: keep track of pronoun person appearances in the 5 prev. segments
        backlog[43:47] = backlog[44:48]
        # The read window backlog[43:48] covers indices 43-47, so the newest marker goes into slot 47
        if per3:
            if per1:
                backlog[47] = '3_1'
            else:
                backlog[47] = '3'
        elif per1:
            backlog[47] = '1'
        else:
            backlog[47] = '-'

        # 49: How many contiguous prev. segments have been in quotes?
        if in_quotes:
            backlog[49] += 1
        else:
            backlog[49] = 0

        # [50:74] reporting word count features prev. segment
        for i, feature in enumerate([num_rep_word_0, num_rep_word_1, num_rep_word_2, num_rep_word_3, num_rep_word_4,
                                     num_rep_word_5,
                                     num_rep_word_le_1, num_rep_word_le_2, num_rep_word_le_3, num_rep_word_le_4,
                                     num_rep_word_le_5,
                                     num_rep_word_noun, num_rep_word_verb,
                                     num_spec_rep_word_0, num_spec_rep_word_1, num_spec_rep_word_2,
                                     num_spec_rep_word_3,
                                     num_spec_rep_word_4, num_spec_rep_word_5,
                                     num_spec_rep_word_le_1, num_spec_rep_word_le_2, num_spec_rep_word_le_3,
                                     num_spec_rep_word_le_4, num_spec_rep_word_le_5
                                     ]):
            backlog[50 + i] = feature

        return transformed, backlog
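A hypothetical end-to-end call of the extractor, assuming the RFTagger sources, the reporting-word spreadsheet and the word-vector model referenced in __init__ are in place and that the project's string_tokenize helper aligns with the spaCy tokenization; the two segments and their labels_spans values are made up.

import pandas as pd

# Hypothetical usage of STWRFeatureExtractor on a small segment table.
segments = pd.DataFrame({
    "text": ['Er sagte: „Ich komme morgen.“', 'Der Wind wehte durch die Bäume.'],
    "labels_spans": ["direct,10,29", ""],
})
extractor = STWRFeatureExtractor(sequence_features=True)
features, backlog = extractor.transform(segments)
print(features.shape)  # one row per segment, one column per feature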
def postprocess_spans(row, cl=None):
    """
    Method for better span detection as a postprocessing step after STWR classification.

    :param row: Each row consists of a label (format:"direct_speech,2,10") and a text.
    :param cl: label of the positive class instances.
    :return: The updated label
    """
    label = row.values[0]
    # Only do postprocessing for detected instances
    if label == "":
        return label

    text = row.values[1]
    doc = NLP(text)
    tokens = [token for token in doc]
    # Get lemmata with germalemma as spacy is not good at this, only possible for pos tags N, V, ADJ, ADV
    token_lemmata = []
    lemmatizer = GermaLemma()

    for token in tokens:
        if token.pos_ == "VERB":
            token_lemmata.append(lemmatizer.find_lemma(token.text, 'V'))
        elif token.pos_ == "NOUN":
            token_lemmata.append(lemmatizer.find_lemma(token.text, 'N'))
        elif token.pos_ in ["ADJ", "ADV"]:
            token_lemmata.append(lemmatizer.find_lemma(token.text, token.pos_))
        else:
            token_lemmata.append(token.text)

    # Prepare information

    only_opening_quotes = [
        qu for qu in QUOTATION_MARKS.keys() if qu != QUOTATION_MARKS[qu]
    ]
    only_closing_quotes = [
        QUOTATION_MARKS[qu] for qu in QUOTATION_MARKS.keys()
        if qu != QUOTATION_MARKS[qu]
    ]
    # Do not treat apostrophes as possible quotation marks -> too risky
    both_quotes = [
        qu for qu in QUOTATION_MARKS.keys()
        if qu == QUOTATION_MARKS[qu] and qu != '\u0027'
    ]

    # Find quotation marks that can be either an opening or a closing quote but that don't have the same form as their counterpart
    both = [qu for qu in only_opening_quotes if qu in only_closing_quotes]
    only_opening_quotes = [qu for qu in only_opening_quotes if qu not in both]
    only_closing_quotes = [qu for qu in only_closing_quotes if qu not in both]
    both_quotes = both_quotes + both

    # Load reporting word list
    stw_words_all = pd.read_excel("data/stw_words/stw_words_brunner2015.xls")
    # Only use words with a penalty value up to 3
    stw_words_all = stw_words_all[stw_words_all['Penalty'] <= 3]
    # Some words are only usable for reported class
    stw_words = stw_words_all[stw_words_all['Marker'] != 'rep']

    spans = []
    if cl == 'direct':

        # Search for quotation marks and try to decide whether they signify quoted STWR. Use conservative heuristics.
        for token in tokens:
            # Mark different candidates for quotation marks
            if token.text in only_opening_quotes:
                token.tag_ = "ONLY_OPENING_QUOTE"
            elif token.text in only_closing_quotes:
                token.tag_ = "ONLY_CLOSING_QUOTE"
            elif token.text in both_quotes:
                token.tag_ = "BOTH_QUOTES"

        stack = []
        for idx, token in enumerate(tokens):
            if token.tag_ == "ONLY_OPENING_QUOTE":
                stack.append((idx, token.text, token.tag_))

            elif token.tag_ in ["ONLY_CLOSING_QUOTE", "BOTH_QUOTES"]:
                # Check whether there is a matching opening quote on the stack
                found = False
                for i in range(len(stack) - 1, -1, -1):
                    top = stack[i]
                    if QUOTATION_MARKS[top[1]] == token.text:
                        found = True
                        # Closing quotes are usually preceded by sentence ending punctuation
                        if tokens[idx - 1].tag_ == '$.':
                            spans.append((top[0], idx))
                        stack = stack[:i]
                        break
                if not found:
                    # If no opening quotes were found and clear closing quotes are preceded by sentence ending punctuation,
                    # assume everything before is quoted
                    if token.tag_ == "ONLY_CLOSING_QUOTE" and idx > 0 and tokens[
                            idx - 1].tag_ == '$.':
                        spans.append((0, idx))
                    # If ambiguous quotation mark is found, decide whether it's opening or closing
                    elif token.tag_ == "BOTH_QUOTES":
                        if idx > 0 and tokens[idx - 1].tag_ == '$.':
                            spans.append((0, idx))
                        else:
                            stack.append((idx, token.text, token.tag_))

        # Check for open quotes in the stack
        if len(stack) > 0:
            # Choose first open quote in stack
            # Opening quotes are usually followed by capital letters (except continuing quotations, these are ignored here)
            opening = stack[0]
            if opening[0] < len(tokens) - 2:
                if tokens[opening[0] + 1].text.istitle():
                    spans.append((opening[0], len(tokens) - 1))

        # In case no quotation marks are there, look for colon
        if len(spans) == 0:
            for idx, token in enumerate(tokens):
                if ":" == token.text:
                    spans.append((idx, len(tokens) - 1))

    elif cl == 'indirect':

        # Following A.B.'s directions for annotating indirect representations
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)

        # Pattern 1: verbal framing phrase + dependent clause - assume max. one of these patterns per segment
        stw_verb_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if not lemma.istitle() and any(stw_words["Word"].str.contains(
                r'\b{}\b'.format(re.escape(lemma))))
        ]
        # Only use this pattern if there is a clear candidate
        if len(stw_verb_segment) == 1:
            verb = stw_verb_segment[0]
            dependent_clause = get_children(verb, exception=['sb'])

            start = None
            end = None

            for i, token in enumerate(tokens):
                if token == verb:
                    start = i
                elif token in dependent_clause:
                    if start is not None:
                        end = i

            if start is not None and end is not None:
                spans.append((start, end))

        # Pattern 2: nominal phrase incl. modifiers + dependent clause - several of these patterns per segment are possible
        stw_noun_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if lemma.istitle() and any(stw_words["Word"].str.contains(
                r'\b{}\b'.format(re.escape(lemma))))
        ]

        for noun in stw_noun_segment:
            dependent_clause_modif = get_children(noun, exception=[])
            all_tokens = dependent_clause_modif + [noun]

            start = None
            end = None

            for i, token in enumerate(tokens):
                if token in all_tokens:
                    if start is None:
                        start = i
                    else:
                        end = i

            if start is not None and end is not None:
                spans.append((start, end))

        # Merge spans: discard spans that are fully contained in another span
        merged_spans = []
        if len(spans) > 1:
            for span in spans:
                if not any(other != span and span[0] >= other[0] and span[1] <= other[1]
                           for other in spans):
                    merged_spans.append(span)

            spans = merged_spans

    elif cl == 'free_indirect':
        # Free indirect instances are almost always complete sentences -> leave as is
        pass

    elif cl == 'reported':
        # "In principle, for narrated (reported) representation the aim is to mark the whole sentence or clause
        # that renders a speech, thought or writing act.
        # - If several distinct speech, writing or thought acts can be identified, each of them is marked separately.
        # - If a noun phrase is used together with a verb such that the whole constitutes a speech, thought or writing act,
        # the whole verb phrase should be marked, as with indirect representation (i.e. 'Pläne entwerfen' / 'draw up plans',
        # not just 'Pläne' / 'plans')." (translated from the original German)
        # Following A.B.'s directions for annotating reported representations, try to annotate the whole clause for reported instances
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)

        stw_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if any(stw_words_all["Word"].str.contains(r'\b{}\b'.format(
                re.escape(lemma))))
        ]

        for word in stw_segment:
            dependent_clause = get_children(word, exception=[])
            all_tokens = dependent_clause + [word]

            start = None
            end = None

            for i, token in enumerate(tokens):
                if token in all_tokens:
                    if start is None:
                        start = i
                    else:
                        end = i

            if start is not None and end is not None:
                spans.append((start, end))
        # Don't merge spans, as several different reported instances should be labeled separately following A.B.'s directions for annotating reported representations
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)

    # Get character based spans
    if len(spans) > 0:
        labels = []
        for span in spans:
            labels.append("{},{},{}".format(
                cl, tokens[span[0]].idx,
                (tokens[span[1]].idx + len(tokens[span[1]].text))))
        label = ",".join(labels)

    return label
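A hypothetical call of postprocess_spans on a small table of classified segments, assuming the NLP model, QUOTATION_MARKS and get_children globals used inside the function are loaded as in the surrounding project; the column order (label first, text second) follows the docstring.

import pandas as pd

# Hypothetical usage: refine span labels of segments classified as direct speech.
classified = pd.DataFrame({
    "labels_spans": ["direct,0,29", ""],
    "text": ['Er sagte: „Ich komme morgen.“', 'Der Wind wehte.'],
})
classified["labels_spans"] = classified.apply(postprocess_spans, axis=1, cl="direct")
print(classified["labels_spans"].tolist())  # empty labels are passed through unchanged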
Example #6
    ci_upper = match_mean + 1.96 * match_se
    ci_lower = match_mean - 1.96 * match_se

    return match_mean * 100, ci_lower * 100, ci_upper * 100
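Only the tail of get_mean_and_ci survives above; a plausible reconstruction, assuming a normal-approximation 95% confidence interval for the mean of the boolean match column, is sketched here.

def get_mean_and_ci(matches):
    # Plausible reconstruction of the truncated function above: the mean of a
    # boolean match column with a 95% normal-approximation confidence interval.
    matches = matches.astype(float)
    match_mean = matches.mean()
    match_se = matches.std() / (len(matches) ** 0.5)  # standard error of the mean
    ci_upper = match_mean + 1.96 * match_se
    ci_lower = match_mean - 1.96 * match_se

    return match_mean * 100, ci_lower * 100, ci_upper * 100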


print("loading data...")

eval_df = pd.read_csv('eval_table/eval_table_lemmata.csv')
eval_df = eval_df.loc[~eval_df.lemma.isna(), :]

print('loaded %d rows' % len(eval_df))

lemmatizer = GermaLemma()

eval_df['germalemma'] = eval_df.apply(lambda row: lemmatizer.find_lemma(row[3], row[2]), axis=1)

eval_df['match'] = eval_df.lemma == eval_df.germalemma
eval_df.head()

print('wrong lemmata:')
print(eval_df.loc[~eval_df.match, ['token', 'pos', 'lemma', 'germalemma']])

match_mean, ci_lower, ci_upper = get_mean_and_ci(eval_df.match)

print('Success rate for germalemma: %.2f%% (95%% CI: [%.2f%%, %.2f%%])' % (match_mean, ci_lower, ci_upper))

eval_df['pattern'] = eval_df.apply(lambda row: lemma_via_patternlib(row[3], row[2]), axis=1)
eval_df['match_pattern'] = eval_df.lemma == eval_df.pattern
eval_df.head()
df_articles['Nouns'] = df_articles['Nouns'].apply(
    lambda x: [word for word in x if len(word) > 1])
# df_articles['Nounverbs'] = df_articles['Nounverbs'].apply(lambda x: [word for word in x if len(x)>1])

# Lemmatization
lemmatizer = GermaLemma()

# Lemmatization of Nouns
noun_list = df_articles['Nouns'].tolist()

global noun_lemma_list
noun_lemma_list = []
for doc in noun_list:
    noun_lemma_list.append([])
    for token in doc:
        token_lemma = lemmatizer.find_lemma(token.text, token.tag_)
        token_lemma = token_lemma.lower()
        noun_lemma_list[-1].append(token_lemma)

# Save to help df
df_help_noun_lemma_list = pandas.DataFrame({'x': noun_lemma_list})

# Create id increasing (needed to merge to original df)
df_help_noun_lemma_list.insert(0, 'ID_incr',
                               range(1, 1 + len(df_help_noun_lemma_list)))

# Merge df_help_noun_lemma_list into df_articles and rename
df_articles = (df_articles.merge(
    df_help_noun_lemma_list, left_on='ID_incr',
    right_on='ID_incr')).rename(columns={'x': 'Nouns_lemma'})
Example #8
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from germalemma import GermaLemma
import pickle
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger

lemmatizer = GermaLemma()

# passing the word and the POS tag ("N" for noun)
with open('data/pos.pickle', 'rb') as f:
    tagger = pickle.load(f)

pos = tagger.tag(['Jungen', u'Wände', u'Wänden'])
print(pos)
for item in pos:
    w, p = item
    print(lemmatizer.find_lemma(w, p))
#lemma = lemmatizer.find_lemma(u'Jungen', u'N')
#print(lemma)
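find_lemma raises ValueError for POS tags it cannot handle (as the other examples suggest), so a slightly more defensive version of the loop above could look like this:

for w, p in pos:
    try:
        print(lemmatizer.find_lemma(w, p))
    except ValueError:
        print(w)  # fall back to the surface form for unsupported POS tags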
Example #9
    # build lemmatizer with tokens_a

    lemmata = defaultdict(dict)
    lemmata_lower = defaultdict(dict)
    for token, lemma, pos in tokens_a:
        GermaLemma.add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma,
                                        pos)

    lemmatizer = GermaLemma(lemmata=lemmata, lemmata_lower=lemmata_lower)

    # test lemmatizer with tokens_b

    n_success = 0
    for token, true_lemma, pos in tokens_b:
        found_lemma = lemmatizer.find_lemma(token, pos)
        if found_lemma == true_lemma:
            n_success += 1
        elif found_lemma != token and token not in known_incorrect_lemmata_tokens:
            incorrect_lemmata.append((token, found_lemma, true_lemma))
            known_incorrect_lemmata_tokens |= {token}

    n_all = len(tokens_b)
    pct_success = n_success / n_all * 100
    print('%d / %d = %.2f%%' % (n_success, n_all, pct_success))

    pct_success_all_trials.append(pct_success)

print('')
print('success rate germalemma:')
print('%.2f%%' % (sum(pct_success_all_trials) / len(pct_success_all_trials)))
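The listing starts inside a trial loop; `tokens_a` and `tokens_b` are presumably disjoint random splits of (token, lemma, POS) triples from an annotated corpus. A minimal sketch of how such a split could be produced (the function name, the 50/50 ratio and the seeding are assumptions):

import random


def split_tokens(token_lemma_pos, train_ratio=0.5, seed=None):
    """Randomly split (token, lemma, pos) triples into two disjoint lists."""
    rng = random.Random(seed)
    shuffled = list(token_lemma_pos)
    rng.shuffle(shuffled)
    cut = int(len(shuffled) * train_ratio)
    return shuffled[:cut], shuffled[cut:]


# e.g. per trial: tokens_a, tokens_b = split_tokens(corpus_triples, seed=trial)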
Example #10
0
def comment_to_topic(comment):

    # load resources: lemmatizer, extra stopwords, punctuation set and POS tagger
    lemmatizer = GermaLemma()
    lemmas = []
    remove = [
        line.rstrip('\n')
        for line in open('reviews/add-stopwords.txt', encoding="utf-8")
    ]
    stop = stopwords.words('german')
    exclude_words = remove + stop
    exclude = {
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
        '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~'
    }

    with open('reviews/nltk_german_classifier_data.pickle', 'rb') as f:
        tagger = pickle.load(f)

    # sentence splitting
    comment = nltk.sent_tokenize(comment)

    lemmas = []

    for j in range(len(comment)):
        # tokenization
        comment[j] = nltk.word_tokenize(comment[j])

        # punctuation removal
        comment[j] = [
            token for token in comment[j]
            if token not in exclude and token.isalpha()
        ]

        # POS tagging
        comment[j] = tagger.tag(comment[j])

        # lemmatization

        for k in range(len(comment[j])):
            try:
                lemmas.append(
                    lemmatizer.find_lemma(comment[j][k][0], comment[j][k][1]))
            except ValueError:
                pass

    # lower
    lemmas = [word.lower() for word in lemmas]

    # stopword removal
    topics = [word for word in lemmas if word not in exclude_words]

    # make topics html-safe
    topics_safe = [
        t.replace('ä', 'ae').replace('ü',
                                     'ue').replace('ö',
                                                   'oe').replace('ß', 'ss')
        for t in topics
    ]

    return topics, topics_safe
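A possible call of the function above; the review text is made up for illustration, and the stopword file plus the pickled tagger referenced in the function body have to be in place:

topics, topics_safe = comment_to_topic(
    "Das Personal war sehr freundlich, aber die Wartezeit war viel zu lang.")
print(topics)       # lemmatized, lowercased content words minus stopwords
print(topics_safe)  # the same list with umlauts and ß transliterated for HTML use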
class SentimentDetector:
    def __init__(self, path: str = "src/data/", windowSize=5) -> None:
        self.path = path
        self.windowSize = windowSize

        self.df_aspect_tokens = None
        self.df_preprocessed = None
        self.df_lexicon = None

        self.lemmatizer = GermaLemma()

    def downloadLexicon(
        self,
        filename: str = "sentiment_lexicon.csv",
        url:
        str = "https://raw.githubusercontent.com/sebastiansauer/pradadata/master/data-raw/germanlex.csv",
        chunk_size: int = 1024,
    ) -> None:
        """
        Download sentiment lexicon.

        Args:
            filename (str, optional): Defaults to "sentiment_lexicon.csv".
            url (str, optional): Defaults to "https://raw.githubusercontent.com/sebastiansauer/pradadata/master/data-raw/germanlex.csv".
            chunk_size (int, optional): Chunk size used when downloading bigger files. Defaults to 1024.
        """
        r = requests.get(url, stream=True)

        file_size = int(r.headers.get("Content-Length", 0))

        downloadProgress = tqdm(total=file_size,
                                desc="Downloading Lexicon...",
                                unit="B",
                                unit_scale=True)

        with open(self.path + filename, "wb") as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                downloadProgress.update(len(chunk))
                fd.write(chunk)
        downloadProgress.close()

    def loadCSVs(
        self,
        tokenFilename: str = "data_aspects_tokens.csv",
        preprocessedFilename: str = "data_preprocessed.csv",
        lexiconFilename: str = "sentiment_lexicon.csv",
    ) -> bool:
        """
        load all CSV files required by the detector and set indices as appropriate

        Args:
            tokenFilename (str, optional): Defaults to "data_aspects_tokens.csv".
            preprocessedFilename (str, optional): Defaults to "data_preprocessed.csv".
            lexiconFilename (str, optional): Defaults to "sentiment_lexicon.csv".

        Returns:
            bool: successful execution

        """
        try:
            if self.df_aspect_tokens is None or self.df_aspect_tokens.empty:
                self.df_aspect_tokens = PD.read_csv(self.path + tokenFilename)

                self.df_aspect_tokens["polarity_strength"] = PD.NaT
                self.df_aspect_tokens["polarity_strength"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["sentiment_words"] = PD.NaT
                self.df_aspect_tokens["sentiment_words"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["intensifier_words"] = PD.NaT
                self.df_aspect_tokens["intensifier_words"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["word_found"] = self.df_aspect_tokens[
                    "word_found"].str.replace(r"[^\w]*", "", regex=True)

                # TODO remove after debugging
                # self.df_aspect_tokens = self.df_aspect_tokens[:100]

            if self.df_preprocessed is None or self.df_preprocessed.empty:
                self.df_preprocessed = PD.read_csv(self.path +
                                                   preprocessedFilename)

                # pandas read_csv does not read arrays correctly so we need to adjust those
                tqdm.pandas(desc="Applying Datatype Transformations....")
                self.df_preprocessed["tokens"] = self.df_preprocessed[
                    "tokens"].progress_apply(lambda x: json.loads(x))

            if self.df_lexicon is None or self.df_lexicon.empty:
                if not os.path.exists(self.path + lexiconFilename):
                    self.downloadLexicon()

                self.df_lexicon = PD.read_csv(self.path + lexiconFilename)
                self.df_lexicon.drop_duplicates(subset=["word", "qualifier"],
                                                inplace=True)
                self.df_lexicon.set_index("word", inplace=True)
                self.df_lexicon.drop("%%")

            return True
        except IOError as e:
            print(e)
            return False

    def loadSpacyModel(
        self,
        model: str = "de_core_news_lg",
        disableList: list[str] = ["ner", "textcat"],
    ) -> bool:
        """
        load the spacy model with required modes

        Args:
            model (str, optional): name of the model. Defaults to "de_core_news_lg".
            disableList (list[str], optional): list of pipeline components to disable. Defaults to ["ner", "textcat"].
        """
        try:
            self.nlp = spacy.load(model, disable=disableList)
            return True
        except OSError:
            print("Model not found. Attempting to download..")
            try:
                spacy.cli.download(model)
            except Exception as e:
                print(e)
                return False
            self.nlp = spacy.load(model, disable=disableList)
            return True

    def checkValidChild(self, child, childType: ChildType) -> bool:
        if childType == ChildType.DESCRIPTOR:
            if (child.tag_ == "ADJA"
                    and child.pos_ == "ADJ") or (child.pos_ == "ADV"
                                                 and child.tag_ == "ADJD"):
                return True
            return False
        elif childType == ChildType.INTENSIFIER:
            if child.pos_ == "ADJ" or child.pos_ == "ADV":
                return True
            return False
        else:
            print("Wrong childType.")
            return False

    def checkPolarityAdjective(self, child, rowIdx) -> float:
        """
        check if the given word has an entry in the sentiment lexicon and return given polarity strength

        Args:
            child (spacy.Token): tokenized word with tagged 'pos_' and 'text'

        Returns:
            pol_strength (float): signed polarity strength of the given word from the sentiment lexicon; 1 if the word was not found
        """

        # strip non-word characters (str.replace does not interpret regular expressions, so re.sub is needed; requires `import re`)
        child_normalized = re.sub(r"[^\w]*", "", child.text)

        lexEntry = self.checkLexicon(child_normalized)

        if lexEntry is None:
            lexEntry = self.checkLexicon(child_normalized.lower())

        if lexEntry is None:
            lemma = self.lemmatizer.find_lemma(child_normalized, child.pos_)
            lexEntry = self.checkLexicon(lemma)

        if lexEntry is None:
            return 1

        if type(lexEntry["qualifier"]) == str:
            pol_strength = lexEntry["polarity_strength"]
            if lexEntry["qualifier"] == "NEG":
                return -pol_strength
            return pol_strength
        else:
            for i, qualifier in enumerate(lexEntry["qualifier"].values):
                if qualifier == "POS":
                    return lexEntry["polarity_strength"][i]
                if qualifier == "NEG":
                    return -lexEntry["polarity_strength"][i]
            return 0

    def checkLexicon(self, word) -> PD.Series:
        """
        Check for valid lexicon entries return None if not found

        Args:
            word (str): word to be use as key

        Returns:
            PD.Series: Series that is found for the given key or None
        """
        try:
            return self.df_lexicon.loc[word]
        except KeyError:
            return None

    def checkForIntensifier(self, child, rowIdx) -> float:
        """
        Check whether the given spacy.Token (child) is an intensifier (INT) or shifter (SHI)
        in the sentiment lexicon and return the corresponding polarity multiplier.

        Args:
            child (spacy.Token): tokenized word with tagged 'pos_' and 'text'

        Returns:
            polarity_multiplier (float): multiplier of the found intensifier/shifter; 1 if none was found
        """
        child_normalized = re.sub(r"[^\w]*", "", child.text)
        # catch words that are not in the sentiment lexicon

        lexEntry = self.checkLexicon(child_normalized)

        if lexEntry is None:
            lexEntry = self.checkLexicon(child_normalized.lower())

        if lexEntry is None:
            lemma = self.lemmatizer.find_lemma(child_normalized, child.pos_)
            lexEntry = self.checkLexicon(lemma)

        if lexEntry is None:
            return 1

        if type(lexEntry["qualifier"]) == str:
            if lexEntry["qualifier"] == "INT":

                self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                    child.text)
                return lexEntry["polarity_strength"]
            elif lexEntry["qualifier"] == "SHI":
                self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                    child.text)
                return -1
            else:
                return 1

        else:
            for i, qualifier in enumerate(lexEntry["qualifier"].values):
                # TODO currently the first qualifier found is taken, without considering which the most fitting one is
                if qualifier == "INT":

                    self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                        child.text)
                    return lexEntry["polarity_strength"][i]
                elif qualifier == "SHI":
                    self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                        child.text)
                    return -1
            return 1

    def calcTotalPolarityStrength(self, child, rowIdx) -> float:
        """
        Calculate the total polarity for a given word

        Args:
            child (spacy.Token): the tokenized word with tagged 'pos_' and 'text'

        Returns:
            polarity_strength (float): the calculated polarity for the given word (child)
        """
        # lemma = self.lemmatizer.find_lemma(child.text, child.pos_)
        polarity_strength = self.checkPolarityAdjective(child, rowIdx)

        # find intensifier in children and multiply their strength to the polarity
        for c in child.children:
            if self.checkValidChild(c, ChildType.INTENSIFIER):
                polarity_strength *= self.checkForIntensifier(c, rowIdx)
        return polarity_strength

    def detectSentiment(self, rowDF: PD.Series) -> None:
        """
        Detect the sentiment for a single aspect-token row by inspecting the syntactic
        children of the aspect word and, failing that, the children of its verb/auxiliary ancestors.

        Args:
            rowDF (PD.Series): row of the aspect-token DataFrame
        """
        doc = self.nlp(" ".join(self.df_preprocessed.iloc[
            rowDF["reviewnumber"]]["tokens"][rowDF["sent_idx"]]))

        for child in doc[rowDF["word_idx"]].children:
            # if child.tag_ == "ADJA":
            if self.checkValidChild(child, ChildType.DESCRIPTOR):
                pol_strength = self.calcTotalPolarityStrength(
                    child, rowDF.name)

                self.df_aspect_tokens["polarity_strength"][rowDF.name].append(
                    pol_strength)

                self.df_aspect_tokens["sentiment_words"][rowDF.name].append(
                    child.text)
                return

        for token in doc[rowDF["word_idx"]].ancestors:
            if token.pos_ == "AUX" or token.pos_ == "VERB":
                for child in token.children:
                    if self.checkValidChild(child, ChildType.DESCRIPTOR):
                        pol_strength = self.calcTotalPolarityStrength(
                            child, rowDF.name)

                        self.df_aspect_tokens["polarity_strength"][
                            rowDF.name].append(pol_strength)

                        self.df_aspect_tokens["sentiment_words"][
                            rowDF.name].append(child.text)
                        return

    def convert_polarity(self, qualifier, polarity):
        sentiment_polarity = []
        for i, elem in enumerate(qualifier):
            if elem == "NEG":
                sentiment_polarity.append(polarity[i] * -1)
            else:
                sentiment_polarity.append(polarity[i])
        sentiment_polarity = NP.mean(NP.array(sentiment_polarity))
        return sentiment_polarity

    def createReadableOutput(self, rowDF):
        appenddict = {
            "review_number":
            rowDF["reviewnumber"],
            "sentiment":
            self.convert_polarity(rowDF["qualifier"],
                                  rowDF["polarity_strength"]),
        }

        self.overall_sentiment = self.overall_sentiment.append(
            appenddict, ignore_index=True)

    def returnSentimentsforReviews(self) -> PD.DataFrame:
        self.overall_sentiment = PD.DataFrame(
            columns=["review_text", "sentiment"])
        tqdm.pandas(desc="Calculating Sentiments")
        self.df_aspect_tokens.progress_apply(
            lambda x: self.createReadableOutput(x), axis=1)

        self.overall_sentiment = (self.overall_sentiment.groupby(
            "review_number").mean().reset_index())
        # print(self.overall_sentiment)
        self.overall_sentiment["review_text"] = self.df_preprocessed[
            "text_normalized"][self.overall_sentiment["review_number"].astype(
                int).tolist()].tolist()

        return self.overall_sentiment

    def run(self) -> bool:
        """
        run all basic functions of the detector

        Returns:
            bool: successful execution of command
        """
        if not self.loadCSVs():
            print("Couldn't load CSV's.")
            return False

        if not self.loadSpacyModel():
            return False

        true_labels = list()
        for index, row in self.df_aspect_tokens.iterrows():
            true_labels.append(self.df_preprocessed.iloc[row["reviewnumber"]][
                self.df_aspect_tokens.iloc[index]["aspect"]])
        self.df_aspect_tokens["true_label"] = true_labels

        tqdm.pandas(desc="Looking up Sentiments...")
        self.df_aspect_tokens.progress_apply(lambda x: self.detectSentiment(x),
                                             axis=1)

        return True

    def saveCSV(self, filename: str = "data_aspects_tokens.csv"):
        self.df_aspect_tokens["sentiment_words"] = self.df_aspect_tokens[
            "sentiment_words"].apply(lambda x: json.dumps(x))
        self.df_aspect_tokens.to_csv(self.path + filename, index=False)
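A sketch of how the class above might be driven end to end; the file names fall back to the defaults shown in `loadCSVs`/`saveCSV`, and calling `returnSentimentsforReviews()` after `run()` is an assumption rather than part of this listing:

detector = SentimentDetector(path="src/data/", windowSize=5)

if detector.run():                                   # loads CSVs and the spaCy model, then detects sentiments
    overall = detector.returnSentimentsforReviews()  # aggregated polarity per review
    print(overall.head())
    detector.saveCSV()                               # writes data_aspects_tokens.csv back to disk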
Example #12
0
def main():
    # train
    if os.path.exists('./resources/nltk_german_classifier_data.pickle'):
        with open('./resources/nltk_german_classifier_data.pickle', 'rb') as f:
            print('./resources/nltk_german_classifier_data.pickle found')
            tagger = pickle.load(f)
    else:
        print(
            'could not find ./resources/nltk_german_classifier_data.pickle: training: IN PROGRESS'
        )
        tagger = train()
        with open('./resources/nltk_german_classifier_data.pickle', 'wb') as f:
            pickle.dump(tagger, f, protocol=2)
        print('training FINISHED')

    # tokenize
    if os.path.exists('./data/1.pickle'):
        with open('./data/1.pickle', 'rb') as f:
            print('1.pickle found')
            words = pickle.load(f)
    else:
        print('could not find 1.pickle: tokenizing: IN PROGRESS')
        document = open('./resources/logik-band-eins.txt').read()
        tok = Tokenizer()
        tokens = tok.tokenize(document)

        words = []
        i = 0
        for token in tokens:
            if i < 10000:
                v = token.value
                if len(v) > 1 and (not str.isdigit(v)) or True:
                    words.append(v)
                # i = i + 1
            else:
                break
        with open('./data/1.pickle', 'wb') as f:
            pickle.dump(words, f, protocol=2)
        print('tokenizing FINISHED')

    # tag
    if os.path.exists('./data/2.pickle'):
        with open('./data/2.pickle', 'rb') as f:
            print('2.pickle found')
            tagged_words = pickle.load(f)
    else:
        print('could not find 2.pickle: tagging: IN PROGRESS')
        tagged_words = tagger.tag(words)
        with open('./data/2.pickle', 'wb') as f:
            pickle.dump(tagged_words, f, protocol=2)

    # filter-in As, Ns, and Vs
    if os.path.exists('./data/3.pickle'):
        with open('./data/3.pickle', 'rb') as f:
            print('3.pickle found')
            filtered_words = pickle.load(f)
    else:
        print('could not find 3.pickle: filtering: IN PROGRESS')
        parts_of_speech = [
            'ADJA',
            'ADJD',
            'NN',
        ]
        filtered_words = list(
            filter(
                lambda word: word[1][0] == 'V' or any(
                    pos == word[1] for pos in parts_of_speech), tagged_words))
        with open('./data/3.pickle', 'wb') as f:
            pickle.dump(filtered_words, f, protocol=2)

    # lemmatize
    if os.path.exists('./data/4.pickle'):
        with open('./data/4.pickle', 'rb') as f:
            print('4.pickle found')
            lemmatized_words = pickle.load(f)
    else:
        print('could not find 4.pickle: lemmatization: IN PROGRESS')
        lemmatizer = GermaLemma()
        lemmatized_words = []
        for word in filtered_words:
            try:
                lemmatized_words.append(lemmatizer.find_lemma(
                    word[0], word[1]))
            except:
                w = word[0]
                l = word[1]
                print(f"EXCEPT: {w} {l}")
                continue
        with open('./data/4.pickle', 'wb') as f:
            pickle.dump(lemmatized_words, f, protocol=2)

    # filter-out modals
    f = open('./resources/modal-words.txt', 'r')
    modal_words = f.read().splitlines()[:1000]
    non_modals = [item for item in lemmatized_words if item not in modal_words]
    # non_modals = list(filter(lambda word: not any(modal == word for modal in modals), lemmatized_words))
    # modals = []
    # line = f.readline()
    # modals.append(line)
    # while line:
    #     line = f.readline()
    #     modals.append(line)

    for pair in Counter(non_modals).most_common(30):
        print(pair[0] + " " + str(pair[1]))
Example #13
0
class _PreprocWorker(mp.Process):
    def __init__(self,
                 worker_id,
                 docs,
                 language,
                 tasks_queue,
                 results_queue,
                 tokenizer,
                 stemmer,
                 lemmata_dict,
                 pos_tagger,
                 group=None,
                 target=None,
                 name=None,
                 args=(),
                 kwargs=None):
        super(_PreprocWorker, self).__init__(group, target, name, args, kwargs
                                             or {})
        logger.debug('worker `%s`: init with worker ID %d' % (name, worker_id))
        logger.debug('worker `%s`: docs = %s' % (name, str(set(docs.keys()))))
        self.worker_id = worker_id
        self.docs = docs
        self.language = language
        self.tasks_queue = tasks_queue
        self.results_queue = results_queue

        # set a tokenizer
        self.tokenizer = tokenizer  # tokenizer instance (must have a callable attribute `tokenize` with a document
        # text as argument)

        # set a stemmer
        self.stemmer = stemmer  # stemmer instance (must have a callable attribute `stem`)

        # set a POS tagger
        self.pos_tagger = pos_tagger  # POS tagger instance (must have a callable attribute `tag`)

        self.lemmata_dict = lemmata_dict
        self.pattern_module = None  # dynamically loaded CLiPS pattern library module
        self.germalemma = None  # GermaLemma instance
        self.wordnet_lemmatizer = None  # nltk.stem.WordNetLemmatizer instance

        self._tokens = {
        }  # tokens for this worker at the current processing stage. dict with document label -> tokens list
        self._ngrams = {}  # generated ngrams

        #self._filtered = False
        self._orig_tokens = None  # original (unfiltered) tokens, when filtering is currently applied

    def run(self):
        logger.debug('worker `%s`: run' % self.name)

        for next_task, task_kwargs in iter(self.tasks_queue.get, None):
            logger.debug('worker `%s`: received task `%s`' %
                         (self.name, next_task))

            exec_task_fn = getattr(self, '_task_' + next_task)
            if exec_task_fn:
                exec_task_fn(**task_kwargs)
            else:
                raise NotImplementedError("Task not implemented: `%s`" %
                                          next_task)

            self.tasks_queue.task_done()

        logger.debug('worker `%s`: shutting down' % self.name)
        self.tasks_queue.task_done()

    def _put_items_in_results_queue(self, container):
        if container:
            logger.debug('worker `%s`: putting %d results in queue' %
                         (self.name, len(container)))
            for pair in container.items():
                self.results_queue.put(pair)
        else:
            # we *have* to put something in the result queue -> signal that we return "nothing"
            logger.debug('worker `%s`: putting None in results queue' %
                         self.name)
            self.results_queue.put(None)

    def _task_get_tokens(self):
        self._put_items_in_results_queue(self._tokens)

    def _task_get_tokens_with_worker_id(self):
        self.results_queue.put((self.worker_id, self._tokens))

    def _task_get_ngrams(self):
        self._put_items_in_results_queue(self._ngrams)

    def _task_get_ngrams_with_worker_id(self):
        self.results_queue.put((self.worker_id, self._ngrams))

    def _task_get_vocab_doc_freq(self):
        counts = Counter()
        for dt in self._tokens.values():
            counts.update(set(ith_column(dt)))
        self.results_queue.put(counts)

    def _task_get_state(self):
        logger.debug('worker `%s`: getting state' % self.name)

        state_attrs = ('docs', 'language', '_tokens', '_ngrams',
                       '_orig_tokens')

        state = {attr: getattr(self, attr) for attr in state_attrs}
        logger.debug('worker `%s`: got state with %d items' %
                     (self.name, len(state)))
        self.results_queue.put(state)

    def _task_set_tokens(self, tokens):
        logger.debug('worker `%s`: setting tokens' % self.name)
        self._tokens = tokens

    def _task_set_ngrams(self, ngrams):
        logger.debug('worker `%s`: setting ngrams' % self.name)
        self._ngrams = ngrams

    def _task_set_state(self, **state):
        logger.debug('worker `%s`: setting state' % self.name)

        for attr, val in state.items():
            setattr(self, attr, val)

    def _task_tokenize(self):
        self._tokens = {
            dl: tuplize(self.tokenizer.tokenize(txt))
            for dl, txt in self.docs.items()
        }

    def _task_generate_ngrams(self, n, join=True, join_str=' '):
        self._ngrams = {
            dl: create_ngrams(ith_column(dt),
                              n=n,
                              join=join,
                              join_str=join_str)
            for dl, dt in self._tokens.items()
        }

    def _task_use_ngrams_as_tokens(self, join=False, join_str=' '):
        if join:
            new_tok = {
                dl: tuplize([join_str.join(g_tuple) for g_tuple in dg])
                for dl, dg in self._ngrams.items()
            }
        else:
            new_tok = {dl: tuplize(dg) for dl, dg in self._ngrams.items()}

        self._tokens = new_tok

    def _task_transform_tokens(self, transform_fn):
        self._tokens = {
            dl: apply_to_mat_column(dt, 0, transform_fn) if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_stem(self):
        self._tokens = {
            dl: apply_to_mat_column(dt, 0, lambda t: self.stemmer.stem(t))
            if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_pos_tag(self):
        self._tokens = {
            dl: apply_to_mat_column(
                dt, 0, self.pos_tagger.tag, map_func=False, expand=True)
            if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_lemmatize(self,
                        pos_tagset,
                        use_dict=False,
                        use_patternlib=False,
                        use_germalemma=None):
        tmp_lemmata = defaultdict(list)

        if use_germalemma is None and self.language == 'german':
            use_germalemma = True

        if use_germalemma:
            if not self.germalemma:
                self.germalemma = GermaLemma()

            for dl, tok_tags in self._tokens.items():
                for t, pos in tok_tags:
                    try:
                        l = self.germalemma.find_lemma(t, pos)
                    except ValueError:
                        l = t
                    tmp_lemmata[dl].append(l)
        else:
            if use_dict and self.lemmata_dict:
                for dl, tok_tags in self._tokens.items():
                    for t, pos in tok_tags:
                        pos = simplified_pos(pos, tagset=pos_tagset)

                        if pos:
                            l = self.lemmata_dict.get(pos, {}).get(t, None)
                            if l == '-' or l == '':
                                l = None
                        else:
                            l = None
                        tmp_lemmata[dl].append(l)

            if use_patternlib:
                if not self.pattern_module:
                    if self.language not in PATTERN_SUBMODULES:
                        raise ValueError(
                            "no CLiPS pattern module for this language:",
                            self.language)

                    modname = 'pattern.%s' % PATTERN_SUBMODULES[self.language]
                    self.pattern_module = import_module(modname)

                for dl, tok_tags in self._tokens.items():
                    tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags))

                    lemmata_final = []
                    for (t, pos), t_found in zip(tok_tags, tok_lemmata):
                        l = t_found

                        if l is None:
                            if pos.startswith('NP'):  # singularize noun
                                l = self.pattern_module.singularize(t)
                            elif pos.startswith('V'):  # get infinitive of verb
                                l = self.pattern_module.conjugate(
                                    t, self.pattern_module.INFINITIVE)
                            elif pos.startswith('ADJ') or pos.startswith(
                                    'ADV'
                            ):  # get baseform of adjective or adverb
                                l = self.pattern_module.predicative(t)

                        lemmata_final.append(l)

                    tmp_lemmata[dl] = lemmata_final

        if len(tmp_lemmata) == 0:
            if not self.wordnet_lemmatizer:
                self.wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

            for dl, tok_tags in self._tokens.items():
                for t, pos in tok_tags:
                    wn_pos = pos_tag_convert_penn_to_wn(pos)
                    if wn_pos:
                        l = self.wordnet_lemmatizer.lemmatize(t, wn_pos)
                    else:
                        l = t
                    tmp_lemmata[dl].append(l)

        # merge
        lemmatized_tokens = {}
        for dl, tok_tags in self._tokens.items():
            tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags))
            new_tok_tags = [(l or t, pos)
                            for (t, pos), l in zip(tok_tags, tok_lemmata)]
            assert len(new_tok_tags) == len(tok_tags)
            lemmatized_tokens[dl] = new_tok_tags

        assert len(lemmatized_tokens) == len(self._tokens)
        self._tokens = lemmatized_tokens

    def _task_expand_compound_tokens(self,
                                     split_chars=('-', ),
                                     split_on_len=2,
                                     split_on_casechange=False):
        tmp_tokens = {}
        for dl, dt in self._tokens.items():
            nested = [
                expand_compound_token(tup[0], split_chars, split_on_len,
                                      split_on_casechange) for tup in dt
            ]
            tmp_tokens[dl] = tuplize(flatten_list(nested))

        self._tokens = tmp_tokens

    def _task_remove_special_chars_in_tokens(self, special_chars):
        self._tokens = {
            dl: apply_to_mat_column(
                dt,
                0,
                lambda x: remove_special_chars_in_tokens(x, special_chars),
                map_func=False) if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_clean_tokens(self,
                           tokens_to_remove,
                           save_orig_tokens=False,
                           remove_shorter_than=None,
                           remove_longer_than=None,
                           remove_numbers=False):
        if save_orig_tokens:
            self._save_orig_tokens()

        if remove_shorter_than is not None:
            self._tokens = {
                dl: [t for t in dt if len(t[0]) >= remove_shorter_than]
                for dl, dt in self._tokens.items()
            }

        if remove_longer_than is not None:
            self._tokens = {
                dl: [t for t in dt if len(t[0]) <= remove_longer_than]
                for dl, dt in self._tokens.items()
            }

        if remove_numbers:
            self._tokens = {
                dl: [t for t in dt if not t[0].isnumeric()]
                for dl, dt in self._tokens.items()
            }

        if type(
                tokens_to_remove
        ) is not set:  # using a set is much faster than other sequence types for "in" tests
            tokens_to_remove = set(tokens_to_remove)

        self._tokens = {
            dl: [t for t in dt if t[0] not in tokens_to_remove]
            for dl, dt in self._tokens.items()
        }

    def _task_filter_for_token(self,
                               search_token,
                               match_type='exact',
                               ignore_case=False,
                               glob_method='match',
                               remove_found_token=False):
        self._save_orig_tokens()

        self._tokens = filter_for_token(self._tokens,
                                        search_token,
                                        match_type=match_type,
                                        ignore_case=ignore_case,
                                        glob_method=glob_method,
                                        remove_found_token=remove_found_token,
                                        remove_empty_docs=False)

    def _task_filter_for_pos(self,
                             required_pos,
                             pos_tagset,
                             simplify_pos=True):
        self._save_orig_tokens()
        self._tokens = filter_for_pos(self._tokens,
                                      required_pos,
                                      simplify_pos=simplify_pos,
                                      simplify_pos_tagset=pos_tagset)

    def _task_reset_filter(self):
        self._tokens = self._orig_tokens
        self._orig_tokens = None

    def _save_orig_tokens(self):
        if self._orig_tokens is None:  # initial filtering -> save a copy of the original tokens
            self._orig_tokens = deepcopy(self._tokens)
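As the `run()` loop above shows, the worker consumes `(task_name, kwargs)` tuples from `tasks_queue` and stops at a `None` sentinel, writing results to `results_queue`. A minimal sketch of driving a single worker; the tokenizer, stemmer and tagger objects are placeholders that would have to provide the callables noted in `__init__`:

import multiprocessing as mp

tasks_queue = mp.JoinableQueue()
results_queue = mp.Queue()

worker = _PreprocWorker(
    worker_id=0,
    docs={'doc1': 'Das ist ein kurzer Beispieltext.'},
    language='german',
    tasks_queue=tasks_queue,
    results_queue=results_queue,
    tokenizer=my_tokenizer,   # placeholder: object with a .tokenize(text) method
    stemmer=my_stemmer,       # placeholder: object with a .stem(token) method
    lemmata_dict=None,
    pos_tagger=my_tagger,     # placeholder: object with a .tag(tokens) method
    name='preproc-worker-0',
)
worker.start()

tasks_queue.put(('tokenize', {}))
tasks_queue.put(('pos_tag', {}))
tasks_queue.put(('get_tokens', {}))
tasks_queue.join()                # wait until all three tasks are done

print(results_queue.get())        # a (doc_label, tokens) pair, or None if there were no docs

tasks_queue.put(None)             # sentinel: shut the worker down
worker.join()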
Example #14
0
class SentiDep:
    def __init__(self, **kwargs):
        """
            Sentiment-Analyzer for german texts.
            Get the polarity values of words depending on
            polarity values of associated descriptive words
            e.g. 'das schöne Wetter' -> polarity of 'Wetter' == polarity of 'schöne'

            Purpose: find out in which sentiment context your keywords appear in a text.
            Note: Works with spacy, nltk and germalemma
        """
        sentiws_path = kwargs.get(
            'sentiws_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/sentiws.pickle"))
        polarity_mod_path = kwargs.get(
            'polarity_modifiers_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/polarity_modifiers.pickle"))
        negations_path = kwargs.get(
            'negations_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/negationen_lexicon.pickle"))
        stts_path = kwargs.get(
            'stts_file',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data/stts.pickle"))
        self.sentiws = pickle.load(open(sentiws_path, 'rb'))
        self.polarity_modifications = pickle.load(open(polarity_mod_path,
                                                       'rb'))
        self.negations = pickle.load(open(negations_path, 'rb'))
        self.nlp = spacy.load("de_core_news_md")
        self.germalemmatizer = GermaLemma()
        self.stts = pickle.load(open(stts_path, 'rb'))
        self.german_stops = stopwords.words('german')

    def tokenize(self, text):
        """
        Tokenize a string using spacy's tokenizer.
        Input: text/string
        Output: spacy_doc
        """
        return self.nlp(text)

    def sentiws_spacy_tag_mapper(self, pos_tag, **kwargs):
        """
        Function for mapping SentiWS POS-tags to spacy POS-tags and reverse.
        Input: pos_tag, optional: direction
               -> values: 1 (sentiws to spacy), -1 (spacy to sentiws)
               -> default: 1
        Output: python str
        """
        direction = kwargs.get('direction', 1)
        senti_map = {
            "ADJX": "ADJ",
            "ADV": "ADV",
            "NN": "NOUN",
            "VVINF": "VERB"
        }
        if direction > 0:
            return senti_map[pos_tag]
        elif direction < 0:
            return {value: key for key, value in senti_map.items()}[pos_tag]

    def get_polarity(self, word, pos_tag):
        """
        Getter function for retrieving the polarity value from SentiWS for a given word and POS tag.
        Input: word, pos_tag
        Output: tuple(word, polarity-value, pos_tag)
        """
        senti_words = list(
            filter(
                lambda x: x[0] == word and self.sentiws_spacy_tag_mapper(x[2])
                == pos_tag, self.sentiws))
        if senti_words:
            senti_words = sorted(senti_words,
                                 key=lambda y: y[1]**2,
                                 reverse=True)[0]
            return senti_words

    def modify_polarity(self, child, polarity):
        """
        Function to account for polarity enhancers and reducers.
        Input: child (text of a dependent token), polarity (tuple(word, polarity-value, pos_tag))
        Output: tuple(word, scaled polarity-value, pos_tag), or None if the child is no modifier
        """
        senti_word = polarity
        if senti_word:
            if child in self.polarity_modifications["polarity_enhancer"]:
                return (senti_word[0], senti_word[1] * 1.5, senti_word[2])
            elif child in self.polarity_modifications["polarity_reducer"]:
                return (senti_word[0], senti_word[1] * 0.5, senti_word[2])

    def easy_switch(self, word):
        """
        Check whether a word matches one of the simple negation patterns, without handling complex negation scope.
        Input: token/word
        Output: True/False
        """
        neg_search = [
            re.search(r'%s' % (n), word)
            for n in self.negations["negation_regex"]
        ]
        neg_search = list(filter(lambda z: z is not None, neg_search))
        return bool(neg_search)

    def add_polarities(self, list_of_polarity_tuples):
        """
        Summing up a list of polarity-tuples
        :param list_of_polarity_tuples:
        :return: polarity value -> float
        """
        all_pols = [lpt[1] for lpt in list_of_polarity_tuples]
        return sum(all_pols)

    def calc_parent_polarity(self, spacy_token, token_polarity,
                             children_polarities):
        """
        Calculating the parent polarity value depending on the children polarities
        :param spacy_token:
        :param token_polarity:
        :param children_polarities:
        :return: parent_polarity -> tuple(word, polarity, POS-tag)
        """
        if token_polarity and children_polarities:
            added_children_polarities = self.add_polarities(
                children_polarities)
            if added_children_polarities > 0:
                token_polarity = (spacy_token.text, token_polarity[1] +
                                  added_children_polarities, spacy_token.pos_)
            elif added_children_polarities < 0:
                token_polarity = (spacy_token.text,
                                  (token_polarity[1] +
                                   (-1 * added_children_polarities)) * (-1),
                                  spacy_token.pos_)
        elif not token_polarity and children_polarities:
            token_polarity = (spacy_token.text,
                              self.add_polarities(children_polarities),
                              spacy_token.pos_)
        return token_polarity

    def switch_polarity(self, polarity, spacy_doc_sent):
        """
        Switching polarity value depending on negation context of whole sentence.
        Classic negation (kein, nicht, ...) are recognized as well as
        negation stops (aber, obwohl, ...)
        :param polarity:
        :param spacy_doc_sent:
        :return: tuple(word, polarity, POS-tag, negation: boolean)
        """
        negation_trigger = False
        for i, token in enumerate(spacy_doc_sent):
            for negex in self.negations['negation_regex']:
                regex = r'%s' % (negex)
                negation_search = re.search(regex, token.text, re.I)
                if negation_search:
                    negation_trigger = not negation_trigger
            if token.lower_ in self.negations['polarity_switches']:
                if token.text == '.':
                    if token.pos_ == 'PUNCT':
                        negation_trigger = not negation_trigger
                    else:
                        continue
                else:
                    negation_trigger = not negation_trigger
            if token.text == polarity[0]:
                if negation_trigger:
                    negated_polarity = (polarity[0], -polarity[1], polarity[2],
                                        "negation: " + str(negation_trigger))
                else:
                    negated_polarity = (polarity[0], polarity[1], polarity[2],
                                        "negation: " + str(negation_trigger))
                return negated_polarity

    def get_depending_polarities(self, text, keywords):
        """
        Get keyword associated polarity values of german texts.
        Polarity analysis including polarity reducer/enhancer and negations
        :param text:
        :param keywords:
        :return: Context-polarity value of keywords -> list of tuples
        """
        spacy_doc = self.nlp(text, disable=['ner', 'textcat'])
        parent_polarities = []
        keywords = [k.lower() for k in keywords]
        for sent in spacy_doc.sents:
            for i, token in enumerate(sent):
                token_polarity = self.get_polarity(token.text, token.pos_)
                children_polarities = []
                if token.lower_ in keywords:
                    children = list(token.children)
                    if children:
                        for child in children:
                            child_polarity = self.get_polarity(
                                child.text, child.pos_)
                            if child_polarity:
                                children_polarities.append(child_polarity)
                    parent_polarity = self.calc_parent_polarity(
                        token, token_polarity, children_polarities)
                    if parent_polarity:
                        modified_parent_polarities = []
                        for child in children:
                            modified = self.modify_polarity(
                                child, parent_polarity)
                            if modified:
                                modified_parent_polarities.append(modified)
                        added_modified_parent_polarity = None
                        if modified_parent_polarities:
                            added_modified_parent_polarity = self.add_polarities(
                                modified_parent_polarities)
                        if added_modified_parent_polarity:
                            added_modified_parent_polarity = (
                                token.text, added_modified_parent_polarity,
                                token.pos_ + "_modified")
                            parent_polarities.append(
                                self.switch_polarity(
                                    added_modified_parent_polarity, sent))
                        else:
                            parent_polarities.append(
                                self.switch_polarity(parent_polarity, sent))
        parent_polarities = [(term.lower(), t_pol, t_pos, neg)
                             for term, t_pol, t_pos, neg in parent_polarities]
        return parent_polarities

    def lemmatize(self, spacy_token):
        """
        Lemmatizer using stts-tagset, spacy-token and GermaLemma.
        Input: spacy token -> german model
        Output: python str
        """
        tag = spacy_token.tag_
        if tag.startswith(('N', 'V', 'ADJ', 'ADV')) and tag in self.stts:
            return self.germalemmatizer.find_lemma(spacy_token.text, tag)
        else:
            return spacy_token.text

    def generate_topics(self, texts, num_topics=10):
        """
        Generate the top keywords (lemmatized nouns, stopwords removed) for a list of texts, ranked by tf-idf.
        Input: texts (list of strings), num_topics (number of keywords to return, default 10)
        Output: dict mapping keyword -> tf-idf score
        """
        tokens = [[token for token in self.tokenize(text)] for text in texts]
        tokens = [[self.lemmatize(t) for t in token if t.pos_ == 'NOUN'\
                  and t.lower_ not in self.german_stops] for token in tokens]
        docs = [" ".join(t) for t in tokens]
        cv = CountVectorizer(max_df=0.85, max_features=10000)
        word_count_vector = cv.fit_transform(docs)
        tf = TfidfTransformer(smooth_idf=True, use_idf=True)
        tf.fit(word_count_vector)
        feature_names = cv.get_feature_names()
        tf_idf_scores = []
        for doc in docs:
            cv_vector = cv.transform([doc])
            tf_idf_vector = tf.transform(cv_vector)
            sorted_items = self.sort_coo(tf_idf_vector.tocoo())
            keywords, scores = self.extract_topn_from_vector(
                feature_names, sorted_items, 10)
            tf_idf_scores += list(zip(keywords, scores))

        tfidf_topics = sorted(tf_idf_scores, key=lambda x: x[1], reverse=True)
        return dict(tfidf_topics[:num_topics])

    def sort_coo(self, coo_matrix):
        tuples = zip(coo_matrix.col, coo_matrix.data)
        return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

    def extract_topn_from_vector(self, feature_names, sorted_items, topn=10):
        sorted_items = sorted_items[:topn]
        score_vals = []
        feature_vals = []

        for idx, score in sorted_items:
            score_vals.append(round(score, 3))
            feature_vals.append(feature_names[idx])

        results = {}
        for idx in range(len(feature_vals)):
            results[feature_vals[idx]] = score_vals[idx]
        return results, score_vals

    def create_clinic_polarity_dict(self, key_list, topics):
        """
        Compute polarity scores document-wise
        :param key_list: list of polarity-scores and document-key
                         -> form: [[polarity-scores_1, document-key_1] ...]
                         -> hint: simple pandas dump with
                            df[[polarity-values, document]].values.tolist()
        :param topics: list of keywords associated with a certain topic
        :return: polarities_dict in form:
                 {document_key_1: polarities_1, ...}
        """
        polarities = {}
        clinic_counter = {}
        for rl in tqdm(key_list):
            if not rl[1] in clinic_counter.keys():
                clinic_counter[rl[1]] = 1
            key = f'{rl[1]}_{clinic_counter[rl[1]]}'
            polarities[key] = self.get_depending_polarities(rl[0], topics)
            clinic_counter[rl[1]] += 1
        return polarities

    def create_polarity_df(self, polarities, topics):
        """
        Transforms polarity-scores from 'create_clinic_polarity_dict' output
        to a formatted pandas dataframe
        :param polarities: polarities-dict (output from 'create_clinic_polarity_dict')
        :param topics: list of keywords associated with a certain topic
        :return: polarity_df (formatted pandas dataframe) of form:
                 columns: keywords/topics
                 rows: document-keys
                 values: float(polarity-scores) or np.nan
        """
        filtered_polarities = [(clinic, polarity)
                               for clinic, polarity in polarities.items()
                               if polarity]
        columns = {t: [] for t in topics}
        ids = {"Klinik": []}
        for clinic, polarity in tqdm(filtered_polarities):
            ids["Klinik"].append(clinic)
            row = {t: [] for t in topics}
            for pol in polarity:
                row[pol[0].lower()] = pol[1]
            for word, p in row.items():
                if not p:
                    columns[word].append(np.nan)
                else:
                    columns[word].append(p)
        for key, value in columns.items():
            if len(value) != len(ids["Klinik"]):
                raise ValueError("Values in dict must have same length!")

        polarity_df = pd.DataFrame(data=columns, index=ids["Klinik"])
        return polarity_df

    '''