def annotate_stw(self, t, clf_class, majority_classes=None):
    """
    Method for annotating a segment with one of the classes speech, thought or writing,
    given the STWR classification clf_class.
    :param t: The text of the segment.
    :param clf_class: One of direct, indirect, free_indirect, reported. The predicted class for t.
    :param majority_classes: A dictionary containing the majority class (one of speech, thought
                             or writing) for each STWR class.
    :return: One of speech, thought or writing; the annotation for t.
    """
    # Get the stored majority classes if no others are given
    if not majority_classes:
        majority_classes = self.majority_classes
    # Direct and free_indirect should always be classified by majority class, as reporting words
    # are more likely to appear outside of segments of these classes.
    if clf_class in ['direct', 'free_indirect']:
        return majority_classes[clf_class]
    # For the other types, check for reporting words with an unambiguous type; else use the majority class
    doc = NLP(t)
    # Get lemmata with germalemma, as spacy is not good at this
    lemmatizer = GermaLemma()
    lemmata = []
    for token in doc:
        if token.pos_ == "VERB":
            lemmata.append(lemmatizer.find_lemma(token.text, 'V'))
        elif token.pos_ == "NOUN":
            lemmata.append(lemmatizer.find_lemma(token.text, 'N'))
    if len(lemmata) > 0:
        stw_words_t = pd.concat([
            self.stw_words[self.stw_words["Word"].str.contains(
                r'\b{}\b'.format(re.escape(lemma)))]
            for lemma in lemmata
        ], axis=0, ignore_index=True)
    else:
        stw_words_t = []
    if len(stw_words_t) == 1:
        if stw_words_t["Type"][0] in ["speech", "thought", "writing"]:
            return stw_words_t["Type"][0]
        else:
            return majority_classes[clf_class]
    else:
        return majority_classes[clf_class]
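# --- Added illustration (not part of the original project) ---
# Minimal sketch of the germalemma call pattern used above: find_lemma() only supports nouns,
# verbs, adjectives and adverbs and raises ValueError for other POS tags, which is why the
# snippets in this file either filter by POS first or wrap the call in try/except.
from germalemma import GermaLemma

lemmatizer = GermaLemma()
print(lemmatizer.find_lemma('Feinstaubbelastungen', 'N'))   # -> 'Feinstaubbelastung'
try:
    lemmatizer.find_lemma('und', 'KON')   # conjunction: POS tag not supported
except ValueError:
    print('unsupported POS tag')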
def lemmatize_tokens(tokens):
    lemmatizer = GermaLemma()
    new_tokens = {}
    for doc_label, tok_pos in tokens.items():
        lemmata_pos = []
        for t, pos in tok_pos:
            try:
                l = lemmatizer.find_lemma(t, pos)
            except ValueError:
                # germalemma only handles N/V/ADJ/ADV tags; keep the token unchanged otherwise
                l = t
            lemmata_pos.append((l, pos))
        new_tokens[doc_label] = lemmata_pos
    return new_tokens
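# --- Added illustration (hypothetical input, not from the original project) ---
# lemmatize_tokens() expects a dict mapping document labels to lists of (token, POS) pairs and
# returns the same structure with lemmata substituted where germalemma supports the POS tag.
example_tokens = {'doc1': [('Hunde', 'N'), ('liefen', 'V'), ('und', 'KON')]}
print(lemmatize_tokens(example_tokens))
# expected along the lines of {'doc1': [('Hund', 'N'), ('laufen', 'V'), ('und', 'KON')]}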
import germalemma


class GermaLemma(PipelineModule):
    def __init__(self, pos_prereq):
        self.pos_prereq = pos_prereq
        # Use the module-qualified name: this pipeline class shadows the imported lemmatizer
        # class of the same name, so a bare GermaLemma(...) call would hit this wrapper instead.
        self.lemmatizer = germalemma.GermaLemma(
            tiger_corpus='resources/tiger_release_aug07.corrected.16012013.conll09')

    def targets(self):
        return {'lemma-germalemma'}

    def prerequisites(self):
        return {'token', self.pos_prereq}

    def make(self, prerequisite_data):
        tokens = prerequisite_data['token']
        pos = prerequisite_data[self.pos_prereq]
        pattern1 = re.compile("^[NV]")
        pattern2 = re.compile("^(ADJ|ADV)")

        def lemmatize_token(t, postag):
            try:
                if pattern1.match(postag):
                    return self.lemmatizer.find_lemma(t, postag)
                elif pattern2.match(postag):
                    return self.lemmatizer.find_lemma(t, postag[:3])
                else:
                    return 0
            except Exception as e:
                sys.stderr.write(
                    f"Lemmatizing {t} ({postag}) raised exception: {e}\n")
                return 0

        return {
            'lemma-germalemma': list(map(lambda x: lemmatize_token(x[0], x[1]),
                                         zip(tokens, pos)))
        }
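# --- Added illustration (hypothetical wiring, not from the original project) ---
# The pipeline module above consumes parallel 'token' and POS sequences; tags outside
# N*/V*/ADJ*/ADV* come back as 0 so downstream steps can filter them out.
module = GermaLemma(pos_prereq='pos')
result = module.make({'token': ['Häuser', 'und', 'liefen'],
                      'pos': ['NN', 'KON', 'VVFIN']})
print(result['lemma-germalemma'])   # e.g. ['Haus', 0, 'laufen']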
class STWRFeatureExtractor(object): """ Feature extractor for classifiying STWR. """ def __init__(self, sequence_features=True): """ :param sequence_features: If true, use the sequence features (trained on gold labels). """ # Number of features self.num_features = 243 # Names of features - needed for feature inspection self.feature_names = ["perc_pos_NNE", "perc_pos_TRUNC", "perc_pos_APPO", "perc_pos_VVPP", "perc_pos_FM", "perc_pos_KOUI", "perc_pos_ITJ", "perc_pos_PTKANT", "perc_pos_$.", "perc_pos_ADJA", "perc_pos_ADJD", "perc_pos_PTKNEG", "perc_pos_PWS", "perc_pos_PRF", "perc_pos_KOUS", "perc_pos_PDS", "perc_pos_VMINF", "perc_pos_VVIZU", "perc_pos_PPOSS", "perc_pos_VVFIN", "perc_pos_VMFIN", "perc_pos_PROAV", "perc_pos_PRELS", "perc_pos_APPR", "perc_pos_PPOSAT", "perc_pos_APZR", "perc_pos_$,", "perc_pos_PIAT", "perc_pos_VMPP", "perc_pos_NE", "perc_pos__SP", "perc_pos_VAPP", "perc_pos_VAIMP", "perc_pos_CARD", "perc_pos_APPRART", "perc_pos_NN", "perc_pos_KOKOM", "perc_pos_PWAT", "perc_pos_PPER", "perc_pos_XY", "perc_pos_ART", "perc_pos_PWAV", "perc_pos_KON", "perc_pos_PTKA", "perc_pos_VVINF", "perc_pos_$(", "perc_pos_PDAT", "perc_pos_PTKZU", "perc_pos_PRELAT", "perc_pos_PIS", "perc_pos_PTKVZ", "perc_pos_VAINF", "perc_pos_ADV", "perc_pos_VAFIN", "perc_pos_VVIMP", "perc_pos_", "perc_pos_SCONJ", "perc_pos_SYM", "perc_pos_VERB", "perc_pos_X", "perc_pos_EOL", "perc_pos_SPACE", "perc_pos_PUNCT", "perc_pos_ADJ", "perc_pos_ADP", "perc_pos_ADV", "perc_pos_AUX", "perc_pos_CONJ", "perc_pos_CCONJ", "perc_pos_DET", "perc_pos_INTJ", "perc_pos_NOUN", "perc_pos_NUM", "perc_pos_PART", "perc_pos_PRON", "perc_pos_PROPN", "num_ents", "num_PER", "num_LOC", "num_ORG", "num_MISC", "colon", "colon_prev", "comma_end", "perc_emph", "question", "open_quote", "close_quote", "in_quotes", "num_prev_in_quotes", "punct_close_quote", "close_quote_comma", "perc_per1", "perc_per2", "perc_per12", "perc_per3", "only_3_prev_5", "only_1_prev_5", "3_1_prev_5", "has_ind", "has_subj", "no_subj", "no_ind", "has_pres", "has_past", "no_past", "no_pres", "embedded", "wuerden_inf", "wuerden", "has_prep_noun_comp", "has_claus_inf_comp", "subj_cand_speaker", "num_cand_speaker", "prev_subj_cand_speaker", "prev_num_cand_speaker", "has_rep_word_0", "has_rep_word_1", "has_rep_word_2", "has_rep_word_3", "has_rep_word_4", "has_rep_word_5", "has_rep_word_le_1", "has_rep_word_le_2", "has_rep_word_le_3", "has_rep_word_le_4", "has_rep_word_le_5", "has_rep_word_noun", "has_rep_word_verb", "has_spec_rep_word_0", "has_spec_rep_word_1", "has_spec_rep_word_2", "has_spec_rep_word_3", "has_spec_rep_word_4", "has_spec_rep_word_5", "has_spec_rep_word_le_1", "has_spec_rep_word_le_2", "has_spec_rep_word_le_3", "has_spec_rep_word_le_4", "has_spec_rep_word_le_5", "num_rep_word_0", "num_rep_word_1", "num_rep_word_2", "num_rep_word_3", "num_rep_word_4", "num_rep_word_5", "num_rep_word_le_1", "num_rep_word_le_2", "num_rep_word_le_3", "num_rep_word_le_4", "num_rep_word_le_5", "num_rep_word_noun", "num_rep_word_verb", "num_spec_rep_word_0", "num_spec_rep_word_1", "num_spec_rep_word_2", "num_spec_rep_word_3", "num_spec_rep_word_4", "num_spec_rep_word_5", "num_spec_rep_word_le_1", "num_spec_rep_word_le_2", "num_spec_rep_word_le_3", "num_spec_rep_word_le_4", "num_spec_rep_word_le_5", "prev_has_rep_word_0", "prev_has_rep_word_1", "prev_has_rep_word_2", "prev_has_rep_word_3", "prev_has_rep_word_4", "prev_has_rep_word_5", "prev_has_rep_word_le_1", "prev_has_rep_word_le_2", "prev_has_rep_word_le_3", "prev_has_rep_word_le_4", "prev_has_rep_word_le_5", 
"prev_has_rep_word_noun", "prev_has_rep_word_verb", "prev_has_spec_rep_word_0", "prev_has_spec_rep_word_1", "prev_has_spec_rep_word_2", "prev_has_spec_rep_word_3", "prev_has_spec_rep_word_4", "prev_has_spec_rep_word_5", "prev_has_spec_rep_word_le_1", "prev_has_spec_rep_word_le_2", "prev_has_spec_rep_word_le_3", "prev_has_spec_rep_word_le_4", "prev_has_spec_rep_word_le_5", "prev_num_rep_word_0", "prev_num_rep_word_1", "prev_num_rep_word_2", "prev_num_rep_word_3", "prev_num_rep_word_4", "prev_num_rep_word_5", "prev_num_rep_word_le_1", "prev_num_rep_word_le_2", "prev_num_rep_word_le_3", "prev_num_rep_word_le_4", "prev_num_rep_word_le_5", "prev_num_rep_word_noun", "prev_num_rep_word_verb", "prev_num_spec_rep_word_0", "prev_num_spec_rep_word_1", "prev_num_spec_rep_word_2", "prev_num_spec_rep_word_3", "prev_num_spec_rep_word_4", "prev_num_spec_rep_word_5", "prev_num_spec_rep_word_le_1", "prev_num_spec_rep_word_le_2", "prev_num_spec_rep_word_le_3", "prev_num_spec_rep_word_le_4", "prev_num_spec_rep_word_le_5", "max_sim", "max_sim_rep", "perc_deictic", "spec_conjunct", "perc_modal", "perc_neg", "has_facial", "has_gesture", "has_voice", "repetition", "last_direct", "last_indirect", "last_free_indirect", "last_reported", "last_5_direct", "last_5_indirect", "last_5_free_indirect", "last_5_reported", "last_10_direct", "last_10_indirect", "last_10_free_indirect", "last_10_reported", "num_last_10_reported", "len_tokens", "len_chars", "prev_len_tokens", "prev_len_chars", "sum_len_tokens", "sum_len_chars", "paragraph", "prev_paragraph"] # Switch to turn off sequence features self.sequence_features = sequence_features if not self.sequence_features: self.feature_names = self.feature_names[:-21] + self.feature_names[-8:] # Get all possible tags self.tag_map = sorted(NLP.vocab.morphology.tag_map.keys()) self.pos_map = sorted(spacy.parts_of_speech.NAMES.values()) # Set up lemmatizer self.lemmatizer = GermaLemma() # Set up RFTagger call(["make"], cwd="RFTagger/src") # Load word vectors print("Loading word-vectors. This may take a while ...") self.wordvecs = KeyedVectors.load_word2vec_format("data/word_vecs/kolimo.model", binary=True) print("Done.\n") def transform(self, text, original_text = None, backlog=[]): """ Method that transforms the given segments into their feature representation. Expects dataframe with column ["text"] or list of spacy tokens along with the original text or string. :param text: dataframe with column ["text"] that contains the string segments or list of spacy tokens. :param original_text: the original text as string is passed in test mode. :param backlog: For test mode, the backlog stores info and labels of former segments and therefore has to be passed back and forth between classifier and feature extractor. 
:return: The transformed segments as pandas Dataframe or list, depending on the type of 'text' """ # If the backlog has not been initialized, initialize it if len(backlog) == 0: backlog = ["" for i in range(10)] + [0 for i in range(64)] # If spacy tokenization and quote annotation has not been performed, do it now if type(text) == list: tokens = text elif type(text) == pd.DataFrame: # Get full text for better results in spacy parsing full_text = " ".join(text['text'].values) doc = NLP(full_text) # Exchange tags for quotation marks for special tokens: #OPEN_QUOTE#, #CLOSE_QUOTE# doc = annotate_quotes(doc) tokens_full_text = [token for token in doc] # Transform individual segments if type(text) == list: return self.transform_segment(tokens, backlog, original_text) else: output = pd.DataFrame() print("Extracting features...") for ind, row in text.iterrows(): # print progress bar sys.stdout.write('\r') # the exact output you're looking for: sys.stdout.write("[%-20s] %d%%" % ('=' * round(ind/(len(text)/20)), round(ind/(len(text)/100)))) sys.stdout.flush() # Get the tokens corresponding to the segment: tokens_text = string_tokenize(row['text']) tokens = tokens_full_text[:len(tokens_text)] # Check that this is correct assert tokens_text[-1] == tokens[-1].text tokens_full_text = tokens_full_text[len(tokens_text):] transformed, backlog = self.transform_segment(tokens, backlog, row['text']) output = output.append(pd.Series(transformed), ignore_index = True) # Adapt backlog: backlog stores last ten classifications in the first ten positions backlog[0:10] = backlog[1:10] + [row['labels_spans']] return output, backlog def transform_segment(self, tokens, backlog, original_text): """ Transforms an individual segment of tokens, given the information in the backlog, into a feature representation. 
:param tokens: list of spacy tokens :param backlog: list containing information about the labels and other features of previous segments :param original_text: The original text as string :return: the feature representation and the updated backlog """ # --- Preprocessing --- transformed = [] token_strings = [token.text for token in tokens] # Get lemmata with germalemma as spacy is not good at this, only possible for pos tags N, V, ADJ, ADV token_lemmata = [] for token in tokens: if token.pos_ == "VERB": token_lemmata.append(self.lemmatizer.find_lemma(token.text, 'V')) elif token.pos_ == "NOUN": token_lemmata.append(self.lemmatizer.find_lemma(token.text, 'N')) elif token.pos_ in ["ADJ", "ADV"]: token_lemmata.append(self.lemmatizer.find_lemma(token.text, token.pos_)) else: token_lemmata.append(token.text) # Load reporting word list stw_words_orig = pd.read_excel("data/stw_words/stw_words_brunner2015.xls") # Some words are only usable for reported class stw_words_rep = stw_words_orig[stw_words_orig['Marker'] == 'rep'] stw_words = stw_words_orig[stw_words_orig['Marker'] != 'rep'] # Do deeper morphological analysis with RFTagger file = open("RFTagger/temp.txt", "w") file.write("\n".join(token_strings)) file.close() morph_tagged = check_output(["src/rft-annotate", "lib/german.par", "temp.txt"], cwd="RFTagger", stderr=FNULL).decode( "utf-8").split("\n") # Split morph tags into attributes morph_tagged = [morph_tag.split("\t")[1].split(".") if morph_tag != "" else morph_tag for morph_tag in morph_tagged] # --- Pos tag features --- tags = [token.tag_ for token in tokens] pos = [token.pos_ for token in tokens] transformed += [(tags.count(tag)/len(tags)) if tag in tags else 0 for tag in self.tag_map] transformed += [(pos.count(p) / len(pos)) if p in pos else 0 for p in self.pos_map] # --- NE features --- doc = NLP(original_text) transformed.append(len(doc.ents)) for ne_type in NE_TYPES: transformed.append(int(len([ent for ent in doc.ents if ent.label_ == ne_type]) > 0)) # --- Special token features --- # Colon in this or in previous segment? colon_this = int(":" in token_strings) transformed.append(colon_this) transformed.append(backlog[10]) # Comma at the end of this segment means that the next segment is an embedded sentence if it has a verb comma_end = int(tags[-1] == '$,') transformed.append(comma_end) # Percentage of 'emphatic' punctuation marks: ?,!,-,– transformed.append((token_strings.count('?') + token_strings.count('!') + token_strings.count('-') + token_strings.count('–'))/len(token_strings)) # Question? transformed.append(int((token_strings.count('?') > 0))) # Quotes features # Opening Quotes in this segment? open_quote = len([tag for tag in tags if tag == "#OPEN_QUOTE#"]) # Closing Quotes in this segment? close_quote = len([tag for tag in tags if tag == "#CLOSE_QUOTE#"]) # In quotes? in_quotes = int(backlog[11] > 0 or open_quote > 0) transformed.append(open_quote) transformed.append(close_quote) transformed.append(in_quotes) # How many contiguous prev. segments have been in quotes so far? This is meant to tackle errors bc of missing closing quotes # as well as marking sequences of embedded narration transformed.append(backlog[49]) # Special combinations direct - full quoted sentence (sent. ending punct. before closing quotes), # comma after closing quotes (prob. 
frame of direct speech) transformed.append(int(len([tag for i, tag in enumerate(tags) if tag == "#CLOSE_QUOTE#" and i > 0 and tags[i-1] == "$."]) > 0)) transformed.append(int((backlog[12] == 1 and token_strings[0] == ",") or (len([tag for i, tag in enumerate(tags) if tag == "#CLOSE_QUOTE#" and i < len(token_strings)-1 and token_strings[i+1] == ","]) > 0))) # --- Morphological Features --- # percentage of first and second person pronouns (personal, possessive, reflexive) per1 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '1'] per2 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '2'] per12 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] in ['1', '2']] transformed.append(len(per1) / len(token_strings)) # Second person might be a better feature than 1. and 2. together as it is seldom the perspective of a narrative transformed.append(len(per2) / len(token_strings)) transformed.append(len(per12)/len(token_strings)) # percentage of third person pronouns (personal, possessive, reflexive) per3 = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 2 and morph_tag[0] == 'PRO' and morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] == '3'] transformed.append(len(per3) / len(token_strings)) # Note changes in the usage of person; this might help to distinguish between third and first person perspective narratives # Only third person in prev. five segments? transformed.append(int(len([b for b in backlog[43:48] if b == '3']) > 0 and len([b for b in backlog[43:48] if b in ['1', '1_3']]) == 0)) # Only first person in prev. five segments? transformed.append(int(len([b for b in backlog[43:48] if b == '1']) > 0 and len([b for b in backlog[43:48] if b in ['3', '1_3']]) == 0)) # Mixed first and third person in prev. five segments transformed.append(int(len([b for b in backlog[43:48] if b == '3_1']) > 0 or (len([b for b in backlog[43:48] if b == '3']) > 0 and len([b for b in backlog[43:48] if b == '1']) > 0))) # tempus and modus features has_ind = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and morph_tag[5] == 'Ind']) > 0) has_subj = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and morph_tag[5] == 'Subj']) > 0) no_subj = int(not any([morph_tag[5] == 'Subj' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN'])) no_ind = int(not any([morph_tag[5] == 'Ind' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN'])) has_pres = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and morph_tag[4] == 'Pres']) > 0) has_past = int(len([morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN' and morph_tag[4] == 'Past']) > 0) no_past = int(not any([morph_tag[4] == 'Past' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN'])) no_pres = int(not any([morph_tag[4] == 'Pres' for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN'])) for feature in [has_ind, has_subj, no_subj, no_ind, has_pres, has_past, no_past, no_pres]: transformed.append(feature) # --- Grammatical features --- # Comma at the end of the prev. 
segment means that this segment is an embedded sentence if it has a verb if backlog[13] and any([tag in ['VFIN', 'VAFIN'] for tag in tags]): transformed.append(1) else: transformed.append(0) # A form of verb 'würden' + infinitive can be a pointer towards free indirect transformed.append(int(any([lemma == 'würden' for lemma in token_lemmata]) and any( [(tag in ['VAINF', 'VMINF', 'VVINF', 'VVIZU'] and token_lemmata[i] != 'würden') for i, tag in enumerate(tags)]))) transformed.append(int(any([lemma == 'würden' for lemma in token_lemmata]))) # Noun/prepositional complements of a rep. word point toward reported STW, # sentence/infinitive complements point towards indirect STW all_stw_words = [token for i,token in enumerate(tokens) if any(stw_words_orig["Word"].str.contains(r'\b{}\b'.format(re.escape(token_lemmata[i]))))] has_prep_noun_comp = int(len([rep_word for rep_word in all_stw_words if len([child for child in rep_word.children if child.pos_ in ['ADP', 'PROPN', 'NOUN'] and child.dep_.startswith('o')]) > 0]) > 0) has_claus_inf_comp = int(len([rep_word for rep_word in all_stw_words if len([child for child in rep_word.children if child.dep_ == 'oc']) > 0]) > 0) transformed.append(has_prep_noun_comp) transformed.append(has_claus_inf_comp) # --- Possible speaker features --- # Is subject a pronoun, a person NE or a "Person" head noun -> possible speaker cand_speakers = [tokens[i] for i,tag in enumerate(tags) if (tag in['PPER', 'PIS', 'PDS'] or (tag in ['NE', 'NNE'] and 'PER' in [ent for ent in doc.ents if tokens[i].idx >= ent.start and tokens[i].idx <= ent.end]))] # Check whether any noun phrase has a head that is a synset of "Person" in Germanet person = [] with open('data/person.txt', 'r', encoding='utf-8') as f: for l in f: person.append(l) for np in doc.noun_chunks: if np.root.text in person: cand_speakers.append(np.root) subj_cand_speaker = [token for token in cand_speakers if token.dep_ == 'sb'] # How many possible speakers/addressees are there in relation to the segment length? num_cand_speaker = len(cand_speakers)/len(tokens) transformed.append(int(len(subj_cand_speaker) > 0)) transformed.append(num_cand_speaker) # Append prev. 
segments candidate speaker features transformed.append(backlog[38]) transformed.append(backlog[39]) # --- Reporting word features --- # Appearance of reporting word by penalty has_rep_word_0 = int(any([stw_words[stw_words["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_1 = int(any([stw_words[stw_words["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_2 = int(any([stw_words[stw_words["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_3 = int(any([stw_words[stw_words["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_4 = int(any([stw_words[stw_words["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_5 = int(any([stw_words[stw_words["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) # Appearance of reporting word lower or equal a certain penalty has_rep_word_le_1 = int(any([stw_words[stw_words["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_le_2 = int(any([stw_words[stw_words["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_le_3 = int(any([stw_words[stw_words["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_le_4 = int(any([stw_words[stw_words["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_rep_word_le_5 = int(any([stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) # Appearance of noun/verb reporting word -> this might be interesting to differentiate 'reported' from 'direct/'indirect' has_rep_word_noun = int(any([(len(stw_words[(stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma)))) & (stw_words[stw_words["Penalty"] <= 5]["Word"].str.istitle())]) > 0) for lemma in token_lemmata])) has_rep_word_verb = int(any([(len(stw_words[(stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma)))) & (stw_words[stw_words["Penalty"] <= 5]["Word"].str.islower())]) > 0) for lemma in token_lemmata])) for feature in [has_rep_word_0, has_rep_word_1, has_rep_word_2, has_rep_word_3, has_rep_word_4, has_rep_word_5, has_rep_word_le_1, has_rep_word_le_2, has_rep_word_le_3, has_rep_word_le_4, has_rep_word_le_5, has_rep_word_noun, has_rep_word_verb]: transformed.append(feature) # Appearance of special reporting words for reported class by penalty has_spec_rep_word_0 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_1 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_2 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_3 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 
3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_4 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_5 = int(any([stw_words_rep[stw_words_rep["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) # Appearance of special reporting words lower or equal a certain penalty has_spec_rep_word_le_1 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_le_2 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_le_3 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_le_4 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) has_spec_rep_word_le_5 = int(any([stw_words_rep[stw_words_rep["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata])) for feature in [has_spec_rep_word_0, has_spec_rep_word_1, has_spec_rep_word_2, has_spec_rep_word_3, has_spec_rep_word_4, has_spec_rep_word_5, has_spec_rep_word_le_1, has_spec_rep_word_le_2, has_spec_rep_word_le_3, has_spec_rep_word_le_4, has_spec_rep_word_le_5]: transformed.append(feature) # Number of reporting word by penalty num_rep_word_0 = sum([stw_words[stw_words["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_1 = sum([stw_words[stw_words["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_2 = sum([stw_words[stw_words["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_3 = sum([stw_words[stw_words["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_4 = sum([stw_words[stw_words["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_5 = sum([stw_words[stw_words["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) # Number of reporting word lower or equal a certain penalty num_rep_word_le_1 = sum([stw_words[stw_words["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_le_2 = sum([stw_words[stw_words["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_le_3 = sum([stw_words[stw_words["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_le_4 = sum([stw_words[stw_words["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_rep_word_le_5 = sum([stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) # Number of noun/verb reporting word -> this might be interesting to differentiate 'reported' from 'direct/'indirect' 
num_rep_word_noun = sum([(len(stw_words[(stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma)))) & (stw_words[stw_words["Penalty"] <= 5]["Word"].str.istitle())]) > 0) for lemma in token_lemmata]) num_rep_word_verb = sum([(len(stw_words[(stw_words[stw_words["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma)))) & (stw_words[stw_words["Penalty"] <= 5]["Word"].str.islower())]) > 0) for lemma in token_lemmata]) for feature in [num_rep_word_0, num_rep_word_1, num_rep_word_2, num_rep_word_3, num_rep_word_4, num_rep_word_5, num_rep_word_le_1, num_rep_word_le_2, num_rep_word_le_3, num_rep_word_le_4, num_rep_word_le_5, num_rep_word_noun, num_rep_word_verb]: transformed.append(feature) # Number of special reporting words for reported class by penalty num_spec_rep_word_0 = sum([stw_words_rep[stw_words_rep["Penalty"] == 0]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_1 = sum([stw_words_rep[stw_words_rep["Penalty"] == 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_2 = sum([stw_words_rep[stw_words_rep["Penalty"] == 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_3 = sum([stw_words_rep[stw_words_rep["Penalty"] == 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_4 = sum([stw_words_rep[stw_words_rep["Penalty"] == 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_5 = sum([stw_words_rep[stw_words_rep["Penalty"] == 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) # Number of special reporting words lower or equal a certain penalty num_spec_rep_word_le_1 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 1]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_le_2 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 2]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_le_3 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 3]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_le_4 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 4]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) num_spec_rep_word_le_5 = sum([stw_words_rep[stw_words_rep["Penalty"] <= 5]["Word"].str.contains(r'\b{}\b'.format(re.escape(lemma))).any() for lemma in token_lemmata]) for feature in [num_spec_rep_word_0, num_spec_rep_word_1, num_spec_rep_word_2, num_spec_rep_word_3, num_spec_rep_word_4, num_spec_rep_word_5, num_spec_rep_word_le_1, num_spec_rep_word_le_2, num_spec_rep_word_le_3, num_spec_rep_word_le_4, num_spec_rep_word_le_5]: transformed.append(feature) # Reporting word features prev. 
segment for feature in backlog[14:38]: transformed.append(feature) for feature in backlog[50:74]: transformed.append(feature) # Word vectors # Get prototypical word vector for reporting words proto_rep_vec = numpy.average([self.wordvecs[word] for word in stw_words[stw_words["Penalty"] == 0] if word in self.wordvecs], axis=0) # Get prototypical word vector for reported class proto_rep_vec_reporting = numpy.average([self.wordvecs[word] for word in stw_words_rep[stw_words_rep["Penalty"] == 0] if word in self.wordvecs], axis=0) # Append highest similarity values to proto word vectors within the segment max_sim = .0 max_sim_rep = .0 for lemma in token_lemmata: if lemma in self.wordvecs: lemma_vec = self.wordvecs[lemma] # cosine similarity = 1 - cosine distance sim = 1 - distance.cosine(lemma_vec, proto_rep_vec) sim_rep = 1 - distance.cosine(lemma_vec, proto_rep_vec_reporting) if sim > max_sim: max_sim = sim if sim_rep > max_sim_rep: max_sim_rep = sim_rep transformed.append(max_sim) transformed.append(max_sim_rep) # --- Other word features --- # Usage of deictic words can point to character speech - precentage of deictic words transformed.append(len([t for t in token_strings if t in DEICTIC])/len(token_strings)) # Usage of special conjunction at the beginning of the segment can point to indirect transformed.append(int(token_strings[0] in CONJUNCT)) # Usage of modal particles can point towards character speech transformed.append(len([t for t in token_strings if t in MODAL_PART])/len(token_strings)) # Negation? transformed.append(len([lemma for lemma in token_lemmata if lemma in NEG])/len(token_strings)) # Words describing facial expressions, gestures, voice might hint towards STWR transformed.append(int(len([lemma for lemma in token_lemmata if lemma in FACIAL]) > 0)) transformed.append(int(len([lemma for lemma in token_lemmata if lemma in GESTURE]) > 0)) transformed.append(int(len([lemma for lemma in token_lemmata if lemma in VOICE]) > 0)) # The repetition of words can hint towards figural speech transformed.append(int(any([count >= 2 for count in [token_lemmata.count(el) for el in token_lemmata]]))) # --- Sequential features --- if self.sequence_features: # Labels of prev. segment labels_last = [l for i,l in enumerate(backlog[9].split(",")) if i%3==0] transformed.append(int(any([l.startswith('direct') for l in labels_last]))) transformed.append(int(any([l.startswith('indirect') for l in labels_last]))) transformed.append(int(any([l.startswith('free_indirect') for l in labels_last]))) transformed.append(int(any([l.startswith('reported') for l in labels_last]))) # Label appears in 5 prev. 
segments labels_last_5 = [fin_l for ls in [[l for i, l in enumerate(label.split(",")) if i % 3 == 0] for label in backlog[5:10]] for fin_l in ls] transformed.append(int(any([l.startswith('direct') for l in labels_last_5]))) transformed.append(int(any([l.startswith('indirect') for l in labels_last_5]))) transformed.append(int(any([l.startswith('free_indirect') for l in labels_last_5]))) transformed.append(int(any([l.startswith('reported') for l in labels_last_5]))) # How many labels for each class and overall within the last 10 segments labels_last_10 = [fin_l for ls in [[l for i, l in enumerate(label.split(",")) if i % 3 == 0] for label in backlog[0:10]] for fin_l in ls if fin_l != ""] transformed.append(len([l for l in labels_last_10 if l.startswith('direct')])) transformed.append(len([l for l in labels_last_10 if l.startswith('indirect')])) transformed.append(len([l for l in labels_last_10 if l.startswith('free_indirect')])) transformed.append(len([l for l in labels_last_10 if l.startswith('reported')])) transformed.append(len(labels_last_10)) # --- Other features --- # Segment and character lengths transformed.append(len(token_strings)) transformed.append(len(original_text)) # Segment and character lengths of prev. segment transformed.append(backlog[40]) transformed.append(backlog[41]) # Segment and character lengths of this + prev. segment transformed.append(len(token_strings) + backlog[40]) transformed.append(len(original_text) + backlog[41]) # Is this segment at the start or end of a paragraph? paragraph_end = int("<p>" in original_text) transformed.append(paragraph_end) transformed.append(backlog[42]) # --- Update Backlog --- # [0:10] encode labels of previous ten segments -> updated elsewhere # 10: Colon in prev. segment backlog[10] = colon_this # 11: How many open quotes backlog[11] += open_quote if backlog[11] - close_quote >= 0: backlog[11] -= close_quote else: backlog[11] = 0 # 12: Prev. segment ends with close_quote backlog[12] = int(tags[-1] == "#CLOSE_QUOTE#") # 13: Comma at the end of this segment backlog[13] = comma_end # [14:38] reportin word appearance features prev. segment for i, feature in enumerate([has_rep_word_0, has_rep_word_1, has_rep_word_2, has_rep_word_3, has_rep_word_4, has_rep_word_5, has_rep_word_le_1, has_rep_word_le_2, has_rep_word_le_3, has_rep_word_le_4, has_rep_word_le_5, has_rep_word_noun, has_rep_word_verb, has_spec_rep_word_0, has_spec_rep_word_1, has_spec_rep_word_2, has_spec_rep_word_3, has_spec_rep_word_4, has_spec_rep_word_5, has_spec_rep_word_le_1, has_spec_rep_word_le_2, has_spec_rep_word_le_3, has_spec_rep_word_le_4, has_spec_rep_word_le_5 ]): backlog[14 + i] = feature # 38: Candidate speakers as subject backlog[38] = int(len(subj_cand_speaker) > 0) # 39: Percentage of candidate speakers backlog[39] = num_cand_speaker # 40, 41: lengths of prev. segment backlog[40] = len(token_strings) backlog[41] = len(original_text) # 42: paragraph end backlog[42] = paragraph_end # [43:48]: keep track of pronoun person appearances in the 5 prev. segments backlog[43:47] = backlog[44:48] if per3: if per1: backlog[48] = '3_1' else: backlog[48] = '3' elif per1: backlog[48] = '1' else: backlog[48] = '-' # 49: How many contiguous prev. segments have been in quotes? if in_quotes: backlog[49] += 1 else: backlog[49] = 0 # [50:74] reportin word count features prev. 
segment for i, feature in enumerate([num_rep_word_0, num_rep_word_1, num_rep_word_2, num_rep_word_3, num_rep_word_4, num_rep_word_5, num_rep_word_le_1, num_rep_word_le_2, num_rep_word_le_3, num_rep_word_le_4, num_rep_word_le_5, num_rep_word_noun, num_rep_word_verb, num_spec_rep_word_0, num_spec_rep_word_1, num_spec_rep_word_2, num_spec_rep_word_3, num_spec_rep_word_4, num_spec_rep_word_5, num_spec_rep_word_le_1, num_spec_rep_word_le_2, num_spec_rep_word_le_3, num_spec_rep_word_le_4, num_spec_rep_word_le_5 ]): backlog[50 + i] = feature return transformed, backlog
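# --- Added illustration (hypothetical data, not from the original project; the external
# resources set up in __init__, such as RFTagger and the word vectors, must be in place) ---
# Sketch of how the extractor above might be driven over a segment DataFrame; for DataFrame
# input, transform() returns one feature row per segment plus the updated backlog.
extractor = STWRFeatureExtractor(sequence_features=False)
segments = pd.DataFrame({"text": ["Er sagte, dass er später komme.", "Sie nickte nur."],
                         "labels_spans": ["", ""]})
features, backlog = extractor.transform(segments)
print(features.shape)   # (number of segments, number of features)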
def postprocess_spans(row, cl=None): """ Method for better span detection as a postprocessing step after STWR classification. :param row: Each row consists of a label (format:"direct_speech,2,10") and a text. :param cl: label of the positive class instances. :return: The updated label """ label = row.values[0] # Only do postprocessing for detected instances if label == "": return label text = row.values[1] doc = NLP(text) tokens = [token for token in doc] # Get lemmata with germalemma as spacy is not good at this, only possible for pos tags N, V, ADJ, ADV token_lemmata = [] lemmatizer = GermaLemma() for token in tokens: if token.pos_ == "VERB": token_lemmata.append(lemmatizer.find_lemma(token.text, 'V')) elif token.pos_ == "NOUN": token_lemmata.append(lemmatizer.find_lemma(token.text, 'N')) elif token.pos_ in ["ADJ", "ADV"]: token_lemmata.append(lemmatizer.find_lemma(token.text, token.pos_)) else: token_lemmata.append(token.text) # Prepare information only_opening_quotes = [ qu for qu in QUOTATION_MARKS.keys() if qu != QUOTATION_MARKS[qu] ] only_closing_quotes = [ QUOTATION_MARKS[qu] for qu in QUOTATION_MARKS.keys() if qu != QUOTATION_MARKS[qu] ] # Do not treat apostrophes as possible quotation marks -> too risky both_quotes = [ qu for qu in QUOTATION_MARKS.keys() if qu == QUOTATION_MARKS[qu] and qu != '\u0027' ] # Find quotation marks that can either be an opening or a closing quote but that don't have the same form as their counter part both = [qu for qu in only_opening_quotes if qu in only_closing_quotes] only_opening_quotes = [qu for qu in only_opening_quotes if qu not in both] only_opening_quotes = [qu for qu in only_opening_quotes if qu not in both] both_quotes = both_quotes + both # Load reporting word list stw_words_all = pd.read_excel("data/stw_words/stw_words_brunner2015.xls") # Only use words with penalty value up tp 3 stw_words_all = stw_words_all[stw_words_all['Penalty'] <= 3] # Some words are only usable for reported class stw_words = stw_words_all[stw_words_all['Marker'] != 'rep'] spans = [] if cl == 'direct': # Search for quotation marks and try to decide whether they signify quoted STWR. Use conservative heuristics. 
for token in tokens: # Mark different candidates for quotation marks if token.text in only_opening_quotes: token.tag_ = "ONLY_OPENING_QUOTE" elif token.text in only_closing_quotes: token.tag_ = "ONLY_CLOSING_QUOTE" elif token.text in both_quotes: token.tag_ = "BOTH_QUOTES" stack = [] for idx, token in enumerate(tokens): if token.tag_ == "ONLY_OPENING_QUOTE": stack.append((idx, token.text, token.tag_)) elif token.tag_ in ["ONLY_CLOSING_QUOTE", "BOTH_QUOTES"]: # Check whether there is a matching opening quote on the stack found = False for i in range(len(stack) - 1, -1, -1): top = stack[i] if QUOTATION_MARKS[top[1]] == token.text: found = True # Closing quotes are usually preceded by sentence ending punctuation if tokens[idx - 1].tag_ == '$.': spans.append((top[0], idx)) stack = stack[:i] break if not found: # If no opening quotes were found and clear closing quotes are preceded by sentence ending punctuation, # assume everything before is quoted if token.tag_ == "ONLY_CLOSING_QUOTE" and idx > 0 and tokens[ idx - 1].tag_ == '$.': spans.append((0, idx)) # If ambiguous quotation mark is found, decide whether it's opening or closing elif token.tag_ == "BOTH_QUOTES": if idx > 0 and tokens[idx - 1].tag_ == '$.': spans.append((0, idx)) else: stack.append((idx, token.text, token.tag_)) # Check for open quotes in the stack if len(stack) > 0: # Choose first open quote in stack # Opening quotes are usually followed by capital letters (except continuing quotations, these are ignored here) opening = stack[0] if opening[0] < len(tokens) - 2: if tokens[opening[0] + 1].text.istitle(): spans.append((opening[0], len(tokens) - 1)) # In case no quotation marks are there, look for colon if len(spans) == 0: for idx, token in enumerate(tokens): if ":" == token.text: spans.append((idx, len(tokens) - 1)) elif cl == 'indirect': # Following A.B.s directions for annotating indirect representations # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.) # Pattern 1: verbal framing phrase + dependent clause - assume max. one of these patterns per segment stw_verb_segment = [ tokens[i] for i, lemma in enumerate(token_lemmata) if not lemma.istitle() and any(stw_words["Word"].str.contains( r'\b{}\b'.format(re.escape(lemma)))) ] # Only use this pattern if there is a clear candidate if len(stw_verb_segment) == 1: verb = stw_verb_segment[0] dependent_clause = get_children(verb, exception=['sb']) start = None end = None for i, token in enumerate(tokens): if token == verb: start = i elif token in dependent_clause: if start != None: end = i if start != None and end != None: spans.append((start, end)) # Pattern 2: nominal phrase includ. 
modificators + dependent clause - several of these patterns per segment are possible stw_noun_segment = [ tokens[i] for i, lemma in enumerate(token_lemmata) if lemma.istitle() and any(stw_words["Word"].str.contains( r'\b{}\b'.format(re.escape(lemma)))) ] for noun in stw_noun_segment: dependent_clause_modif = get_children(noun, exception=[]) all_tokens = dependent_clause_modif + [noun] start = None end = None for i, token in enumerate(tokens): if token in all_tokens: if start == None: start = i else: end = i if start != None and end != None: spans.append((start, end)) # Merge spans merged_spans = [] if len(spans) > 1: for i, span in enumerate(spans): for other in spans: if other == span: continue else: if span[0] >= other[0] and span[1] <= other[1]: break else: merged_spans.append(span) spans = merged_spans elif cl == 'free_indirect': # Free indirect instances are almost always complete sentences -> leave as is pass elif cl == 'reported': # „Prinzipiell wird bei erzählter Wiedergabe angestrebt, den ganzen Satz oder Satzteil zu markieren, der eine Sprach-, Denk- oder Schreibhandlung wiedergibt. # – Wenn es möglich ist, mehrere unterschiedliche sprachliche, schriftliche oder gedankliche Handlungen zu identifizieren, so werden diese jeweils einzeln markiert. # – Wenn eine Nominalphrase mit einem Verb verwendet wird, so dass sich im Ganzen eine Sprach-, Denk- oder Schreibhandlung ergibt, # sollte – wie bei indirekter Wiedergabe – die ganze Verbalphrase markiert werden (also Pläne entwerfen, nicht nur Pläne).“ # Following A.B.s directions for annotating reported representations try to annotate the whole clause for reported instances # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.) stw_segment = [ tokens[i] for i, lemma in enumerate(token_lemmata) if any(stw_words_all["Word"].str.contains(r'\b{}\b'.format( re.escape(lemma)))) ] for word in stw_segment: dependent_clause = get_children(word, exception=[]) all_tokens = dependent_clause + [word] start = None end = None for i, token in enumerate(tokens): if token in all_tokens: if start == None: start = i else: end = i if start != None and end != None: spans.append((start, end)) # Don't merge spans as several different reported instance should be labeled separately following A.B.s directions for annotating reported representations # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.) # Get character based spans if len(spans) > 0: labels = [] for span in spans: labels.append("{},{},{}".format( cl, tokens[span[0]].idx, (tokens[span[1]].idx + len(tokens[span[1]].text)))) label = ",".join(labels) return label
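# --- Added illustration (hypothetical row, not from the original project; column order follows
# the docstring above: label first, then text) ---
row = pd.Series({"labels_spans": "indirect,0,31", "text": "Er sagte, dass er später komme."})
print(postprocess_spans(row, cl='indirect'))   # e.g. "indirect,3,31" with the refined span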
def get_mean_and_ci(matches):
    # NOTE: the original function header is not part of this excerpt; this reconstruction assumes
    # a boolean match column and a 95% normal-approximation confidence interval around its mean.
    match_mean = matches.mean()
    match_se = matches.std() / len(matches) ** 0.5
    ci_upper = match_mean + 1.96 * match_se
    ci_lower = match_mean - 1.96 * match_se
    return match_mean * 100, ci_lower * 100, ci_upper * 100


print("loading data...")
eval_df = pd.read_csv('eval_table/eval_table_lemmata.csv')
eval_df = eval_df.loc[~eval_df.lemma.isna(), :]
print('loaded %d rows' % len(eval_df))

lemmatizer = GermaLemma()
eval_df['germalemma'] = eval_df.apply(lambda row: lemmatizer.find_lemma(row[3], row[2]), axis=1)
eval_df['match'] = eval_df.lemma == eval_df.germalemma
eval_df.head()

print('wrong lemmata:')
print(eval_df.loc[~eval_df.match, ['token', 'pos', 'lemma', 'germalemma']])

match_mean, ci_lower, ci_upper = get_mean_and_ci(eval_df.match)
print('Success rate for germalemma: %.2f%% (95%% CI: [%.2f%%, %.2f%%])' % (match_mean, ci_lower, ci_upper))

eval_df['pattern'] = eval_df.apply(lambda row: lemma_via_patternlib(row[3], row[2]), axis=1)
eval_df['match_pattern'] = eval_df.lemma == eval_df.pattern
eval_df.head()
df_articles['Nouns'] = df_articles['Nouns'].apply(
    lambda x: [word for word in x if len(word) > 1])  # keep only tokens longer than one character
# df_articles['Nounverbs'] = df_articles['Nounverbs'].apply(lambda x: [word for word in x if len(word) > 1])

# Lemmatization
lemmatizer = GermaLemma()

# Lemmatization of Nouns
noun_list = df_articles['Nouns'].tolist()
global noun_lemma_list
noun_lemma_list = []
for doc in noun_list:
    noun_lemma_list.append([])
    for token in doc:
        token_lemma = lemmatizer.find_lemma(token.text, token.tag_)
        token_lemma = token_lemma.lower()
        noun_lemma_list[-1].append(token_lemma)

# Save to helper df
df_help_noun_lemma_list = pandas.DataFrame({'x': noun_lemma_list})

# Create increasing id (needed to merge back onto the original df)
df_help_noun_lemma_list.insert(0, 'ID_incr', range(1, 1 + len(df_help_noun_lemma_list)))

# Merge df_help_noun_lemma_list onto df_articles and rename
df_articles = (df_articles.merge(
    df_help_noun_lemma_list, left_on='ID_incr',
    right_on='ID_incr')).rename(columns={'x': 'Nouns_lemma'})
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from germalemma import GermaLemma
import pickle
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger

lemmatizer = GermaLemma()

# Load the pre-trained POS tagger and tag a few example tokens
with open('data/pos.pickle', 'rb') as f:
    tagger = pickle.load(f)

pos = tagger.tag(['Jungen', u'Wände', u'Wänden'])
print(pos)

# passing the word and the POS tag ("N" for noun)
for item in pos:
    w, p = item
    print(lemmatizer.find_lemma(w, p))

# lemma = lemmatizer.find_lemma(u'Jungen', u'N')
# print(lemma)
# build lemmatizer with tokens_a
lemmata = defaultdict(dict)
lemmata_lower = defaultdict(dict)
for token, lemma, pos in tokens_a:
    GermaLemma.add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos)
lemmatizer = GermaLemma(lemmata=lemmata, lemmata_lower=lemmata_lower)

# test lemmatizer with tokens_b
n_success = 0
for token, true_lemma, pos in tokens_b:
    found_lemma = lemmatizer.find_lemma(token, pos)
    if found_lemma == true_lemma:
        n_success += 1
    elif found_lemma != token and token not in known_incorrect_lemmata_tokens:
        incorrect_lemmata.append((token, found_lemma, true_lemma))
        known_incorrect_lemmata_tokens |= {token}

n_all = len(tokens_b)
pct_success = n_success / n_all * 100
print('%d / %d = %.2f%%' % (n_success, n_all, pct_success))
pct_success_all_trials.append(pct_success)

print('')
print('success rate germalemma:')
print('%.2f%%' % (sum(pct_success_all_trials) / len(pct_success_all_trials)))
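# --- Added illustration (made-up data, not from the original project) ---
# Miniature version of the pattern above: build a GermaLemma instance from custom
# (token, lemma, POS) triples instead of the bundled TIGER data, then query it.
from collections import defaultdict
from germalemma import GermaLemma

custom = [('Hunde', 'Hund', 'N'), ('liefen', 'laufen', 'V')]
lemmata, lemmata_lower = defaultdict(dict), defaultdict(dict)
for token, lemma, pos in custom:
    GermaLemma.add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos)
lemmatizer = GermaLemma(lemmata=lemmata, lemmata_lower=lemmata_lower)
print(lemmatizer.find_lemma('Hunde', 'N'))   # should print 'Hund'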
def comment_to_topic(comment):
    # load and define stuff
    lemmatizer = GermaLemma()
    remove = [
        line.rstrip('\n')
        for line in open('reviews/add-stopwords.txt', encoding="utf-8")
    ]
    stop = stopwords.words('german')
    exclude_words = remove + stop
    exclude = {
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
        '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~'
    }
    with open('reviews/nltk_german_classifier_data.pickle', 'rb') as f:
        tagger = pickle.load(f)

    # sentence splitting
    comment = nltk.sent_tokenize(comment)
    lemmas = []
    for j in range(len(comment)):
        # tokenization
        comment[j] = nltk.word_tokenize(comment[j])
        # punctuation removal
        comment[j] = [
            token for token in comment[j]
            if token not in exclude and token.isalpha()
        ]
        # POS tagging
        comment[j] = tagger.tag(comment[j])
        # lemmatization
        for k in range(len(comment[j])):
            try:
                lemmas.append(
                    lemmatizer.find_lemma(comment[j][k][0], comment[j][k][1]))
            except ValueError:
                pass
    # lowercase
    lemmas = [word.lower() for word in lemmas]
    # stopword removal
    topics = [word for word in lemmas if word not in exclude_words]
    # make topics html-safe
    topics_safe = [
        t.replace('ä', 'ae').replace('ü', 'ue').replace('ö', 'oe').replace('ß', 'ss')
        for t in topics
    ]
    return topics, topics_safe
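# --- Added illustration (hypothetical call, not from the original project; the stopword file and
# the pickled tagger are assumed to exist at the paths used above) ---
topics, topics_safe = comment_to_topic("Die Lieferung war schnell und die Verpackung hübsch.")
print(topics)        # e.g. ['lieferung', 'schnell', 'verpackung', 'hübsch']
print(topics_safe)   # e.g. ['lieferung', 'schnell', 'verpackung', 'huebsch']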
class SentimentDetector: def __init__(self, path: str = "src/data/", windowSize=5) -> None: self.path = path self.windowSize = windowSize self.df_aspect_tokens = None self.df_preprocessed = None self.df_lexicon = None self.lemmatizer = GermaLemma() def downloadLexicon( self, filename: str = "sentiment_lexicon.csv", url: str = "https://raw.githubusercontent.com/sebastiansauer/pradadata/master/data-raw/germanlex.csv", chunk_size: int = 1024, ) -> None: """ Download sentiment lexicon. Args: filename (str, optional): Defaults to "sentimentLexicon.csv". url (str, optional): Defaults to "https://raw.githubusercontent.com/sebastiansauer/pradadata/master/data-raw/germanlex.csv". chunk_size (int, optional): Defines chunk size for downloads of bigger files. Defaults to 128. """ r = requests.get(url, stream=True) file_size = int(r.headers.get("Content-Length", None)) num_bars = NP.ceil(file_size / (chunk_size)) downloadProgress = tqdm(total=num_bars, desc="Downloading Lexicon...", unit="B", unit_scale=True) with open(self.path + filename, "wb") as fd: for chunk in r.iter_content(chunk_size=chunk_size): downloadProgress.update(len(chunk)) fd.write(chunk) downloadProgress.close() def loadCSVs( self, tokenFilename: str = "data_aspects_tokens.csv", preprocessedFilename: str = "data_preprocessed.csv", lexiconFilename: str = "sentiment_lexicon.csv", ) -> bool: """ load all necessary CSV for execution of the detector and set indices as appropriate Args: tokenFilename (str, optional): Defaults to "data_aspects_tokens.csv". preprocessedFilename (str, optional): Defaults to "data_preprocessed.csv". lexiconFilename (str, optional): Defaults to "sentiment_lexicon.csv". Returns: bool: successful execution """ try: if self.df_aspect_tokens is None or self.df_aspect_tokens.empty: self.df_aspect_tokens = PD.read_csv(self.path + tokenFilename) self.df_aspect_tokens["polarity_strength"] = PD.NaT self.df_aspect_tokens["polarity_strength"].fillna( {i: [] for i in self.df_aspect_tokens.index}, inplace=True) self.df_aspect_tokens["sentiment_words"] = PD.NaT self.df_aspect_tokens["sentiment_words"].fillna( {i: [] for i in self.df_aspect_tokens.index}, inplace=True) self.df_aspect_tokens["intensifier_words"] = PD.NaT self.df_aspect_tokens["intensifier_words"].fillna( {i: [] for i in self.df_aspect_tokens.index}, inplace=True) self.df_aspect_tokens["word_found"] = self.df_aspect_tokens[ "word_found"].str.replace(r"[^\w]*", "", regex=True) # TODO remove after debugging # self.df_aspect_tokens = self.df_aspect_tokens[:100] if self.df_preprocessed is None or self.df_preprocessed.empty: self.df_preprocessed = PD.read_csv(self.path + preprocessedFilename) # pandas read_csv does not read arrays correctly so we need to adjust those tqdm.pandas(desc="Applying Datatype Transformations....") self.df_preprocessed["tokens"] = self.df_preprocessed[ "tokens"].progress_apply(lambda x: json.loads(x)) if self.df_lexicon is None or self.df_lexicon.empty: if not os.path.exists(self.path + lexiconFilename): self.downloadLexicon() self.df_lexicon = PD.read_csv(self.path + lexiconFilename) self.df_lexicon.drop_duplicates(subset=["word", "qualifier"], inplace=True) self.df_lexicon.set_index("word", inplace=True) self.df_lexicon.drop("%%") return True except IOError as e: print(e) return False def loadSpacyModel( self, model: str = "de_core_news_lg", disableList: list[str] = ["ner", "textcat"], ) -> bool: """ load the spacy model with required modes Args: model (str, optional): name of the mode. Defaults to "de_core_news_sm". 
disableList (list[str], optional): list of things to be disabled. Defaults to ["tagger", "parser", "ner"]. """ try: self.nlp = spacy.load(model, disable=disableList) return True except OSError: print("Model not found. Attempting to download..") try: spacy.cli.download(model) except Exception as e: print(e) return False self.nlp = spacy.load(model, disable=disableList) return True def checkValidChild(self, child, childType: ChildType) -> bool: if childType == ChildType.DESCRIPTOR: if (child.tag_ == "ADJA" and child.pos_ == "ADJ") or (child.pos_ == "ADV" and child.tag_ == "ADJD"): return True return False elif childType == ChildType.INTENSIFIER: if child.pos_ == "ADJ" or child.pos_ == "ADV": return True return False else: print("Wrong childType.") return False def checkPolarityAdjective(self, child, rowIdx) -> float: """ check if the given word has an entry in the sentiment lexicon and return given polarity strength Args: child (spacy.Token): tokenized word with tagged 'pos_' and 'text' Returns: pol_strength (float): polarity_strength of given word found in sentiment lexicon """ child_normalized = child.text.replace(r"[^\w]*", "") lexEntry = self.checkLexicon(child_normalized) if lexEntry is None: lexEntry = self.checkLexicon(child_normalized.lower()) if lexEntry is None: lemma = self.lemmatizer.find_lemma(child_normalized, child.pos_) lexEntry = self.checkLexicon(lemma) if lexEntry is None: return 1 if type(lexEntry["qualifier"]) == str: pol_strength = lexEntry["polarity_strength"] if lexEntry["qualifier"] == "NEG": return -pol_strength return pol_strength else: for i, qualifier in enumerate(lexEntry["qualifier"].values): if qualifier == "POS": return lexEntry["polarity_strength"][i] if qualifier == "NEG": return -lexEntry["polarity_strength"][i] return 0 def checkLexicon(self, word) -> PD.Series: """ Check for valid lexicon entries return None if not found Args: word (str): word to be use as key Returns: PD.Series: Series that is found for the given key or None """ try: return self.df_lexicon.loc[word] except KeyError: return None def checkForIntensifier(self, child, rowIdx) -> float: """ For a given spacy.Token (child) check if any of the children is an intensifier and if so, return their polarity_strength Args: child (spacy.Token): tokenized word with tagged 'pos_' and 'text' Returns: polarity_multiplier (float): polarity_multiplier of found intensifier word """ child_normalized = child.text.replace(r"[^\w]*", "") # catch words that are not in the sentiment lexicon lexEntry = self.checkLexicon(child_normalized) if lexEntry is None: lexEntry = self.checkLexicon(child_normalized.lower()) if lexEntry is None: lemma = self.lemmatizer.find_lemma(child_normalized, child.pos_) lexEntry = self.checkLexicon(lemma) if lexEntry is None: return 1 if type(lexEntry["qualifier"]) == str: if lexEntry["qualifier"] == "INT": self.df_aspect_tokens["intensifier_words"][rowIdx].append( child.text) return lexEntry["polarity_strength"] elif lexEntry["qualifier"] == "SHI": self.df_aspect_tokens["intensifier_words"][rowIdx].append( child.text) return -1 else: return 1 else: for i, qualifier in enumerate(lexEntry["qualifier"].values): # TODO currently the first qualifier found is taken, without considering which the most fitting one is if qualifier == "INT": self.df_aspect_tokens["intensifier_words"][rowIdx].append( child.text) return lexEntry["polarity_strength"][i] elif qualifier == "SHI": self.df_aspect_tokens["intensifier_words"][rowIdx].append( child.text) return -1 return 1 def 
calcTotalPolarityStrength(self, child, rowIdx) -> float: """ Calculate the total polarity for a given word Args: child (spacy.Token): the tokenized word with tagged 'pos_' and 'text' Returns: polarity_strength (float): the calculated polarity for the given word (child) """ # lemma = self.lemmatizer.find_lemma(child.text, child.pos_) polarity_strength = self.checkPolarityAdjective(child, rowIdx) # find intensifier in children and multiply their strength to the polarity for c in child.children: if self.checkValidChild(c, ChildType.INTENSIFIER): polarity_strength *= self.checkForIntensifier(c, rowIdx) return polarity_strength def detectSentiment(self, rowDF: PD.Series) -> None: """ Function to start the other relevent functions Args: rowDF (PD.Series): row of the Dataframe """ doc = self.nlp(" ".join(self.df_preprocessed.iloc[ rowDF["reviewnumber"]]["tokens"][rowDF["sent_idx"]])) for child in doc[rowDF["word_idx"]].children: # if child.tag_ == "ADJA": if self.checkValidChild(child, ChildType.DESCRIPTOR): pol_strength = self.calcTotalPolarityStrength( child, rowDF.name) self.df_aspect_tokens["polarity_strength"][rowDF.name].append( pol_strength) self.df_aspect_tokens["sentiment_words"][rowDF.name].append( child.text) return for token in doc[rowDF["word_idx"]].ancestors: if token.pos_ == "AUX" or token.pos_ == "VERB": for child in token.children: if self.checkValidChild(child, ChildType.DESCRIPTOR): pol_strength = self.calcTotalPolarityStrength( child, rowDF.name) self.df_aspect_tokens["polarity_strength"][ rowDF.name].append(pol_strength) self.df_aspect_tokens["sentiment_words"][ rowDF.name].append(child.text) return def convert_polarity(self, qualifier, polarity): sentiment_polarity = [] for i, elem in enumerate(qualifier): if elem == "NEG": sentiment_polarity.append(polarity[i] * -1) else: sentiment_polarity.append(polarity[i]) sentiment_polarity = NP.mean(NP.array(sentiment_polarity)) return sentiment_polarity def createReadableOutput(self, rowDF): appenddict = { "review_number": rowDF["reviewnumber"], "sentiment": self.convert_polarity(rowDF["qualifier"], rowDF["polarity_strength"]), } self.overall_sentiment = self.overall_sentiment.append( appenddict, ignore_index=True) def returnSentimentsforReviews(self) -> PD.DataFrame: self.overall_sentiment = PD.DataFrame( columns=["review_text", "sentiment"]) tqdm.pandas(desc="Calculating Sentiments") self.df_aspect_tokens.progress_apply( lambda x: self.createReadableOutput(x), axis=1) self.overall_sentiment = (self.overall_sentiment.groupby( "review_number").mean().reset_index()) # print(self.overall_sentiment) self.overall_sentiment["review_text"] = self.df_preprocessed[ "text_normalized"][self.overall_sentiment["review_number"].astype( int).tolist()].tolist() return self.overall_sentiment def run(self) -> bool: """ run all basic functions of the detector Returns: bool: successful execution of command """ if not self.loadCSVs(): print("Couldn't load CSV's.") return False if not self.loadSpacyModel(): return true_labels = list() for index, row in self.df_aspect_tokens.iterrows(): true_labels.append(self.df_preprocessed.iloc[row["reviewnumber"]][ self.df_aspect_tokens.iloc[index]["aspect"]]) self.df_aspect_tokens["true_label"] = true_labels tqdm.pandas(desc="Looking up Sentiments...") self.df_aspect_tokens.progress_apply(lambda x: self.detectSentiment(x), axis=1) def saveCSV(self, filename: str = "data_aspects_tokens.csv"): self.df_aspect_tokens["sentiment_words"] = self.df_aspect_tokens[ "sentiment_words"].apply(lambda x: json.dumps(x)) 
self.df_aspect_tokens.to_csv(self.path + filename, index=False)
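# A minimal, self-contained sketch of the descriptor/intensifier logic used in the
# detector above, with a toy lexicon standing in for the SentiWS-style DataFrame
# (qualifiers "POS"/"NEG"/"INT"/"SHI" plus a polarity strength). The spaCy model
# name and the example lexicon entries are assumptions for illustration only,
# not part of the original pipeline.
import spacy
from germalemma import GermaLemma

nlp = spacy.load("de_core_news_sm")  # assumed German model
lemmatizer = GermaLemma()

# toy lexicon: lemma -> (qualifier, polarity_strength)
TOY_LEXICON = {
    "schön": ("POS", 0.7),
    "schlecht": ("NEG", 0.8),
    "sehr": ("INT", 1.5),
    "nicht": ("SHI", -1.0),
}

def polarity_of(token):
    """Look up a token's polarity, falling back to its GermaLemma lemma."""
    entry = TOY_LEXICON.get(token.text.lower())
    if entry is None:
        try:
            entry = TOY_LEXICON.get(lemmatizer.find_lemma(token.text, token.pos_))
        except ValueError:
            entry = None
    if entry is None:
        return None
    qualifier, strength = entry
    return -strength if qualifier == "NEG" else strength

def total_polarity(descriptor):
    """Descriptor polarity multiplied by the strength of intensifying children."""
    polarity = polarity_of(descriptor) or 1.0
    for child in descriptor.children:
        if child.pos_ in ("ADJ", "ADV"):
            entry = TOY_LEXICON.get(child.text.lower())
            if entry and entry[0] in ("INT", "SHI"):
                polarity *= entry[1] if entry[0] == "INT" else -1
    return polarity

doc = nlp("Das sehr schöne Wetter gefällt mir.")
for token in doc:
    if token.pos_ == "ADJ":
        print(token.text, total_polarity(token))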
def main():
    # train
    if os.path.exists('./resources/nltk_german_classifier_data.pickle'):
        with open('./resources/nltk_german_classifier_data.pickle', 'rb') as f:
            print('./resources/nltk_german_classifier_data.pickle found')
            tagger = pickle.load(f)
    else:
        print('could not find ./resources/nltk_german_classifier_data.pickle: training: IN PROGRESS')
        tagger = train()
        with open('./resources/nltk_german_classifier_data.pickle', 'wb') as f:
            pickle.dump(tagger, f, protocol=2)
        print('training FINISHED')

    # tokenize
    if os.path.exists('./data/1.pickle'):
        with open('./data/1.pickle', 'rb') as f:
            print('1.pickle found')
            words = pickle.load(f)
    else:
        print('could not find 1.pickle: tokenizing: IN PROGRESS')
        document = open('./resources/logik-band-eins.txt').read()
        tok = Tokenizer()
        tokens = tok.tokenize(document)
        # keep at most the first 10000 tokens that are longer than one
        # character and not purely numeric
        words = []
        for token in tokens:
            if len(words) >= 10000:
                break
            v = token.value
            if len(v) > 1 and not v.isdigit():
                words.append(v)
        with open('./data/1.pickle', 'wb') as f:
            pickle.dump(words, f, protocol=2)
        print('tokenizing FINISHED')

    # tag
    if os.path.exists('./data/2.pickle'):
        with open('./data/2.pickle', 'rb') as f:
            print('2.pickle found')
            tagged_words = pickle.load(f)
    else:
        print('could not find 2.pickle: tagging: IN PROGRESS')
        tagged_words = tagger.tag(words)
        with open('./data/2.pickle', 'wb') as f:
            pickle.dump(tagged_words, f, protocol=2)

    # filter-in adjectives, nouns and verbs
    if os.path.exists('./data/3.pickle'):
        with open('./data/3.pickle', 'rb') as f:
            print('3.pickle found')
            filtered_words = pickle.load(f)
    else:
        print('could not find 3.pickle: filtering: IN PROGRESS')
        parts_of_speech = ['ADJA', 'ADJD', 'NN']
        filtered_words = list(
            filter(
                lambda word: word[1][0] == 'V' or word[1] in parts_of_speech,
                tagged_words))
        with open('./data/3.pickle', 'wb') as f:
            pickle.dump(filtered_words, f, protocol=2)

    # lemmatize
    if os.path.exists('./data/4.pickle'):
        with open('./data/4.pickle', 'rb') as f:
            print('4.pickle found')
            lemmatized_words = pickle.load(f)
    else:
        print('could not find 4.pickle: lemmatization: IN PROGRESS')
        lemmatizer = GermaLemma()
        lemmatized_words = []
        for word, pos in filtered_words:
            try:
                lemmatized_words.append(lemmatizer.find_lemma(word, pos))
            except ValueError:
                print(f"EXCEPT: {word} {pos}")
                continue
        with open('./data/4.pickle', 'wb') as f:
            pickle.dump(lemmatized_words, f, protocol=2)

    # filter-out modal words and print the 30 most common lemmata
    with open('./resources/modal-words.txt', 'r') as f:
        modal_words = f.read().splitlines()[:1000]
    non_modals = [item for item in lemmatized_words if item not in modal_words]
    for lemma, count in Counter(non_modals).most_common(30):
        print(lemma + " " + str(count))
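# Each stage of main() above repeats the same cache pattern: load a pickle if it
# exists, otherwise compute the value and dump it. A small (hypothetical) helper
# like the following could express that pattern once; the stage labels and paths
# are only illustrative.
import os
import pickle

def cached_stage(path, compute, label="stage"):
    """Return pickled results from `path` if present, else run `compute()` and cache them."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            print(f'{path} found')
            return pickle.load(f)
    print(f'could not find {path}: {label}: IN PROGRESS')
    result = compute()
    with open(path, 'wb') as f:
        pickle.dump(result, f, protocol=2)
    print(f'{label} FINISHED')
    return result

# e.g. tagged_words = cached_stage('./data/2.pickle', lambda: tagger.tag(words), 'tagging')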
class _PreprocWorker(mp.Process): def __init__(self, worker_id, docs, language, tasks_queue, results_queue, tokenizer, stemmer, lemmata_dict, pos_tagger, group=None, target=None, name=None, args=(), kwargs=None): super(_PreprocWorker, self).__init__(group, target, name, args, kwargs or {}) logger.debug('worker `%s`: init with worker ID %d' % (name, worker_id)) logger.debug('worker `%s`: docs = %s' % (name, str(set(docs.keys())))) self.worker_id = worker_id self.docs = docs self.language = language self.tasks_queue = tasks_queue self.results_queue = results_queue # set a tokenizer self.tokenizer = tokenizer # tokenizer instance (must have a callable attribute `tokenize` with a document # text as argument) # set a stemmer self.stemmer = stemmer # stemmer instance (must have a callable attribute `stem`) # set a POS tagger self.pos_tagger = pos_tagger # POS tagger instance (must have a callable attribute `tag`) self.lemmata_dict = lemmata_dict self.pattern_module = None # dynamically loaded CLiPS pattern library module self.germalemma = None # GermaLemma instance self.wordnet_lemmatizer = None # nltk.stem.WordNetLemmatizer instance self._tokens = { } # tokens for this worker at the current processing stage. dict with document label -> tokens list self._ngrams = {} # generated ngrams #self._filtered = False self._orig_tokens = None # original (unfiltered) tokens, when filtering is currently applied def run(self): logger.debug('worker `%s`: run' % self.name) for next_task, task_kwargs in iter(self.tasks_queue.get, None): logger.debug('worker `%s`: received task `%s`' % (self.name, next_task)) exec_task_fn = getattr(self, '_task_' + next_task) if exec_task_fn: exec_task_fn(**task_kwargs) else: raise NotImplementedError("Task not implemented: `%s`" % next_task) self.tasks_queue.task_done() logger.debug('worker `%s`: shutting down' % self.name) self.tasks_queue.task_done() def _put_items_in_results_queue(self, container): if container: logger.debug('worker `%s`: putting %d results in queue' % (self.name, len(container))) for pair in container.items(): self.results_queue.put(pair) else: # we *have* to put something in the result queue -> signal that we return "nothing" logger.debug('worker `%s`: putting None in results queue' % self.name) self.results_queue.put(None) def _task_get_tokens(self): self._put_items_in_results_queue(self._tokens) def _task_get_tokens_with_worker_id(self): self.results_queue.put((self.worker_id, self._tokens)) def _task_get_ngrams(self): self._put_items_in_results_queue(self._ngrams) def _task_get_ngrams_with_worker_id(self): self.results_queue.put((self.worker_id, self._ngrams)) def _task_get_vocab_doc_freq(self): counts = Counter() for dt in self._tokens.values(): counts.update(set(ith_column(dt))) self.results_queue.put(counts) def _task_get_state(self): logger.debug('worker `%s`: getting state' % self.name) state_attrs = ('docs', 'language', '_tokens', '_ngrams', '_orig_tokens') state = {attr: getattr(self, attr) for attr in state_attrs} logger.debug('worker `%s`: got state with %d items' % (self.name, len(state))) self.results_queue.put(state) def _task_set_tokens(self, tokens): logger.debug('worker `%s`: setting tokens' % self.name) self._tokens = tokens def _task_set_ngrams(self, ngrams): logger.debug('worker `%s`: setting ngrams' % self.name) self._ngrams = ngrams def _task_set_state(self, **state): logger.debug('worker `%s`: setting state' % self.name) for attr, val in state.items(): setattr(self, attr, val) def _task_tokenize(self): self._tokens = { dl: 
tuplize(self.tokenizer.tokenize(txt)) for dl, txt in self.docs.items() } def _task_generate_ngrams(self, n, join=True, join_str=' '): self._ngrams = { dl: create_ngrams(ith_column(dt), n=n, join=join, join_str=join_str) for dl, dt in self._tokens.items() } def _task_use_ngrams_as_tokens(self, join=False, join_str=' '): if join: new_tok = { dl: tuplize([join_str.join(g_tuple) for g_tuple in dg]) for dl, dg in self._ngrams.items() } else: new_tok = {dl: tuplize(dg) for dl, dg in self._ngrams.items()} self._tokens = new_tok def _task_transform_tokens(self, transform_fn): self._tokens = { dl: apply_to_mat_column(dt, 0, transform_fn) if dt else [] for dl, dt in self._tokens.items() } def _task_stem(self): self._tokens = { dl: apply_to_mat_column(dt, 0, lambda t: self.stemmer.stem(t)) if dt else [] for dl, dt in self._tokens.items() } def _task_pos_tag(self): self._tokens = { dl: apply_to_mat_column( dt, 0, self.pos_tagger.tag, map_func=False, expand=True) if dt else [] for dl, dt in self._tokens.items() } def _task_lemmatize(self, pos_tagset, use_dict=False, use_patternlib=False, use_germalemma=None): tmp_lemmata = defaultdict(list) if use_germalemma is None and self.language == 'german': use_germalemma = True if use_germalemma: if not self.germalemma: self.germalemma = GermaLemma() for dl, tok_tags in self._tokens.items(): for t, pos in tok_tags: try: l = self.germalemma.find_lemma(t, pos) except ValueError: l = t tmp_lemmata[dl].append(l) else: if use_dict and self.lemmata_dict: for dl, tok_tags in self._tokens.items(): for t, pos in tok_tags: pos = simplified_pos(pos, tagset=pos_tagset) if pos: l = self.lemmata_dict.get(pos, {}).get(t, None) if l == '-' or l == '': l = None else: l = None tmp_lemmata[dl].append(l) if use_patternlib: if not self.pattern_module: if self.language not in PATTERN_SUBMODULES: raise ValueError( "no CLiPS pattern module for this language:", self.language) modname = 'pattern.%s' % PATTERN_SUBMODULES[self.language] self.pattern_module = import_module(modname) for dl, tok_tags in self._tokens.items(): tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags)) lemmata_final = [] for (t, pos), t_found in zip(tok_tags, tok_lemmata): l = t_found if l is None: if pos.startswith('NP'): # singularize noun l = self.pattern_module.singularize(t) elif pos.startswith('V'): # get infinitive of verb l = self.pattern_module.conjugate( t, self.pattern_module.INFINITIVE) elif pos.startswith('ADJ') or pos.startswith( 'ADV' ): # get baseform of adjective or adverb l = self.pattern_module.predicative(t) lemmata_final.append(l) tmp_lemmata[dl] = lemmata_final if len(tmp_lemmata) == 0: if not self.wordnet_lemmatizer: self.wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() for dl, tok_tags in self._tokens.items(): for t, pos in tok_tags: wn_pos = pos_tag_convert_penn_to_wn(pos) if wn_pos: l = self.wordnet_lemmatizer.lemmatize(t, wn_pos) else: l = t tmp_lemmata[dl].append(l) # merge lemmatized_tokens = {} for dl, tok_tags in self._tokens.items(): tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags)) new_tok_tags = [(l or t, pos) for (t, pos), l in zip(tok_tags, tok_lemmata)] assert len(new_tok_tags) == len(tok_tags) lemmatized_tokens[dl] = new_tok_tags assert len(lemmatized_tokens) == len(self._tokens) self._tokens = lemmatized_tokens def _task_expand_compound_tokens(self, split_chars=('-', ), split_on_len=2, split_on_casechange=False): tmp_tokens = {} for dl, dt in self._tokens.items(): nested = [ expand_compound_token(tup[0], split_chars, split_on_len, split_on_casechange) for tup in dt 
] tmp_tokens[dl] = tuplize(flatten_list(nested)) self._tokens = tmp_tokens def _task_remove_special_chars_in_tokens(self, special_chars): self._tokens = { dl: apply_to_mat_column( dt, 0, lambda x: remove_special_chars_in_tokens(x, special_chars), map_func=False) if dt else [] for dl, dt in self._tokens.items() } def _task_clean_tokens(self, tokens_to_remove, save_orig_tokens=False, remove_shorter_than=None, remove_longer_than=None, remove_numbers=False): if save_orig_tokens: self._save_orig_tokens() if remove_shorter_than is not None: self._tokens = { dl: [t for t in dt if len(t[0]) >= remove_shorter_than] for dl, dt in self._tokens.items() } if remove_longer_than is not None: self._tokens = { dl: [t for t in dt if len(t[0]) <= remove_longer_than] for dl, dt in self._tokens.items() } if remove_numbers: self._tokens = { dl: [t for t in dt if not t[0].isnumeric()] for dl, dt in self._tokens.items() } if type( tokens_to_remove ) is not set: # using a set is much faster than other sequence types for "in" tests tokens_to_remove = set(tokens_to_remove) self._tokens = { dl: [t for t in dt if t[0] not in tokens_to_remove] for dl, dt in self._tokens.items() } def _task_filter_for_token(self, search_token, match_type='exact', ignore_case=False, glob_method='match', remove_found_token=False): self._save_orig_tokens() self._tokens = filter_for_token(self._tokens, search_token, match_type=match_type, ignore_case=ignore_case, glob_method=glob_method, remove_found_token=remove_found_token, remove_empty_docs=False) def _task_filter_for_pos(self, required_pos, pos_tagset, simplify_pos=True): self._save_orig_tokens() self._tokens = filter_for_pos(self._tokens, required_pos, simplify_pos=simplify_pos, simplify_pos_tagset=pos_tagset) def _task_reset_filter(self): self._tokens = self._orig_tokens self._orig_tokens = None def _save_orig_tokens(self): if self._orig_tokens is None: # initial filtering -> safe a copy of the original tokens self._orig_tokens = deepcopy(self._tokens)
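# A minimal sketch of how a parent process might drive one of these workers:
# tasks are (name, kwargs) tuples consumed by run(), a `None` sentinel shuts the
# worker down, and results come back as one (doc_label, tokens) pair per document.
# The dummy regex tokenizer and the single-worker setup are assumptions for
# illustration; in the original library the workers are managed internally.
import multiprocessing as mp
import re

class RegexTokenizer:
    def tokenize(self, text):
        return re.findall(r"\w+", text)

if __name__ == '__main__':
    docs = {'doc1': 'Das ist ein kurzer Beispieltext.'}
    tasks = mp.JoinableQueue()
    results = mp.Queue()
    worker = _PreprocWorker(0, docs, 'german', tasks, results,
                            tokenizer=RegexTokenizer(), stemmer=None,
                            lemmata_dict=None, pos_tagger=None, name='worker-0')
    worker.start()
    tasks.put(('tokenize', {}))    # dispatched to _task_tokenize
    tasks.put(('get_tokens', {}))  # dispatched to _task_get_tokens
    tasks.put(None)                # shut the worker down
    tasks.join()
    for _ in range(len(docs)):
        print(results.get())       # e.g. ('doc1', [('Das',), ('ist',), ...])
    worker.join()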
class SentiDep: def __init__(self, **kwargs): """ Sentiment-Analyzer for german texts. Get the polarity values of words depending on polarity values of associated descriptive words e.g. 'das schöne Wetter' -> polarity of 'Wetter' == polarity of 'schöne' Purpose: find out in which sentiment context your keywords appear in a text. Note: Works with spacy, nltk and germalemma """ sentiws_path = kwargs.get( 'sentiws_file', os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/sentiws.pickle")) polarity_mod_path = kwargs.get( 'polarity_modifiers_file', os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/polarity_modifiers.pickle")) negations_path = kwargs.get( 'negations_file', os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/negationen_lexicon.pickle")) stts_path = kwargs.get( 'stts_file', os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/stts.pickle")) self.sentiws = pickle.load(open(sentiws_path, 'rb')) self.polarity_modifications = pickle.load(open(polarity_mod_path, 'rb')) self.negations = pickle.load(open(negations_path, 'rb')) self.nlp = spacy.load("de_core_news_md") self.germalemmatizer = GermaLemma() self.stts = pickle.load(open(stts_path, 'rb')) self.german_stops = stopwords.words('german') def tokenize(self, text): """ Tokenize a string using spacy's tokenizer. Input: text/string Output: spacy_doc """ return self.nlp(text) def sentiws_spacy_tag_mapper(self, pos_tag, **kwargs): """ Function for mapping SentiWS POS-tags to spacy POS-tags and reverse. Input: pos_tag, optional: direction -> values: 1 (sentiws to spacy), -1 (spacy to sentiws) -> default: 1 Output: python str """ direction = kwargs.get('direction', 1) senti_map = { "ADJX": "ADJ", "ADV": "ADV", "NN": "NOUN", "VVINF": "VERB" } if direction > 0: return senti_map[pos_tag] elif direction < 0: return {value: key for key, value in senti_map.items()}[pos_tag] def get_polarity(self, word, pos_tag): """ Getter Function for retaining the polarity value by SentiWS for a certain word with POS-tag. Input: word, pos_tag Output: tuple(word, polarity-value, pos_tag) """ senti_words = list( filter( lambda x: x[0] == word and self.sentiws_spacy_tag_mapper(x[2]) == pos_tag, self.sentiws)) if senti_words: senti_words = sorted(senti_words, key=lambda y: y[1]**2, reverse=True)[0] return senti_words def modify_polarity(self, child, polarity): """ Function to consider polarity enhancer and reducer. Input: token.text, token.child.text, token.pos_ (of word) Output: tuple(word, polarity-value, pos_tag) """ senti_word = polarity if senti_word: if child in self.polarity_modifications["polarity_enhancer"]: return (senti_word[0], senti_word[1] * 1.5, senti_word[2]) elif child in self.polarity_modifications["polarity_reducer"]: return (senti_word[0], senti_word[1] * 0.5, senti_word[2]) def easy_switch(self, word): """ Function for finding depending negations without dealing with complex issues. 
Input: token/word Output: True/False """ neg_search = [ re.search(r'%s' % (n), word) for n in self.negations["negation_regex"] ] neg_search = list(filter(lambda z: z != None, neg_search)) return bool(neg_search) def add_polarities(self, list_of_polarity_tuples): """ Summing up a list of polarity-tuples :param list_of_polarity_tuples: :return: polarity value -> float """ all_pols = [lpt[1] for lpt in list_of_polarity_tuples] return sum(all_pols) def calc_parent_polarity(self, spacy_token, token_polarity, children_polarities): """ Calculating the parent polarity value depending on the children polarities :param spacy_token: :param token_polarity: :param children_polarities: :return: parent_polarity -> tuple(word, polarity, POS-tag) """ if token_polarity and children_polarities: added_children_polarities = self.add_polarities( children_polarities) if added_children_polarities > 0: token_polarity = (spacy_token.text, token_polarity[1] + added_children_polarities, spacy_token.pos_) elif added_children_polarities < 0: token_polarity = (spacy_token.text, (token_polarity[1] + (-1 * added_children_polarities)) * (-1), spacy_token.pos_) elif not token_polarity and children_polarities: token_polarity = (spacy_token.text, self.add_polarities(children_polarities), spacy_token.pos_) return token_polarity def switch_polarity(self, polarity, spacy_doc_sent): """ Switching polarity value depending on negation context of whole sentence. Classic negation (kein, nicht, ...) are recognized as well as negation stops (aber, obwohl, ...) :param polarity: :param spacy_doc_sent: :return: tuple(word, polarity, POS-tag, negation: boolean) """ negation_trigger = False for i, token in enumerate(spacy_doc_sent): for negex in self.negations['negation_regex']: regex = r'%s' % (negex) negation_search = re.search(regex, token.text, re.I) if negation_search: negation_trigger = not negation_trigger if token.lower_ in self.negations['polarity_switches']: if token.text == '.': if token.pos_ == 'PUNCT': negation_trigger = not negation_trigger else: continue else: negation_trigger = not negation_trigger if token.text == polarity[0]: if negation_trigger: negated_polarity = (polarity[0], -polarity[1], polarity[2], "negation: " + str(negation_trigger)) else: negated_polarity = (polarity[0], polarity[1], polarity[2], "negation: " + str(negation_trigger)) return negated_polarity def get_depending_polarities(self, text, keywords): """ Get keyword associated polarity values of german texts. 
Polarity analysis including polarity reducer/enhancer and negations :param text: :param keywords: :return: Context-polarity value of keywords -> list of tuples """ spacy_doc = self.nlp(text, disable=['ner', 'textcat']) parent_polarities = [] keywords = [k.lower() for k in keywords] for sent in spacy_doc.sents: for i, token in enumerate(sent): token_polarity = self.get_polarity(token.text, token.pos_) children_polarities = [] if token.lower_ in keywords: children = token.children if children: for child in children: child_polarity = self.get_polarity( child.text, child.pos_) if child_polarity: children_polarities.append(child_polarity) parent_polarity = self.calc_parent_polarity( token, token_polarity, children_polarities) if parent_polarity: modified_parent_polarities = [] for child in children: modified_parent_polarities.append( self.modify_polarity(child, parent_polarity)) added_modified_parent_polarity = None if modified_parent_polarities: added_modified_parent_polarity = self.add_polarities( modified_parent_polarities) if added_modified_parent_polarity: added_modified_parent_polarity = ( token.text, added_modified_parent_polarity, token.pos_ + "_modified") parent_polarities.append( self.switch_polarity( added_modified_parent_polarity, sent)) else: parent_polarities.append( self.switch_polarity(parent_polarity, sent)) parent_polarities = [(term.lower(), t_pol, t_pos, neg) for term, t_pol, t_pos, neg in parent_polarities] return parent_polarities def lemmatize(self, spacy_token): """ Lemmatizer using stts-tagset, spacy-token and GermaLemma. Input: spacy token -> german model Output: python str """ tag = spacy_token.tag_ if tag.startswith(('N', 'V', 'ADJ', 'ADV')) and tag in self.stts: return self.germalemmatizer.find_lemma(spacy_token.text, tag) else: return spacy_token.text def generate_topics(self, texts, num_topics=10): """ Generate a list with 30 most frequent nouns in a text. Input: text -> len(text) <= 50000 Output: nltk.FreqDist-object """ tokens = [[token for token in self.tokenize(text)] for text in texts] tokens = [[self.lemmatize(t) for t in token if t.pos_ == 'NOUN'\ and t.lower_ not in self.german_stops] for token in tokens] docs = [" ".join(t) for t in tokens] cv = CountVectorizer(max_df=0.85, max_features=10000) word_count_vector = cv.fit_transform(docs) tf = TfidfTransformer(smooth_idf=True, use_idf=True) tf.fit(word_count_vector) feature_names = cv.get_feature_names() tf_idf_scores = [] for doc in docs: cv_vector = cv.transform([doc]) tf_idf_vector = tf.transform(cv_vector) sorted_items = self.sort_coo(tf_idf_vector.tocoo()) keywords, scores = self.extract_topn_from_vector( feature_names, sorted_items, 10) tf_idf_scores += list(zip(keywords, scores)) tfidf_topics = sorted(tf_idf_scores, key=lambda x: x[1], reverse=False) return dict(tfidf_topics[:num_topics]) def sort_coo(self, coo_matrix): tuples = zip(coo_matrix.col, coo_matrix.data) return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True) def extract_topn_from_vector(self, feature_names, sorted_items, topn=10): sorted_items = sorted_items[:topn] score_vals = [] feature_vals = [] for idx, score in sorted_items: score_vals.append(round(score, 3)) feature_vals.append(feature_names[idx]) results = {} for idx in range(len(feature_vals)): results[feature_vals[idx]] = score_vals[idx] return results, score_vals def create_clinic_polarity_dict(self, key_list, topics): """ Compute polarity scores document-wise :param key_list: list of polarity-scores and document-key -> form: [[polarity-scores_1, document-key_1] ...] 
-> hint: simple pandas dump with df[[polarity-values, document]].values.tolist() :param topics: list of keywords associated with a certain topic :return: polarities_dict in form: {document_key_1: polarities_1, ...} """ polarities = {} clinic_counter = {} for rl in tqdm(key_list): if not rl[1] in clinic_counter.keys(): clinic_counter[rl[1]] = 1 key = f'{rl[1]}_{clinic_counter[rl[1]]}' polarities[key] = self.get_depending_polarities(rl[0], topics) clinic_counter[rl[1]] += 1 return polarities def create_polarity_df(self, polarities, topics): """ Transforms polarity-scores from 'create_clinic_polarity_dict' output to a formatted pandas dataframe :param polarities: polarities-dict (output from 'create_clinic_polarity_dict') :param topics: list of keywords associated with a certain topic :return: polarity_df (formatted pandas dataframe) of form: columns: keywords/topics rows: document-keys values: float(polarity-scores) or np.nan """ filtered_polarities = [(clinic, polarity) for clinic, polarity in polarities.items() if polarity] columns = {t: [] for t in topics} ids = {"Klinik": []} for clinic, polarity in tqdm(filtered_polarities): ids["Klinik"].append(clinic) row = {t: [] for t in topics} for pol in polarity: row[pol[0].lower()] = pol[1] for word, p in row.items(): if not p: columns[word].append(np.nan) else: columns[word].append(p) for key, value in columns.items(): if len(value) < len(ids["Klinik"]) or len(value) > len( ids["Klinik"]): raise ValueError("Values in dict must have same length!") polarity_df = pd.DataFrame(data=columns, index=ids["Klinik"]) return polarity_df '''
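# A short usage sketch for the SentiDep class above, assuming the pickled
# resources (SentiWS, polarity modifiers, negations, STTS tags) are available at
# their default paths; the review text and keywords are invented examples.
if __name__ == '__main__':
    analyzer = SentiDep()
    text = "Das Personal war sehr freundlich, aber das Essen war nicht gut."
    polarities = analyzer.get_depending_polarities(text, ["Personal", "Essen"])
    for word, polarity, pos_tag, negation in polarities:
        print(word, polarity, pos_tag, negation)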