def __init__(self, **kwargs):
    """
    Sentiment analyzer for German texts.

    Gets the polarity values of words depending on the polarity values of
    associated descriptive words, e.g. 'das schöne Wetter' -> polarity of
    'Wetter' == polarity of 'schöne'.

    Purpose: find out in which sentiment context your keywords appear in a
    text.

    Note: works with spacy, nltk and germalemma.

    Optional keyword arguments override the pickle files that are otherwise
    read from the ``data`` directory next to this module:

    :param sentiws_file: path to the pickled SentiWS data.
    :param polarity_modifiers_file: path to the pickled polarity modifiers.
    :param negations_file: path to the pickled negation lexicon.
    :param stts_file: path to the pickled STTS data.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))

    def resolve(option, default_relative_path):
        # Caller-supplied path wins, otherwise use the bundled file.
        return kwargs.get(option,
                          os.path.join(module_dir, default_relative_path))

    def load_pickle(path):
        # NOTE: unpickling executes arbitrary code; the paths can be supplied
        # by the caller, so only pass files from a trusted source.
        # The context manager closes the handle, which the previous
        # ``pickle.load(open(...))`` form never did.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    sentiws_path = resolve('sentiws_file', "data/sentiws.pickle")
    polarity_mod_path = resolve('polarity_modifiers_file',
                                "data/polarity_modifiers.pickle")
    negations_path = resolve('negations_file',
                             "data/negationen_lexicon.pickle")
    stts_path = resolve('stts_file', "data/stts.pickle")

    self.sentiws = load_pickle(sentiws_path)
    self.polarity_modifications = load_pickle(polarity_mod_path)
    self.negations = load_pickle(negations_path)
    self.nlp = spacy.load("de_core_news_md")
    self.germalemmatizer = GermaLemma()
    self.stts = load_pickle(stts_path)
    self.german_stops = stopwords.words('german')
def __init__(self, path: str = "src/data/", windowSize=5) -> None:
    """
    Store the configuration and create the lemmatizer.

    :param path: value stored unchanged on ``self.path``.
    :param windowSize: value stored unchanged on ``self.windowSize``.
    """
    # Configuration handed in by the caller.
    self.path, self.windowSize = path, windowSize

    # The three data frame slots all start out unset.
    self.df_aspect_tokens = self.df_preprocessed = self.df_lexicon = None

    self.lemmatizer = GermaLemma()
def annotate_stw(self, t, clf_class, majority_classes=None):
    """
    Annotate a segment with one of the classes speech, thought or writing,
    given its STWR classification.

    :param t: The text of the segment.
    :param clf_class: One of direct, indirect, free_indirect, reported.
                      The predicted class for t.
    :param majority_classes: A dictionary containing the majority class (one
                             of speech, thought or writing) for each STWR
                             class. Falls back to ``self.majority_classes``
                             when empty or not given.
    :return: One of speech, thought or writing; the annotation for t.
    """
    fallback = majority_classes if majority_classes else self.majority_classes

    # Direct and free_indirect are always decided by the majority class, as
    # reporting words are more likely to appear outside of such segments.
    if clf_class in ('direct', 'free_indirect'):
        return fallback[clf_class]

    # For the other types look for reporting words with an unambiguous type.
    doc = NLP(t)

    # Lemmata come from germalemma as spacy is not good at this.
    lemmatizer = GermaLemma()
    germalemma_pos = {"VERB": 'V', "NOUN": 'N'}
    lemmata = [
        lemmatizer.find_lemma(token.text, germalemma_pos[token.pos_])
        for token in doc
        if token.pos_ in germalemma_pos
    ]

    if not lemmata:
        return fallback[clf_class]

    frames = []
    for lemma in lemmata:
        word_pattern = r'\b{}\b'.format(re.escape(lemma))
        hits = self.stw_words["Word"].str.contains(word_pattern)
        frames.append(self.stw_words[hits])
    candidates = pd.concat(frames, axis=0, ignore_index=True)

    # Only a single, unambiguous reporting word overrides the majority class.
    if len(candidates) == 1:
        word_type = candidates["Type"][0]
        if word_type in ("speech", "thought", "writing"):
            return word_type

    return fallback[clf_class]
def lemmatize_tokens(tokens):
    """
    Lemmatize every (token, POS) pair of every document with GermaLemma.

    :param tokens: dict mapping a document label to an iterable of
                   ``(token, pos)`` pairs.
    :return: new dict with the same labels, each mapped to a list of
             ``(lemma, pos)`` pairs. A token for which ``find_lemma`` raises
             ``ValueError`` is kept unchanged.
    """
    lemmatizer = GermaLemma()

    def lemma_or_token(token, pos):
        try:
            return lemmatizer.find_lemma(token, pos)
        except ValueError:
            return token

    return {
        doc_label: [(lemma_or_token(token, pos), pos)
                    for token, pos in tok_pos]
        for doc_label, tok_pos in tokens.items()
    }
def custom_extensions(doc):
    """
    Register the custom token extensions ``lemma``, ``is_negation`` and
    ``is_sentence_break`` and hand the document back unchanged.

    The extensions are registered on ``Token`` with ``force=True``, so an
    existing registration under the same name is replaced.

    :param doc: the document being processed; returned as is.
    :return: ``doc``.
    """
    lemmatizer = GermaLemma()
    negation_words = {
        "nie", "keinsterweise", "keinerweise", "niemals", "nichts", "kaum",
        "keinesfalls", "ebensowenig", "nicht", "kein", "keine", "weder",
    }
    negation_cconj = {'aber', 'jedoch', 'doch', 'sondern'}

    def lemma_getter(token):
        # Prefer the GermaLemma lemma; fall back to the token's own lemma
        # whenever GermaLemma cannot handle the token/tag combination.
        try:
            return lemmatizer.find_lemma(token.text, token.tag_).lower()
        except Exception:
            # Deliberately broad best-effort fallback, but no longer a bare
            # ``except`` that would also swallow KeyboardInterrupt/SystemExit.
            return token.lemma_.lower()

    def is_negation_getter(token):
        return token._.lemma in negation_words

    def is_sentence_break_getter(token):
        return token._.lemma in negation_cconj

    Token.set_extension("lemma", getter=lemma_getter, force=True)
    Token.set_extension("is_negation", getter=is_negation_getter, force=True)
    Token.set_extension("is_sentence_break",
                        getter=is_sentence_break_getter, force=True)
    return doc
def __init__(self, language="english"):
    """
    Set up tagger, stop words and a lemmatizer for the given language.

    :param language: language name passed to ``stopwords.words``. For
                     "german" a GermaLemma lemmatizer is used and 'dass' is
                     added to the stop words; any other value gets a
                     WordNetLemmatizer.
    """
    self.language = language
    self.tagger = Tagger()
    self.stopwords = stopwords.words(language)

    if language != "german":
        self.lemmatizer = WordNetLemmatizer()
        return

    self.lemmatizer = GermaLemma()
    self.stopwords.append('dass')
class GermaLemma(PipelineModule):
    """
    Pipeline module that produces the ``lemma-germalemma`` target from the
    ``token`` data and a configurable POS prerequisite.
    """

    def __init__(self, pos_prereq):
        """
        :param pos_prereq: name of the prerequisite that provides the POS tag
                           for every token.
        """
        # This class carries the same name as the library class, so a plain
        # ``GermaLemma(...)`` call here would resolve to this class itself and
        # fail on the unexpected ``tiger_corpus`` argument. Import the library
        # class under an alias to reach the real lemmatizer.
        from germalemma import GermaLemma as _GermaLemmaBackend

        self.pos_prereq = pos_prereq
        self.lemmatizer = _GermaLemmaBackend(
            tiger_corpus=
            'resources/tiger_release_aug07.corrected.16012013.conll09')

    def targets(self):
        """Return the set of targets this module produces."""
        return {'lemma-germalemma'}

    def prerequisites(self):
        """Return the set of prerequisites this module needs."""
        return {'token', self.pos_prereq}

    def make(self, prerequisite_data):
        """
        Lemmatize all tokens.

        :param prerequisite_data: mapping holding the ``token`` sequence and
                                  the POS sequence under ``self.pos_prereq``.
        :return: dict with key ``lemma-germalemma`` holding one entry per
                 token: the lemma, or ``0`` when the tag is not a noun, verb,
                 adjective or adverb tag or when lemmatizing raised.
        """
        tokens = prerequisite_data['token']
        pos = prerequisite_data[self.pos_prereq]
        noun_or_verb = re.compile("^[NV]")
        adjective_or_adverb = re.compile("^(ADJ|ADV)")

        def lemmatize_token(t, postag):
            try:
                if noun_or_verb.match(postag):
                    return self.lemmatizer.find_lemma(t, postag)
                if adjective_or_adverb.match(postag):
                    # Only the first three characters of the tag are passed on.
                    return self.lemmatizer.find_lemma(t, postag[:3])
                return 0
            except Exception as e:
                # Best effort: report the failure and mark the token as
                # not lemmatized instead of aborting the whole run.
                sys.stderr.write(
                    f"Lemmatizing {t} ({postag}) raised exception: {e}\n")
                return 0

        return {
            'lemma-germalemma':
            [lemmatize_token(t, postag) for t, postag in zip(tokens, pos)]
        }
def create_dictionary(doc_labels, filename):
    """
    Build a gensim dictionary from the lemmatized documents and save it.

    Nothing is done when ``filename`` already exists.

    :param doc_labels: iterable of document labels; each label is turned into
                       text by ``gendocs``.
    :param filename: path the dictionary is saved to.
    :return: the saved dictionary, or ``None`` when the file already existed.
    """
    if os.path.isfile(filename):
        print('File already exists!')
        return

    from src.d01_ana.analysis import gendocs

    lemmatizer = GermaLemma()

    def lemma_getter(token):
        # Prefer the GermaLemma lemma; fall back to the token's own lemma
        # whenever GermaLemma cannot handle the token/tag combination.
        try:
            return lemmatizer.find_lemma(token.text, token.tag_).lower()
        except Exception:
            # Broad on purpose (best effort), but no longer a bare ``except``
            # that would also swallow KeyboardInterrupt/SystemExit.
            return token.lemma_.lower()

    # Register the extension once instead of once per token.
    Token.set_extension('lemma', getter=lemma_getter, force=True)

    nlp = spacy.load("de_core_news_lg")

    def pipe(label):
        # Lemmata of all tokens that are neither punctuation, digits nor
        # whitespace, with dots stripped.
        doc = nlp(gendocs(label))
        return [
            token._.lemma.lower().replace('.', '')
            for sent in doc.sents
            for token in sent
            if not (token.is_punct or token.is_digit or token.is_space)
        ]

    # Documents are processed one at a time; doc2bow with allow_update=True
    # adds each document's vocabulary to the dictionary. The bag-of-words
    # vectors themselves are not needed, so they are not collected.
    dictionary = corpora.Dictionary()
    for label in doc_labels:
        dictionary.doc2bow(pipe(label), allow_update=True)

    dictionary.save(filename)
    return dictionary
""" Tests for germalemma module. Markus Konrad <*****@*****.**>, Wissenschaftszentrum Berlin für Sozialforschung January 2019 """ import pytest from germalemma import GermaLemma lemmatizer = GermaLemma() test_table = ( # known nouns (('US-Präsident', 'N'), 'US-Präsident'), (('US-Präsidenten', 'N'), 'US-Präsident'), (('EG-Staaten', 'N'), 'EG-Staat'), (('EG-Staaten', 'NP'), 'EG-Staat'), # unknown nouns (('US-Präsidentenhaus', 'N'), 'US-Präsidentenhaus'), (('US-Präsidentenhäuser', 'N'), 'US-Präsidentenhaus'), (('EU-Neu-Delegierte', 'N'), 'EU-Neu-Delegierter'), (('Feinstaubbelastungen', 'N'), 'Feinstaubbelastung'), # known adjectives (('fies', 'ADJ'), 'fies'), (('besser', 'ADJ'), 'gut'), (('schöne', 'ADJ'), 'schön'), # unknown adjectives (('unbeschreibliches', 'ADJ'), 'unbeschreiblich'), (('klagloser', 'ADJ'), 'klaglos'),
def _task_lemmatize(self, pos_tagset, use_dict=False, use_patternlib=False,
                    use_germalemma=None):
    """
    Replace every token in ``self._tokens`` by its lemma, keeping the POS tag.

    :param pos_tagset: tagset name passed to ``simplified_pos`` for the
                       dictionary lookup.
    :param use_dict: look lemmata up in ``self.lemmata_dict`` (when set).
    :param use_patternlib: use the CLiPS pattern module of the language for
                           tokens that have no lemma yet.
    :param use_germalemma: use GermaLemma; ``None`` enables it when the
                           language is 'german'.
    :raises ValueError: if ``use_patternlib`` is set and the language has no
                        entry in ``PATTERN_SUBMODULES``.
    """
    lemmata_by_doc = defaultdict(list)

    if use_germalemma is None and self.language == 'german':
        use_germalemma = True

    if use_germalemma:
        if not self.germalemma:
            self.germalemma = GermaLemma()

        for doc_label, tagged in self._tokens.items():
            for token, tag in tagged:
                try:
                    lemma = self.germalemma.find_lemma(token, tag)
                except ValueError:
                    # keep the token itself when GermaLemma rejects it
                    lemma = token
                lemmata_by_doc[doc_label].append(lemma)
    else:
        if use_dict and self.lemmata_dict:
            for doc_label, tagged in self._tokens.items():
                for token, tag in tagged:
                    simple_tag = simplified_pos(tag, tagset=pos_tagset)
                    lemma = None
                    if simple_tag:
                        lemma = self.lemmata_dict.get(simple_tag, {}).get(
                            token, None)
                        # '-' and '' are treated as "no lemma"
                        if lemma in ('-', ''):
                            lemma = None
                    lemmata_by_doc[doc_label].append(lemma)

        if use_patternlib:
            if not self.pattern_module:
                if self.language not in PATTERN_SUBMODULES:
                    raise ValueError(
                        "no CLiPS pattern module for this language:",
                        self.language)
                modname = 'pattern.%s' % PATTERN_SUBMODULES[self.language]
                self.pattern_module = import_module(modname)

            for doc_label, tagged in self._tokens.items():
                known = lemmata_by_doc.get(doc_label, [None] * len(tagged))
                resolved = []
                for (token, tag), lemma in zip(tagged, known):
                    if lemma is None:
                        if tag.startswith('NP'):
                            # singularize noun
                            lemma = self.pattern_module.singularize(token)
                        elif tag.startswith('V'):
                            # get infinitive of verb
                            lemma = self.pattern_module.conjugate(
                                token, self.pattern_module.INFINITIVE)
                        elif tag.startswith(('ADJ', 'ADV')):
                            # get baseform of adjective or adverb
                            lemma = self.pattern_module.predicative(token)
                    resolved.append(lemma)
                lemmata_by_doc[doc_label] = resolved

    if not lemmata_by_doc:
        # nothing was produced above -> WordNet lemmatizer
        if not self.wordnet_lemmatizer:
            self.wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

        for doc_label, tagged in self._tokens.items():
            for token, tag in tagged:
                wn_pos = pos_tag_convert_penn_to_wn(tag)
                if wn_pos:
                    lemma = self.wordnet_lemmatizer.lemmatize(token, wn_pos)
                else:
                    lemma = token
                lemmata_by_doc[doc_label].append(lemma)

    # merge: a missing/empty lemma falls back to the original token
    merged = {}
    for doc_label, tagged in self._tokens.items():
        lemmata = lemmata_by_doc.get(doc_label, [None] * len(tagged))
        relabelled = [(lemma or token, tag)
                      for (token, tag), lemma in zip(tagged, lemmata)]
        assert len(relabelled) == len(tagged)
        merged[doc_label] = relabelled

    assert len(merged) == len(self._tokens)
    self._tokens = merged
class _PreprocWorker(mp.Process):
    """
    Worker process that holds a set of documents and executes preprocessing
    tasks on them.

    Tasks arrive on ``tasks_queue`` as ``(task_name, kwargs)`` pairs and are
    dispatched to the method ``_task_<task_name>``. Results are put on
    ``results_queue``. A ``None`` item on the task queue shuts the worker down.
    """

    def __init__(self, worker_id, docs, language, tasks_queue, results_queue,
                 tokenizer, stemmer, lemmata_dict, pos_tagger,
                 group=None, target=None, name=None, args=(), kwargs=None):
        """
        :param worker_id: ID of this worker.
        :param docs: dict mapping document label -> document text.
        :param language: language of the documents.
        :param tasks_queue: queue the tasks are read from.
        :param results_queue: queue the results are written to.
        :param tokenizer: tokenizer instance (must have a callable attribute
                          `tokenize` with a document text as argument).
        :param stemmer: stemmer instance (must have a callable attribute
                        `stem`).
        :param lemmata_dict: lemmata lookup used by the lemmatize task.
        :param pos_tagger: POS tagger instance (must have a callable attribute
                           `tag`).
        :param group, target, name, args, kwargs: passed on to the process
                                                   base class.
        """
        super(_PreprocWorker, self).__init__(group, target, name, args,
                                             kwargs or {})
        logger.debug('worker `%s`: init with worker ID %d', name, worker_id)
        logger.debug('worker `%s`: docs = %s', name, str(set(docs.keys())))

        self.worker_id = worker_id
        self.docs = docs
        self.language = language
        self.tasks_queue = tasks_queue
        self.results_queue = results_queue

        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.pos_tagger = pos_tagger

        self.lemmata_dict = lemmata_dict
        self.pattern_module = None      # dynamically loaded CLiPS pattern library module
        self.germalemma = None          # GermaLemma instance
        self.wordnet_lemmatizer = None  # nltk.stem.WordNetLemmatizer instance

        # tokens for this worker at the current processing stage.
        # dict with document label -> tokens list
        self._tokens = {}
        self._ngrams = {}               # generated ngrams
        # original (unfiltered) tokens, when filtering is currently applied
        self._orig_tokens = None

    def run(self):
        """
        Process tasks from the task queue until a ``None`` item arrives.

        :raises NotImplementedError: if a task has no ``_task_<name>`` method.
        """
        logger.debug('worker `%s`: run', self.name)

        for next_task, task_kwargs in iter(self.tasks_queue.get, None):
            logger.debug('worker `%s`: received task `%s`',
                         self.name, next_task)

            # A default is required here: without it getattr() itself raises
            # AttributeError and the NotImplementedError below is unreachable.
            exec_task_fn = getattr(self, '_task_' + next_task, None)
            if exec_task_fn is None:
                raise NotImplementedError("Task not implemented: `%s`"
                                          % next_task)
            exec_task_fn(**task_kwargs)

            self.tasks_queue.task_done()

        logger.debug('worker `%s`: shutting down', self.name)
        # acknowledge the terminating None item as well
        self.tasks_queue.task_done()

    def _put_items_in_results_queue(self, container):
        """Put every (key, value) pair of `container` in the results queue,
        or a single ``None`` when the container is empty."""
        if container:
            logger.debug('worker `%s`: putting %d results in queue',
                         self.name, len(container))
            for pair in container.items():
                self.results_queue.put(pair)
        else:
            # we *have* to put something in the result queue -> signal that
            # we return "nothing"
            logger.debug('worker `%s`: putting None in results queue',
                         self.name)
            self.results_queue.put(None)

    def _task_get_tokens(self):
        """Send the current tokens, one (label, tokens) pair per item."""
        self._put_items_in_results_queue(self._tokens)

    def _task_get_tokens_with_worker_id(self):
        """Send ``(worker_id, tokens dict)`` as a single item."""
        self.results_queue.put((self.worker_id, self._tokens))

    def _task_get_ngrams(self):
        """Send the current ngrams, one (label, ngrams) pair per item."""
        self._put_items_in_results_queue(self._ngrams)

    def _task_get_ngrams_with_worker_id(self):
        """Send ``(worker_id, ngrams dict)`` as a single item."""
        self.results_queue.put((self.worker_id, self._ngrams))

    def _task_get_vocab_doc_freq(self):
        """Send a Counter with the number of documents each token occurs in."""
        counts = Counter()
        for dt in self._tokens.values():
            counts.update(set(ith_column(dt)))

        self.results_queue.put(counts)

    def _task_get_state(self):
        """Send a dict with the attributes that make up the worker state."""
        logger.debug('worker `%s`: getting state', self.name)

        state_attrs = ('docs', 'language', '_tokens', '_ngrams',
                       '_orig_tokens')
        state = {attr: getattr(self, attr) for attr in state_attrs}
        logger.debug('worker `%s`: got state with %d items',
                     self.name, len(state))
        self.results_queue.put(state)

    def _task_set_tokens(self, tokens):
        """Replace the current tokens."""
        logger.debug('worker `%s`: setting tokens', self.name)
        self._tokens = tokens

    def _task_set_ngrams(self, ngrams):
        """Replace the current ngrams."""
        logger.debug('worker `%s`: setting ngrams', self.name)
        self._ngrams = ngrams

    def _task_set_state(self, **state):
        """Set every given keyword as an attribute of this worker."""
        logger.debug('worker `%s`: setting state', self.name)

        for attr, val in state.items():
            setattr(self, attr, val)

    def _task_tokenize(self):
        """Tokenize all documents with the configured tokenizer."""
        self._tokens = {
            dl: tuplize(self.tokenizer.tokenize(txt))
            for dl, txt in self.docs.items()
        }

    def _task_generate_ngrams(self, n, join=True, join_str=' '):
        """Generate ngrams from the first column of the current tokens."""
        self._ngrams = {
            dl: create_ngrams(ith_column(dt), n=n, join=join,
                              join_str=join_str)
            for dl, dt in self._tokens.items()
        }

    def _task_use_ngrams_as_tokens(self, join=False, join_str=' '):
        """Replace the tokens by the generated ngrams, optionally joining each
        ngram with `join_str` first."""
        if join:
            new_tok = {
                dl: tuplize([join_str.join(g_tuple) for g_tuple in dg])
                for dl, dg in self._ngrams.items()
            }
        else:
            new_tok = {dl: tuplize(dg) for dl, dg in self._ngrams.items()}

        self._tokens = new_tok

    def _task_transform_tokens(self, transform_fn):
        """Apply `transform_fn` to the first column of all tokens."""
        self._tokens = {
            dl: apply_to_mat_column(dt, 0, transform_fn) if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_stem(self):
        """Stem the first column of all tokens with the configured stemmer."""
        self._tokens = {
            dl: apply_to_mat_column(dt, 0, lambda t: self.stemmer.stem(t))
            if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_pos_tag(self):
        """Run the configured POS tagger over the first column of all tokens."""
        self._tokens = {
            dl: apply_to_mat_column(dt, 0, self.pos_tagger.tag,
                                    map_func=False, expand=True)
            if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_lemmatize(self, pos_tagset, use_dict=False, use_patternlib=False,
                        use_germalemma=None):
        """
        Replace every token by its lemma, keeping the POS tag.

        :param pos_tagset: tagset name passed to ``simplified_pos`` for the
                           dictionary lookup.
        :param use_dict: look lemmata up in ``self.lemmata_dict`` (when set).
        :param use_patternlib: use the CLiPS pattern module of the language
                               for tokens that have no lemma yet.
        :param use_germalemma: use GermaLemma; ``None`` enables it when the
                               language is 'german'.
        :raises ValueError: if ``use_patternlib`` is set and the language has
                            no entry in ``PATTERN_SUBMODULES``.
        """
        tmp_lemmata = defaultdict(list)

        if use_germalemma is None and self.language == 'german':
            use_germalemma = True

        if use_germalemma:
            if not self.germalemma:
                self.germalemma = GermaLemma()

            for dl, tok_tags in self._tokens.items():
                for t, pos in tok_tags:
                    try:
                        l = self.germalemma.find_lemma(t, pos)
                    except ValueError:
                        l = t
                    tmp_lemmata[dl].append(l)
        else:
            if use_dict and self.lemmata_dict:
                for dl, tok_tags in self._tokens.items():
                    for t, pos in tok_tags:
                        pos = simplified_pos(pos, tagset=pos_tagset)

                        if pos:
                            l = self.lemmata_dict.get(pos, {}).get(t, None)
                            if l == '-' or l == '':
                                l = None
                        else:
                            l = None
                        tmp_lemmata[dl].append(l)

            if use_patternlib:
                if not self.pattern_module:
                    if self.language not in PATTERN_SUBMODULES:
                        raise ValueError(
                            "no CLiPS pattern module for this language:",
                            self.language)

                    modname = 'pattern.%s' % PATTERN_SUBMODULES[self.language]
                    self.pattern_module = import_module(modname)

                for dl, tok_tags in self._tokens.items():
                    tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags))

                    lemmata_final = []
                    for (t, pos), t_found in zip(tok_tags, tok_lemmata):
                        l = t_found

                        if l is None:
                            if pos.startswith('NP'):
                                # singularize noun
                                l = self.pattern_module.singularize(t)
                            elif pos.startswith('V'):
                                # get infinitive of verb
                                l = self.pattern_module.conjugate(
                                    t, self.pattern_module.INFINITIVE)
                            elif pos.startswith('ADJ') or pos.startswith('ADV'):
                                # get baseform of adjective or adverb
                                l = self.pattern_module.predicative(t)

                        lemmata_final.append(l)

                    tmp_lemmata[dl] = lemmata_final

        if len(tmp_lemmata) == 0:
            # nothing was produced above -> WordNet lemmatizer
            if not self.wordnet_lemmatizer:
                self.wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

            for dl, tok_tags in self._tokens.items():
                for t, pos in tok_tags:
                    wn_pos = pos_tag_convert_penn_to_wn(pos)
                    if wn_pos:
                        l = self.wordnet_lemmatizer.lemmatize(t, wn_pos)
                    else:
                        l = t
                    tmp_lemmata[dl].append(l)

        # merge: a missing/empty lemma falls back to the original token
        lemmatized_tokens = {}
        for dl, tok_tags in self._tokens.items():
            tok_lemmata = tmp_lemmata.get(dl, [None] * len(tok_tags))
            new_tok_tags = [(l or t, pos)
                            for (t, pos), l in zip(tok_tags, tok_lemmata)]
            assert len(new_tok_tags) == len(tok_tags)
            lemmatized_tokens[dl] = new_tok_tags

        assert len(lemmatized_tokens) == len(self._tokens)
        self._tokens = lemmatized_tokens

    def _task_expand_compound_tokens(self, split_chars=('-', ),
                                     split_on_len=2,
                                     split_on_casechange=False):
        """Split compound tokens and replace the tokens by the flattened
        result (only the first column of each token is used)."""
        tmp_tokens = {}
        for dl, dt in self._tokens.items():
            nested = [
                expand_compound_token(tup[0], split_chars, split_on_len,
                                      split_on_casechange)
                for tup in dt
            ]
            tmp_tokens[dl] = tuplize(flatten_list(nested))

        self._tokens = tmp_tokens

    def _task_remove_special_chars_in_tokens(self, special_chars):
        """Remove `special_chars` from the first column of all tokens."""
        self._tokens = {
            dl: apply_to_mat_column(
                dt, 0,
                lambda x: remove_special_chars_in_tokens(x, special_chars),
                map_func=False) if dt else []
            for dl, dt in self._tokens.items()
        }

    def _task_clean_tokens(self, tokens_to_remove, save_orig_tokens=False,
                           remove_shorter_than=None, remove_longer_than=None,
                           remove_numbers=False):
        """
        Remove unwanted tokens.

        :param tokens_to_remove: iterable of tokens to drop.
        :param save_orig_tokens: keep a copy of the unfiltered tokens first.
        :param remove_shorter_than: drop tokens shorter than this length.
        :param remove_longer_than: drop tokens longer than this length.
        :param remove_numbers: drop tokens for which ``isnumeric()`` is true.
        """
        if save_orig_tokens:
            self._save_orig_tokens()

        if remove_shorter_than is not None:
            self._tokens = {
                dl: [t for t in dt if len(t[0]) >= remove_shorter_than]
                for dl, dt in self._tokens.items()
            }

        if remove_longer_than is not None:
            self._tokens = {
                dl: [t for t in dt if len(t[0]) <= remove_longer_than]
                for dl, dt in self._tokens.items()
            }

        if remove_numbers:
            self._tokens = {
                dl: [t for t in dt if not t[0].isnumeric()]
                for dl, dt in self._tokens.items()
            }

        # using a set is much faster than other sequence types for "in" tests
        if not isinstance(tokens_to_remove, set):
            tokens_to_remove = set(tokens_to_remove)

        self._tokens = {
            dl: [t for t in dt if t[0] not in tokens_to_remove]
            for dl, dt in self._tokens.items()
        }

    def _task_filter_for_token(self, search_token, match_type='exact',
                               ignore_case=False, glob_method='match',
                               remove_found_token=False):
        """Save the unfiltered tokens, then filter via ``filter_for_token``."""
        self._save_orig_tokens()

        self._tokens = filter_for_token(self._tokens, search_token,
                                        match_type=match_type,
                                        ignore_case=ignore_case,
                                        glob_method=glob_method,
                                        remove_found_token=remove_found_token,
                                        remove_empty_docs=False)

    def _task_filter_for_pos(self, required_pos, pos_tagset,
                             simplify_pos=True):
        """Save the unfiltered tokens, then filter via ``filter_for_pos``."""
        self._save_orig_tokens()

        self._tokens = filter_for_pos(self._tokens, required_pos,
                                      simplify_pos=simplify_pos,
                                      simplify_pos_tagset=pos_tagset)

    def _task_reset_filter(self):
        """Restore the saved unfiltered tokens and forget the saved copy.

        NOTE(review): when no copy was saved, ``_orig_tokens`` is ``None`` and
        the tokens become ``None`` as well; confirm that callers only send
        this task after a filter task.
        """
        self._tokens = self._orig_tokens
        self._orig_tokens = None

    def _save_orig_tokens(self):
        """Keep a deep copy of the current tokens unless one exists already."""
        if self._orig_tokens is None:
            # initial filtering -> save a copy of the original tokens
            self._orig_tokens = deepcopy(self._tokens)
# POS tagging (time-consuming!)
# TODO: maybe use faster POS-tagging, e.g. NLTK tagger or
# ClassifierBasedGermanTagger using TIGER corpus, but spacy has higher accuracy
nlp = spacy.load('de_core_news_md', disable=['ner', 'parser'])
df_articles['Article_POS'] = df_articles['Article'].apply(nlp)

# Create new column including only nouns (all noun types from STTS tagset)
df_articles['Nouns'] = df_articles['Article_POS'].apply(
    lambda doc: [token for token in doc if token.tag_.startswith('NN')])

# Remove words with length == 1.
# The length of each word's text is tested; the previous version tested the
# length of the whole noun list, which kept one-character words and dropped
# the only noun of single-noun articles.
df_articles['Nouns'] = df_articles['Nouns'].apply(
    lambda nouns: [word for word in nouns if len(word.text) > 1])
# df_articles['Nounverbs'] = df_articles['Nounverbs'].apply(lambda x: [word for word in x if len(x)>1])

# Lemmatization
lemmatizer = GermaLemma()

# Lemmatization of nouns: one list of lower-cased lemmata per article
noun_list = df_articles['Nouns'].tolist()
global noun_lemma_list
noun_lemma_list = [
    [lemmatizer.find_lemma(token.text, token.tag_).lower() for token in doc]
    for doc in noun_list
]

# Save to help df
df_help_noun_lemma_list = pandas.DataFrame({'x': noun_lemma_list})
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from germalemma import GermaLemma
import pickle
from ClassifierBasedGermanTagger.ClassifierBasedGermanTagger import ClassifierBasedGermanTagger

# Demo: tag a few words with the pickled tagger, then print the lemma that
# GermaLemma finds for every (word, POS tag) pair.
lemmatizer = GermaLemma()

with open('data/pos.pickle', 'rb') as f:
    tagger = pickle.load(f)

pos = tagger.tag(['Jungen', u'Wände', u'Wänden'])
print(pos)

# find_lemma takes the word and its POS tag ("N" for noun)
for w, p in pos:
    print(lemmatizer.find_lemma(w, p))

#lemma = lemmatizer.find_lemma(u'Jungen', u'N')
#print(lemma)
def analysis(doc_labels):
    """
    Scan every document for mentions of "the people" and for negatively
    attributed "elite" mentions and collect the hits per document.

    The word lists (``people``, ``people_ordinary``, ``people_ger``,
    ``attribut_ordinary``, ``attribut_ger``, ``elite_*``, ``neg_dict``,
    ``pos_dict``) are read from the enclosing module.

    :param doc_labels: iterable of document labels; each document is read from
                       ``data/corpus_clean/<label>.txt``.
    :return: list with one result dict per document (keys ``doc``, ``len``,
             ``pop``, ``volk``, ``elite``, ``sents``, ``volk_``, ``elite_``,
             ``volk_counter``, ``elite_counter``, ``hits``, ``lemma_pop``).
    """

    def gendocs(label):
        with open('data/corpus_clean/{}.txt'.format(label), "r") as text_file:
            return text_file.read()

    nlp = spacy.load("de_core_news_lg")
    lemmatizer = GermaLemma()

    # Lookup sets get their own local names. Rebinding the module-level names
    # inside this function (``people = set(people)``) would make them local
    # and raise UnboundLocalError on the first read.
    people_set = set(people)
    people_ordinary_set = set(people_ordinary)
    people_ger_set = set(people_ger)
    attribut_ger_set = set(attribut_ger)
    elite = {*elite_pol, *elite_eco, *elite_experten, *elite_medien}
    elite_noneg_set = set(elite_noneg)
    negativ = set(neg_dict.keys())
    positiv = set(pos_dict.keys())

    # Hits of the document currently being processed; rebound per document
    # below. The getters only append to it, so they need no ``global`` /
    # ``nonlocal`` declaration (the former ``global found`` made them write to
    # a module-level name instead of this list).
    found = []

    def lemma_getter(token):
        # Prefer the GermaLemma lemma, fall back to the token's own lemma.
        try:
            return lemmatizer.find_lemma(token.text, token.tag_)
        except Exception:
            # Broad on purpose (best effort), but no longer a bare ``except``
            # that would also swallow KeyboardInterrupt/SystemExit.
            return token.lemma_

    def is_neg_elite(token):
        # Elite terms that count without any negative attribute.
        if token._.is_elite_noneg:
            found.append((token.text, None))
            return True
        if not token._.is_elite:
            return False

        # Candidates for a negative attribute: the children, the token itself
        # and its ancestors below the root (the root is its own head).
        check = list(token.children)
        node = token
        while node.head:
            if node == node.head:
                break
            check.append(node)
            node = node.head

        attr_neg = [child for child in check
                    if child._.lemma.lower() in negativ]
        if attr_neg:
            found.append((token.text, attr_neg))
            return True
        return False

    def is_volk(token):
        lemma = token._.lemma.lower()
        check = list(token.children)

        if lemma in people_set:
            found.append((token.text, None))
            return True

        # The remaining people terms only count with a matching attribute
        # among the token's children.
        if lemma in people_ordinary_set:
            attributes = attribut_ordinary
        elif lemma in people_ger_set:
            attributes = attribut_ger_set
        else:
            return False

        attr_hits = [child for child in check
                     if child._.lemma.lower() in attributes]
        if attr_hits:
            found.append((token.text, attr_hits))
            return True
        return False

    # Register the extensions once instead of once per token.
    Token.set_extension('is_neg', force=True,
                        getter=lambda token: token._.lemma.lower() in negativ)
    Token.set_extension('is_pos', force=True,
                        getter=lambda token: token._.lemma.lower() in positiv)
    Token.set_extension('is_elite', force=True,
                        getter=lambda token: token._.lemma.lower() in elite)
    Token.set_extension(
        'is_elite_noneg', force=True,
        getter=lambda token: token._.lemma.lower() in elite_noneg_set)
    Token.set_extension('lemma', getter=lemma_getter, force=True)
    Token.set_extension('is_volk', getter=is_volk, force=True)
    Token.set_extension('is_neg_elite', getter=is_neg_elite, force=True)

    matcher = Matcher(nlp.vocab)
    pattern = [{'_': {'is_neg_elite': True}}]
    matcher.add('text', None, pattern)

    res = []

    for label in tqdm(doc_labels):
        res_dict = {'doc': None, 'len': None, 'pop': False, 'volk': 0,
                    'elite': 0, 'sents': None, 'volk_': None, 'elite_': None,
                    'lemma_pop': None}
        found = []
        doc = nlp(gendocs(label))

        hits = {'volk': [], 'elite': []}
        for sent in doc.sents:
            for token in sent:
                if token._.is_volk:
                    hits['volk'].append(token._.lemma)
                if token._.is_neg_elite:
                    hits['elite'].append(token._.lemma)

        matches = matcher(doc)

        has_pop = []
        tokens_pop = []
        for match_id, start, end in matches:
            # Context window of 280 tokens on both sides. The lower bound is
            # clamped: a negative start index would count from the end of the
            # document instead of from its beginning.
            span = doc[max(start - 280, 0):end + 280]
            for token in span:
                if token._.is_volk:
                    tokens_pop.append(doc[start]._.lemma)
                    tokens_pop.append(token._.lemma)
                    sentence_start = span[0].sent.start
                    sentence_end = span[-1].sent.end
                    has_pop.append(doc[sentence_start:sentence_end].text)

        c_volk = Counter(token._.is_volk for token in doc)
        c_neg_elite = Counter(token._.is_neg_elite for token in doc)
        tokens_pop_counter = Counter(tokens_pop)

        if has_pop:
            res_dict['pop'] = True

        res_dict['doc'] = label
        res_dict['sents'] = has_pop
        res_dict['elite'] = c_neg_elite[True]
        res_dict['volk'] = c_volk[True]
        res_dict['len'] = len(doc)
        res_dict['volk_'] = hits['volk']
        res_dict['elite_'] = hits['elite']
        res_dict['volk_counter'] = Counter(hits['volk'])
        res_dict['elite_counter'] = Counter(hits['elite'])
        res_dict['hits'] = found
        res_dict['lemma_pop'] = tokens_pop_counter

        res.append(res_dict)

    # Hand the collected results back; previously they were built and dropped.
    return res
# Evaluate the lemmatizer on 10 random 90/10 splits of `all_tokens`:
# the first part builds the lemmata dictionaries, the rest is used for testing.
print("running 10 randomized evaluations")
pct_success_all_trials = []
incorrect_lemmata = []
known_incorrect_lemmata_tokens = set()

for _ in range(10):
    shuffle(all_tokens)
    n_split = int(len(all_tokens) * 0.9)
    tokens_a = all_tokens[:n_split]
    tokens_b = all_tokens[n_split:]

    # build lemmatizer with tokens_a
    lemmata = defaultdict(dict)
    lemmata_lower = defaultdict(dict)
    for token, lemma, pos in tokens_a:
        GermaLemma.add_to_lemmata_dicts(lemmata, lemmata_lower,
                                        token, lemma, pos)
    lemmatizer = GermaLemma(lemmata=lemmata, lemmata_lower=lemmata_lower)

    # test lemmatizer with tokens_b
    n_success = 0
    for token, true_lemma, pos in tokens_b:
        found_lemma = lemmatizer.find_lemma(token, pos)
        if found_lemma == true_lemma:
            n_success += 1
            continue

        # record each wrongly lemmatized token only once; a lemma equal to
        # the token itself is not recorded
        if found_lemma != token and token not in known_incorrect_lemmata_tokens:
            incorrect_lemmata.append((token, found_lemma, true_lemma))
            known_incorrect_lemmata_tokens.add(token)

    n_all = len(tokens_b)
def main():
    """
    Run the pipeline train -> tokenize -> tag -> filter -> lemmatize and print
    the 30 most common non-modal lemmata with their counts.

    Every stage caches its result as a pickle (tagger under ``./resources``,
    stages 1-4 under ``./data``) and is skipped when its pickle exists.

    NOTE: the cached pickles are loaded with ``pickle.load``, which executes
    arbitrary code; only run this with pickle files you created yourself.
    """
    # train
    if os.path.exists('./resources/nltk_german_classifier_data.pickle'):
        with open('./resources/nltk_german_classifier_data.pickle', 'rb') as f:
            print('./resources/nltk_german_classifier_data.pickle found')
            tagger = pickle.load(f)
    else:
        print(
            'could not find ./resources/nltk_german_classifier_data.pickle: training: IN PROGRESS'
        )
        tagger = train()
        with open('./resources/nltk_german_classifier_data.pickle', 'wb') as f:
            pickle.dump(tagger, f, protocol=2)
        print('training FINISHED')

    # tokenize
    if os.path.exists('./data/1.pickle'):
        with open('./data/1.pickle', 'rb') as f:
            print('1.pickle found')
            words = pickle.load(f)
    else:
        print('could not find 1.pickle: tokenizing: IN PROGRESS')
        # read through a context manager so the handle is closed
        with open('./resources/logik-band-eins.txt') as document_file:
            document = document_file.read()
        tok = Tokenizer()
        tokens = tok.tokenize(document)
        # Every token value is kept. The former 10000-token limit and the
        # length/digit filter were both switched off (the counter was never
        # incremented and the condition ended in ``or True``), so they are
        # left out here without changing the result.
        words = [token.value for token in tokens]
        with open('./data/1.pickle', 'wb') as f:
            pickle.dump(words, f, protocol=2)
        print('tokenizing FINISHED')

    # tag
    if os.path.exists('./data/2.pickle'):
        with open('./data/2.pickle', 'rb') as f:
            print('2.pickle found')
            tagged_words = pickle.load(f)
    else:
        print('could not find 2.pickle: tagging: IN PROGRESS')
        tagged_words = tagger.tag(words)
        with open('./data/2.pickle', 'wb') as f:
            pickle.dump(tagged_words, f, protocol=2)

    # filter-in As, Ns, and Vs
    if os.path.exists('./data/3.pickle'):
        with open('./data/3.pickle', 'rb') as f:
            print('3.pickle found')
            filtered_words = pickle.load(f)
    else:
        print('could not find 3.pickle: filtering: IN PROGRESS')
        parts_of_speech = ('ADJA', 'ADJD', 'NN')
        # keep every tag starting with 'V' plus the tags listed above
        filtered_words = [
            word for word in tagged_words
            if word[1][0] == 'V' or word[1] in parts_of_speech
        ]
        with open('./data/3.pickle', 'wb') as f:
            pickle.dump(filtered_words, f, protocol=2)

    # lemmatize
    if os.path.exists('./data/4.pickle'):
        with open('./data/4.pickle', 'rb') as f:
            print('4.pickle found')
            lemmatized_words = pickle.load(f)
    else:
        print('could not find 4.pickle: lematization: IN PROGRESS')
        lemmatizer = GermaLemma()
        lemmatized_words = []
        for word in filtered_words:
            try:
                lemmatized_words.append(
                    lemmatizer.find_lemma(word[0], word[1]))
            except Exception:
                # best effort: report the word and go on with the next one
                w = word[0]
                l = word[1]
                print(f"EXCEPT: {w} {l}")
                continue
        with open('./data/4.pickle', 'wb') as f:
            pickle.dump(lemmatized_words, f, protocol=2)

    # filter-out modals (only the first 1000 lines of the list are used)
    with open('./resources/modal-words.txt', 'r') as f:
        modal_words = set(f.read().splitlines()[:1000])
    non_modals = [item for item in lemmatized_words
                  if item not in modal_words]

    for word, count in Counter(non_modals).most_common(30):
        print(word + " " + str(count))
class STWRFeatureExtractor(object):
    """
    Feature extractor for classifying STWR.

    Besides the text of a segment the extractor uses a *backlog*: a list with 74 entries that carries
    information about previous segments from one call of ``transform_segment`` to the next.

    Backlog layout:
        [0:10]   labels of the ten previous segments (maintained by the caller / ``transform``)
        10       colon in the previous segment
        11       number of currently open quotes
        12       previous segment ended with a closing quote
        13       previous segment ended with a comma
        [14:38]  reporting word appearance features of the previous segment
        38, 39   candidate speaker features of the previous segment
        40, 41   token and character length of the previous segment
        42       paragraph marker of the previous segment
        [43:48]  pronoun person ('1', '3', '3_1' or '-') of the five previous segments
        48       unused
        49       number of contiguous previous segments that were in quotes
        [50:74]  reporting word count features of the previous segment
    """

    def __init__(self, sequence_features=True):
        """
        :param sequence_features: If true, use the sequence features (trained on gold labels).
        """
        # Number of features
        self.num_features = 243
        # Names of features - needed for feature inspection
        self.feature_names = self._build_feature_names()
        # Switch to turn off sequence features: drop the 13 sequence feature names that precede
        # the 8 trailing length/paragraph feature names
        self.sequence_features = sequence_features
        if not self.sequence_features:
            self.feature_names = self.feature_names[:-21] + self.feature_names[-8:]
        # Get all possible tags
        self.tag_map = sorted(NLP.vocab.morphology.tag_map.keys())
        self.pos_map = sorted(spacy.parts_of_speech.NAMES.values())
        # Set up lemmatizer
        self.lemmatizer = GermaLemma()
        # Set up RFTagger
        call(["make"], cwd="RFTagger/src")
        # Load word vectors
        print("Loading word-vectors. This may take a while ...")
        self.wordvecs = KeyedVectors.load_word2vec_format("data/word_vecs/kolimo.model", binary=True)
        print("Done.\n")

    @staticmethod
    def _build_feature_names():
        """
        Builds the list of all 243 feature names in the order in which they were originally listed.

        :return: list of feature names
        """
        tag_names = ["NNE", "TRUNC", "APPO", "VVPP", "FM", "KOUI", "ITJ", "PTKANT", "$.", "ADJA", "ADJD",
                     "PTKNEG", "PWS", "PRF", "KOUS", "PDS", "VMINF", "VVIZU", "PPOSS", "VVFIN", "VMFIN",
                     "PROAV", "PRELS", "APPR", "PPOSAT", "APZR", "$,", "PIAT", "VMPP", "NE", "_SP", "VAPP",
                     "VAIMP", "CARD", "APPRART", "NN", "KOKOM", "PWAT", "PPER", "XY", "ART", "PWAV", "KON",
                     "PTKA", "VVINF", "$(", "PDAT", "PTKZU", "PRELAT", "PIS", "PTKVZ", "VAINF", "ADV",
                     "VAFIN", "VVIMP"]
        pos_names = ["", "SCONJ", "SYM", "VERB", "X", "EOL", "SPACE", "PUNCT", "ADJ", "ADP", "ADV", "AUX",
                     "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN"]
        names = ["perc_pos_" + name for name in tag_names + pos_names]
        names += ["num_ents", "num_PER", "num_LOC", "num_ORG", "num_MISC",
                  "colon", "colon_prev", "comma_end", "perc_emph", "question",
                  "open_quote", "close_quote", "in_quotes", "num_prev_in_quotes", "punct_close_quote",
                  "close_quote_comma",
                  "perc_per1", "perc_per2", "perc_per12", "perc_per3",
                  "only_3_prev_5", "only_1_prev_5", "3_1_prev_5",
                  "has_ind", "has_subj", "no_subj", "no_ind", "has_pres", "has_past", "no_past", "no_pres",
                  "embedded", "wuerden_inf", "wuerden",
                  "has_prep_noun_comp", "has_claus_inf_comp",
                  "subj_cand_speaker", "num_cand_speaker", "prev_subj_cand_speaker", "prev_num_cand_speaker"]
        # Reporting word features: for this and the previous segment, appearance ("has") and count
        # ("num"), each for the general (13 features) and the special 'reported' word list (11 features)
        penalties = [str(penalty) for penalty in range(6)] + ["le_%d" % limit for limit in range(1, 6)]
        for prefix in ["", "prev_"]:
            for kind in ["has", "num"]:
                names += ["%s%s_rep_word_%s" % (prefix, kind, suffix) for suffix in penalties + ["noun", "verb"]]
                names += ["%s%s_spec_rep_word_%s" % (prefix, kind, suffix) for suffix in penalties]
        names += ["max_sim", "max_sim_rep",
                  "perc_deictic", "spec_conjunct", "perc_modal", "perc_neg",
                  "has_facial", "has_gesture", "has_voice", "repetition",
                  "last_direct", "last_indirect", "last_free_indirect", "last_reported",
                  "last_5_direct", "last_5_indirect", "last_5_free_indirect", "last_5_reported",
                  "last_10_direct", "last_10_indirect", "last_10_free_indirect", "last_10_reported",
                  "num_last_10_reported",
                  "len_tokens", "len_chars", "prev_len_tokens", "prev_len_chars",
                  "sum_len_tokens", "sum_len_chars", "paragraph", "prev_paragraph"]
        return names

    def _get_stw_words(self):
        """
        Loads the reporting word list on first use and caches it on the instance, so that the
        spreadsheet is not parsed again for every segment.

        :return: the reporting word list as pandas DataFrame
        """
        cached = getattr(self, "_stw_words_cache", None)
        if cached is None:
            cached = pd.read_excel("data/stw_words/stw_words_brunner2015.xls")
            self._stw_words_cache = cached
        return cached

    def _get_person_heads(self):
        """
        Loads the list of "Person" head nouns on first use and caches it on the instance.

        :return: set of head nouns, one per non-empty line of the file, without line breaks
        """
        cached = getattr(self, "_person_heads_cache", None)
        if cached is None:
            with open('data/person.txt', 'r', encoding='utf-8') as f:
                # Strip the line breaks: with them no noun chunk head can ever match an entry
                cached = {line.strip() for line in f if line.strip()}
            self._person_heads_cache = cached
        return cached

    def _average_vector(self, words):
        """
        :param words: iterable of words
        :return: the average word vector of all words that have a word vector, None if there is none
        """
        vectors = [self.wordvecs[word] for word in words if isinstance(word, str) and word in self.wordvecs]
        if not vectors:
            return None
        return numpy.average(vectors, axis=0)

    def _get_prototype_vectors(self, stw_words_orig):
        """
        Computes (once) the prototypical word vectors of the penalty-0 reporting words.

        :param stw_words_orig: the complete reporting word list
        :return: tuple (prototype of the general reporting words, prototype of the words that are
                 only usable for the reported class); an entry is None if no word has a vector
        """
        cached = getattr(self, "_prototype_vectors_cache", None)
        if cached is None:
            prototypical = stw_words_orig[stw_words_orig["Penalty"] == 0]
            only_reported = prototypical["Marker"] == 'rep'
            # Average over the entries of the "Word" column; iterating over the DataFrame itself
            # would only yield its column labels
            cached = (self._average_vector(prototypical[~only_reported]["Word"]),
                      self._average_vector(prototypical[only_reported]["Word"]))
            self._prototype_vectors_cache = cached
        return cached

    def _lemmatize(self, token):
        """
        Gets the lemma with germalemma as spacy is not good at this; only possible for pos tags
        N, V, ADJ, ADV. For all other tokens the token text itself is returned.

        :param token: spacy token
        :return: the lemma as string
        """
        if token.pos_ == "VERB":
            return self.lemmatizer.find_lemma(token.text, 'V')
        if token.pos_ == "NOUN":
            return self.lemmatizer.find_lemma(token.text, 'N')
        if token.pos_ in ["ADJ", "ADV"]:
            return self.lemmatizer.find_lemma(token.text, token.pos_)
        return token.text

    @staticmethod
    def _rep_word_flags(lemma, stw_words_orig):
        """
        Looks up a lemma in the reporting word list.

        :param lemma: the lemma to look up
        :param stw_words_orig: DataFrame with the columns "Word", "Penalty" and "Marker"
        :return: tuple (found, general, special). 'found' is True if any entry contains the lemma as a
                 whole word. 'general' holds 13 booleans for the entries whose marker is not 'rep':
                 penalty == 0..5, penalty <= 1..5, title-cased entry (noun) and lower-cased entry
                 (verb) with penalty <= 5. 'special' holds the first 11 of these booleans for the
                 entries with marker 'rep'.
        """
        pattern = r'\b{}\b'.format(re.escape(lemma))
        hits = stw_words_orig["Word"].str.contains(pattern, na=False)
        if not hits.any():
            return False, [False] * 13, [False] * 11
        matched = stw_words_orig[hits]
        only_reported = matched["Marker"] == 'rep'

        def penalty_flags(entries):
            penalties = entries["Penalty"]
            return ([bool((penalties == penalty).any()) for penalty in range(6)]
                    + [bool((penalties <= limit).any()) for limit in range(1, 6)])

        general = matched[~only_reported]
        low_penalty_words = general[general["Penalty"] <= 5]["Word"]
        general_flags = penalty_flags(general) + [bool(low_penalty_words.str.istitle().any()),
                                                  bool(low_penalty_words.str.islower().any())]
        return True, general_flags, penalty_flags(matched[only_reported])

    @staticmethod
    def _person_history_features(backlog):
        """
        :param backlog: the backlog; positions [43:48] hold the pronoun person of the five prev. segments
        :return: list of three 0/1 features: only third person, only first person, mixed first and
                 third person in the five previous segments
        """
        history = backlog[43:48]
        third = '3' in history
        first = '1' in history
        mixed = '3_1' in history
        return [int(third and not first and not mixed),
                int(first and not third and not mixed),
                int(mixed or (third and first))]

    @staticmethod
    def _record_person(backlog, has_first, has_third):
        """
        Shifts the pronoun person history in backlog[43:48] by one and stores the person usage of the
        current segment in the last position (47), i.e. inside the window that the features read.

        :param backlog: the backlog, modified in place
        :param has_first: whether the segment contains first person pronouns
        :param has_third: whether the segment contains third person pronouns
        """
        backlog[43:47] = backlog[44:48]
        if has_third:
            backlog[47] = '3_1' if has_first else '3'
        elif has_first:
            backlog[47] = '1'
        else:
            backlog[47] = '-'

    def transform(self, text, original_text=None, backlog=None):
        """
        Method that transforms the given segments into their feature representation.
        Expects dataframe with columns ["text"] and ["labels_spans"] or list of spacy tokens along with
        the original text.

        :param text: dataframe with column ["text"] that contains the string segments or list of spacy tokens.
        :param original_text: the original text as string is passed in test mode.
        :param backlog: For test mode, the backlog stores info and labels of former segments and therefore
        has to be passed back and forth between classifier and feature extractor.
        :return: The transformed segments as pandas Dataframe or list, depending on the type of 'text',
        together with the updated backlog
        :raises TypeError: if 'text' is neither a list nor a pandas DataFrame
        """
        # If the backlog has not been initialized, initialize it
        if not backlog:
            backlog = ["" for _ in range(10)] + [0 for _ in range(64)]

        # Test mode: spacy tokenization and quote annotation have already been performed
        if isinstance(text, list):
            return self.transform_segment(text, backlog, original_text)
        if not isinstance(text, pd.DataFrame):
            raise TypeError("text has to be a list of spacy tokens or a pandas DataFrame, got %s"
                            % type(text).__name__)

        # Get full text for better results in spacy parsing
        full_text = " ".join(text['text'].values)
        doc = NLP(full_text)
        # Exchange tags for quotation marks for special tokens: #OPEN_QUOTE#, #CLOSE_QUOTE#
        doc = annotate_quotes(doc)
        tokens_full_text = [token for token in doc]

        # Transform individual segments; the rows are collected in a list because DataFrame.append
        # copies the whole frame on every call and has been removed in pandas 2.0
        rows = []
        num_segments = len(text)
        offset = 0
        print("Extracting features...")
        for position, (_, row) in enumerate(text.iterrows()):
            # print progress bar (based on the row position, so that any index works)
            sys.stdout.write('\r')
            sys.stdout.write("[%-20s] %d%%" % ('=' * round(position / (num_segments / 20)),
                                              round(position / (num_segments / 100))))
            sys.stdout.flush()
            # Get the tokens corresponding to the segment:
            tokens_text = string_tokenize(row['text'])
            tokens = tokens_full_text[offset:offset + len(tokens_text)]
            # Check that this is correct
            assert tokens_text[-1] == tokens[-1].text
            offset += len(tokens_text)
            transformed, backlog = self.transform_segment(tokens, backlog, row['text'])
            rows.append(transformed)
            # Adapt backlog: backlog stores last ten classifications in the first ten positions
            backlog[0:10] = backlog[1:10] + [row['labels_spans']]
        return pd.DataFrame(rows), backlog

    def transform_segment(self, tokens, backlog, original_text):
        """
        Transforms an individual segment of tokens, given the information in the backlog, into a
        feature representation.

        :param tokens: list of spacy tokens (must not be empty)
        :param backlog: list containing information about the labels and other features of previous
        segments; it is updated in place
        :param original_text: The original text as string
        :return: the feature representation and the updated backlog
        """
        # --- Preprocessing ---
        transformed = []
        token_strings = [token.text for token in tokens]
        num_tokens = len(token_strings)
        token_lemmata = [self._lemmatize(token) for token in tokens]

        # Look up every distinct lemma once in the reporting word list
        stw_words_orig = self._get_stw_words()
        rep_flags = {}
        for lemma in token_lemmata:
            if lemma not in rep_flags:
                rep_flags[lemma] = self._rep_word_flags(lemma, stw_words_orig)

        # Do deeper morphological analysis with RFTagger; written as UTF-8 because the tagger
        # output is decoded as UTF-8 as well
        with open("RFTagger/temp.txt", "w", encoding="utf-8") as tagger_input:
            tagger_input.write("\n".join(token_strings))
        morph_tagged = check_output(["src/rft-annotate", "lib/german.par", "temp.txt"], cwd="RFTagger",
                                    stderr=FNULL).decode("utf-8").split("\n")
        # Split morph tags into attributes
        morph_tagged = [morph_tag.split("\t")[1].split(".") if morph_tag != "" else morph_tag
                        for morph_tag in morph_tagged]

        # --- Pos tag features ---
        tags = [token.tag_ for token in tokens]
        pos = [token.pos_ for token in tokens]
        transformed += [(tags.count(tag) / len(tags)) if tag in tags else 0 for tag in self.tag_map]
        transformed += [(pos.count(p) / len(pos)) if p in pos else 0 for p in self.pos_map]

        # --- NE features ---
        doc = NLP(original_text)
        transformed.append(len(doc.ents))
        for ne_type in NE_TYPES:
            transformed.append(int(any(ent.label_ == ne_type for ent in doc.ents)))

        # --- Special token features ---
        # Colon in this or in previous segment?
        colon_this = int(":" in token_strings)
        transformed.append(colon_this)
        transformed.append(backlog[10])
        # Comma at the end of this segment means that the next segment is an embedded sentence if it has a verb
        comma_end = int(tags[-1] == '$,')
        transformed.append(comma_end)
        # Percentage of 'emphatic' punctuation marks: ?,!,-,–
        transformed.append(sum(token_strings.count(mark) for mark in ['?', '!', '-', '–']) / num_tokens)
        # Question?
        transformed.append(int('?' in token_strings))
        # Quotes features: opening / closing quotes in this segment? In quotes?
        open_quote = tags.count("#OPEN_QUOTE#")
        close_quote = tags.count("#CLOSE_QUOTE#")
        in_quotes = int(backlog[11] > 0 or open_quote > 0)
        transformed.append(open_quote)
        transformed.append(close_quote)
        transformed.append(in_quotes)
        # How many contiguous prev. segments have been in quotes so far? This is meant to tackle errors
        # bc of missing closing quotes as well as marking sequences of embedded narration
        transformed.append(backlog[49])
        # Special combinations direct - full quoted sentence (sent. ending punct. before closing quotes),
        # comma after closing quotes (prob. frame of direct speech)
        transformed.append(int(any(tag == "#CLOSE_QUOTE#" and i > 0 and tags[i - 1] == "$."
                                   for i, tag in enumerate(tags))))
        transformed.append(int((backlog[12] == 1 and token_strings[0] == ",")
                               or any(tag == "#CLOSE_QUOTE#" and i < num_tokens - 1 and token_strings[i + 1] == ","
                                      for i, tag in enumerate(tags))))

        # --- Morphological Features ---
        def pronouns_of(persons):
            # Personal, possessive and reflexive pronouns of the given persons. The person is the
            # fourth attribute of the morph tag, so the tag needs at least four attributes.
            return [morph_tag for morph_tag in morph_tagged
                    if len(morph_tag) > 3 and morph_tag[0] == 'PRO'
                    and morph_tag[1] in ['Pers', 'Pos', 'Refl'] and morph_tag[3] in persons]

        per1 = pronouns_of(['1'])
        per2 = pronouns_of(['2'])
        per12 = pronouns_of(['1', '2'])
        per3 = pronouns_of(['3'])
        # percentage of first and second person pronouns
        transformed.append(len(per1) / num_tokens)
        # Second person might be a better feature than 1. and 2. together as it is seldom the
        # perspective of a narrative
        transformed.append(len(per2) / num_tokens)
        transformed.append(len(per12) / num_tokens)
        # percentage of third person pronouns
        transformed.append(len(per3) / num_tokens)
        # Note changes in the usage of person; this might help to distinguish between third and first
        # person perspective narratives: only third / only first / mixed person in prev. five segments
        transformed += self._person_history_features(backlog)

        # tempus and modus features of the finite verbs
        finite_verbs = [morph_tag for morph_tag in morph_tagged if len(morph_tag) > 5 and morph_tag[0] == 'VFIN']
        tenses = [morph_tag[4] for morph_tag in finite_verbs]
        moods = [morph_tag[5] for morph_tag in finite_verbs]
        has_ind = int('Ind' in moods)
        has_subj = int('Subj' in moods)
        has_pres = int('Pres' in tenses)
        has_past = int('Past' in tenses)
        # has_ind, has_subj, no_subj, no_ind, has_pres, has_past, no_past, no_pres
        transformed += [has_ind, has_subj, 1 - has_subj, 1 - has_ind, has_pres, has_past, 1 - has_past, 1 - has_pres]

        # --- Grammatical features ---
        # Comma at the end of the prev. segment means that this segment is an embedded sentence if it has a verb
        # NOTE(review): 'VFIN' is not an STTS tag; presumably 'VVFIN' (and 'VMFIN') was meant - confirm
        # before changing, as it alters the feature.
        transformed.append(int(bool(backlog[13] and any(tag in ['VFIN', 'VAFIN'] for tag in tags))))
        # A form of verb 'würden' + infinitive can be a pointer towards free indirect
        has_wuerden = any(lemma == 'würden' for lemma in token_lemmata)
        has_infinitive = any(tag in ['VAINF', 'VMINF', 'VVINF', 'VVIZU'] and token_lemmata[i] != 'würden'
                             for i, tag in enumerate(tags))
        transformed.append(int(has_wuerden and has_infinitive))
        transformed.append(int(has_wuerden))
        # Noun/prepositional complements of a rep. word point toward reported STW,
        # sentence/infinitive complements point towards indirect STW
        all_stw_words = [token for token, lemma in zip(tokens, token_lemmata) if rep_flags[lemma][0]]
        has_prep_noun_comp = int(any(child.pos_ in ['ADP', 'PROPN', 'NOUN'] and child.dep_.startswith('o')
                                     for rep_word in all_stw_words for child in rep_word.children))
        has_claus_inf_comp = int(any(child.dep_ == 'oc'
                                     for rep_word in all_stw_words for child in rep_word.children))
        transformed.append(has_prep_noun_comp)
        transformed.append(has_claus_inf_comp)

        # --- Possible speaker features ---
        # Is subject a pronoun, a person NE or a "Person" head noun -> possible speaker.
        # Person NEs are matched by token text against the person entities of this segment, as the
        # tokens may stem from a different parse than 'doc'.
        person_ne_tokens = {ne_token.text for ent in doc.ents if ent.label_ == 'PER' for ne_token in ent}
        cand_speakers = [token for token, tag in zip(tokens, tags)
                         if tag in ['PPER', 'PIS', 'PDS']
                         or (tag in ['NE', 'NNE'] and token.text in person_ne_tokens)]
        # Check whether any noun phrase has a head that is a synset of "Person" in Germanet
        person_heads = self._get_person_heads()
        for noun_chunk in doc.noun_chunks:
            if noun_chunk.root.text in person_heads:
                cand_speakers.append(noun_chunk.root)
        subj_cand_speaker = [token for token in cand_speakers if token.dep_ == 'sb']
        # How many possible speakers/addressees are there in relation to the segment length?
        num_cand_speaker = len(cand_speakers) / num_tokens
        transformed.append(int(len(subj_cand_speaker) > 0))
        transformed.append(num_cand_speaker)
        # Append prev. segments candidate speaker features
        transformed.append(backlog[38])
        transformed.append(backlog[39])

        # --- Reporting word features ---
        # Number of tokens whose lemma is a reporting word: by penalty (== 0..5), lower or equal a
        # certain penalty (<= 1..5) and, for the general list, noun/verb reporting words -> this might
        # be interesting to differentiate 'reported' from 'direct/'indirect'
        num_rep_word = [sum(int(rep_flags[lemma][1][i]) for lemma in token_lemmata) for i in range(13)]
        # The same for the special reporting words for the reported class
        num_spec_rep_word = [sum(int(rep_flags[lemma][2][i]) for lemma in token_lemmata) for i in range(11)]
        # Appearance of reporting words
        has_rep_word = [int(count > 0) for count in num_rep_word]
        has_spec_rep_word = [int(count > 0) for count in num_spec_rep_word]
        transformed += has_rep_word + has_spec_rep_word + num_rep_word + num_spec_rep_word
        # Reporting word features prev. segment
        transformed += backlog[14:38]
        transformed += backlog[50:74]

        # Word vectors
        # Get prototypical word vectors for reporting words and for the reported class
        proto_rep_vec, proto_rep_vec_reporting = self._get_prototype_vectors(stw_words_orig)
        # Append highest similarity values to proto word vectors within the segment
        max_sim = .0
        max_sim_rep = .0
        for lemma in set(token_lemmata):
            if lemma in self.wordvecs:
                lemma_vec = self.wordvecs[lemma]
                # cosine similarity = 1 - cosine distance
                if proto_rep_vec is not None:
                    max_sim = max(max_sim, 1 - distance.cosine(lemma_vec, proto_rep_vec))
                if proto_rep_vec_reporting is not None:
                    max_sim_rep = max(max_sim_rep, 1 - distance.cosine(lemma_vec, proto_rep_vec_reporting))
        transformed.append(max_sim)
        transformed.append(max_sim_rep)

        # --- Other word features ---
        # Usage of deictic words can point to character speech - percentage of deictic words
        transformed.append(len([t for t in token_strings if t in DEICTIC]) / num_tokens)
        # Usage of special conjunction at the beginning of the segment can point to indirect
        transformed.append(int(token_strings[0] in CONJUNCT))
        # Usage of modal particles can point towards character speech
        transformed.append(len([t for t in token_strings if t in MODAL_PART]) / num_tokens)
        # Negation?
        transformed.append(len([lemma for lemma in token_lemmata if lemma in NEG]) / num_tokens)
        # Words describing facial expressions, gestures, voice might hint towards STWR
        transformed.append(int(any(lemma in FACIAL for lemma in token_lemmata)))
        transformed.append(int(any(lemma in GESTURE for lemma in token_lemmata)))
        transformed.append(int(any(lemma in VOICE for lemma in token_lemmata)))
        # The repetition of words can hint towards figural speech
        transformed.append(int(len(set(token_lemmata)) < len(token_lemmata)))

        # --- Sequential features ---
        if self.sequence_features:
            stwr_types = ['direct', 'indirect', 'free_indirect', 'reported']
            # Labels of prev. segment (every third comma separated entry of a label string is a label)
            labels_last = backlog[9].split(",")[::3]
            for stwr_type in stwr_types:
                transformed.append(int(any(label.startswith(stwr_type) for label in labels_last)))
            # Label appears in 5 prev. segments
            labels_last_5 = [label for labels in backlog[5:10] for label in labels.split(",")[::3]]
            for stwr_type in stwr_types:
                transformed.append(int(any(label.startswith(stwr_type) for label in labels_last_5)))
            # How many labels for each class and overall within the last 10 segments
            labels_last_10 = [label for labels in backlog[0:10] for label in labels.split(",")[::3]
                              if label != ""]
            for stwr_type in stwr_types:
                transformed.append(len([label for label in labels_last_10 if label.startswith(stwr_type)]))
            transformed.append(len(labels_last_10))

        # --- Other features ---
        # Segment and character lengths
        transformed.append(num_tokens)
        transformed.append(len(original_text))
        # Segment and character lengths of prev. segment
        transformed.append(backlog[40])
        transformed.append(backlog[41])
        # Segment and character lengths of this + prev. segment
        transformed.append(num_tokens + backlog[40])
        transformed.append(len(original_text) + backlog[41])
        # Is this segment at the start or end of a paragraph?
        paragraph_end = int("<p>" in original_text)
        transformed.append(paragraph_end)
        transformed.append(backlog[42])

        # --- Update Backlog ---
        # [0:10] encode labels of previous ten segments -> updated elsewhere
        # 10: Colon in prev. segment
        backlog[10] = colon_this
        # 11: How many open quotes (never below zero)
        backlog[11] = max(backlog[11] + open_quote - close_quote, 0)
        # 12: Prev. segment ends with close_quote
        backlog[12] = int(tags[-1] == "#CLOSE_QUOTE#")
        # 13: Comma at the end of this segment
        backlog[13] = comma_end
        # [14:38] reporting word appearance features prev. segment
        backlog[14:38] = has_rep_word + has_spec_rep_word
        # 38: Candidate speakers as subject
        backlog[38] = int(len(subj_cand_speaker) > 0)
        # 39: Percentage of candidate speakers
        backlog[39] = num_cand_speaker
        # 40, 41: lengths of prev. segment
        backlog[40] = num_tokens
        backlog[41] = len(original_text)
        # 42: paragraph end
        backlog[42] = paragraph_end
        # [43:48]: keep track of pronoun person appearances in the 5 prev. segments
        self._record_person(backlog, bool(per1), bool(per3))
        # 49: How many contiguous prev. segments have been in quotes?
        if in_quotes:
            backlog[49] += 1
        else:
            backlog[49] = 0
        # [50:74] reporting word count features prev. segment
        backlog[50:74] = num_rep_word + num_spec_rep_word
        return transformed, backlog
match_sd = series.std() match_se = match_sd / sqrt(len(eval_df)) ci_upper = match_mean + 1.96 * match_se ci_lower = match_mean - 1.96 * match_se return match_mean * 100, ci_lower * 100, ci_upper * 100 print("loading data...") eval_df = pd.read_csv('eval_table/eval_table_lemmata.csv') eval_df = eval_df.loc[~eval_df.lemma.isna(), :] print('loaded %d rows' % len(eval_df)) lemmatizer = GermaLemma() eval_df['germalemma'] = eval_df.apply(lambda row: lemmatizer.find_lemma(row[3], row[2]), axis=1) eval_df['match'] = eval_df.lemma == eval_df.germalemma eval_df.head() print('wrong lemmata:') print(eval_df.loc[~eval_df.match, ['token', 'pos', 'lemma', 'germalemma']]) match_mean, ci_lower, ci_upper = get_mean_and_ci(eval_df.match) print('Success rate for germalemma: %.2f%% (95%% CI: [%.2f%%, %.2f%%])' % (match_mean, ci_lower, ci_upper)) eval_df['pattern'] = eval_df.apply(lambda row: lemma_via_patternlib(row[3], row[2]), axis=1) eval_df['match_pattern'] = eval_df.lemma == eval_df.pattern
def comment_to_topic(comment):
    """Extract lower-cased topic lemmas from a German comment.

    The comment is split into sentences and tokens, non-alphabetic tokens are
    dropped, the remaining tokens are POS-tagged with the pickled NLTK tagger
    and lemmatised with GermaLemma. Lemmas that are German stop words (NLTK
    list plus ``reviews/add-stopwords.txt``) are removed.

    :param comment: Raw comment text.
    :return: Tuple ``(topics, topics_safe)``; ``topics_safe`` holds the same
        words with umlauts and ``ß`` transliterated to ASCII.
    """
    lemmatizer = GermaLemma()

    # Project-specific stop words, one per line. The ``with`` block closes the
    # handle, which the previous bare ``open()`` never did.
    with open('reviews/add-stopwords.txt', encoding="utf-8") as stop_file:
        remove = [line.rstrip('\n') for line in stop_file]
    # A set gives O(1) membership tests in the stop-word filter below.
    exclude_words = set(remove) | set(stopwords.words('german'))

    exclude = {
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
        '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~'
    }

    # NOTE(security): pickle.load executes arbitrary code contained in the
    # file; only ship tagger pickles from a trusted source.
    with open('reviews/nltk_german_classifier_data.pickle', 'rb') as f:
        tagger = pickle.load(f)

    lemmas = []
    for sentence in nltk.sent_tokenize(comment):
        # Tokenise, then drop punctuation and anything not purely alphabetic.
        tokens = [
            token for token in nltk.word_tokenize(sentence)
            if token not in exclude and token.isalpha()
        ]
        for word, pos_tag in tagger.tag(tokens):
            try:
                lemmas.append(lemmatizer.find_lemma(word, pos_tag))
            except ValueError:
                # The lemmatizer rejected this word/tag pair; such tokens are
                # deliberately left out of the topic list.
                continue

    lemmas = [word.lower() for word in lemmas]
    topics = [word for word in lemmas if word not in exclude_words]

    # Transliterate umlauts/ß so the topics can be used in HTML/URLs.
    topics_safe = [
        t.replace('ä', 'ae').replace('ü', 'ue').replace('ö', 'oe').replace('ß', 'ss')
        for t in topics
    ]
    return topics, topics_safe
""" from processor import TextRank as tcf import re import numpy as np import pandas as pd from nltk.corpus import stopwords from nltk.stem.porter import PorterStemmer from nltk.stem.wordnet import WordNetLemmatizer from germalemma import GermaLemma #Further Pacakges for Preprocessing https://github.com/jfilter/german-preprocessing #Germanlemma: https://github.com/WZBSocialScienceCenter/germalemma #------------------------------Paramterter und Bezeichnung--------------------- gerLem = GermaLemma() lem = WordNetLemmatizer() stem = PorterStemmer() #---------------------------------Funktionen------------------------------------------- def PrePross(ListofSentences,_comma=False, Fuzzy=False, FuzzyRank=False, _reversed = False, Remove_specCar = False, IgnoreWord_list = [None], stem=False, stopwords=[]): ''' Funktion um den Text vorbereiten. Braucht einen Dataframe und den Columnnamen, #indem sich die texte befinden. Im Args: ListofSentences (): Liste mit Textdaten _comma (): Bol - soll
class SentimentDetector:
    """Look up sentiment for aspect tokens via their dependency neighbours.

    For every aspect token the detector parses the containing sentence with
    spaCy, collects adjectival/adverbial descriptors attached to the token (or
    to a verbal ancestor) and scores them against a sentiment lexicon.
    """

    def __init__(self, path: str = "src/data/", windowSize=5) -> None:
        """
        Args:
            path (str, optional): Directory prefix for all CSV files. It is
                concatenated with file names as-is. Defaults to "src/data/".
            windowSize (int, optional): Stored on the instance; not read by
                any method of this class. Defaults to 5.
        """
        self.path = path
        self.windowSize = windowSize
        self.df_aspect_tokens = None
        self.df_preprocessed = None
        self.df_lexicon = None
        self.lemmatizer = GermaLemma()

    def downloadLexicon(
        self,
        filename: str = "sentiment_lexicon.csv",
        url: str = "https://raw.githubusercontent.com/sebastiansauer/pradadata/master/data-raw/germanlex.csv",
        chunk_size: int = 1024,
    ) -> None:
        """
        Download the sentiment lexicon to ``self.path + filename``.

        Args:
            filename (str, optional): Defaults to "sentiment_lexicon.csv".
            url (str, optional): Defaults to the germanlex.csv of the
                pradadata repository.
            chunk_size (int, optional): Bytes per streamed chunk.
                Defaults to 1024.

        Raises:
            requests.RequestException: on connection problems or a non-2xx
                response (an IOError subclass, so loadCSVs reports it).
        """
        r = requests.get(url, stream=True)
        # Do not write an HTTP error page to disk as if it were the lexicon.
        r.raise_for_status()
        # Content-Length is optional; int(None) used to raise TypeError.
        content_length = r.headers.get("Content-Length")
        total_bytes = int(content_length) if content_length is not None else None
        # The bar is advanced by len(chunk), so its total must be in bytes too
        # (it used to be the number of chunks).
        with tqdm(total=total_bytes,
                  desc="Downloading Lexicon...",
                  unit="B",
                  unit_scale=True) as downloadProgress:
            with open(self.path + filename, "wb") as fd:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    downloadProgress.update(len(chunk))
                    fd.write(chunk)

    def loadCSVs(
        self,
        tokenFilename: str = "data_aspects_tokens.csv",
        preprocessedFilename: str = "data_preprocessed.csv",
        lexiconFilename: str = "sentiment_lexicon.csv",
    ) -> bool:
        """
        load all necessary CSV for execution of the detector and set indices as appropriate.
        Frames that are already loaded and non-empty are left untouched.

        Args:
            tokenFilename (str, optional): Defaults to "data_aspects_tokens.csv".
            preprocessedFilename (str, optional): Defaults to "data_preprocessed.csv".
            lexiconFilename (str, optional): Defaults to "sentiment_lexicon.csv".

        Returns:
            bool: successful execution (False if an IOError occurred)
        """
        try:
            if self.df_aspect_tokens is None or self.df_aspect_tokens.empty:
                self.df_aspect_tokens = PD.read_csv(self.path + tokenFilename)
                # Each row needs its own empty list to collect results in.
                # Assigning the lists directly replaces the former chained
                # ``fillna(..., inplace=True)`` trick, which does not reliably
                # write back to the frame.
                for column in ("polarity_strength", "sentiment_words",
                               "intensifier_words"):
                    self.df_aspect_tokens[column] = [
                        [] for _ in range(len(self.df_aspect_tokens))
                    ]
                self.df_aspect_tokens["word_found"] = self.df_aspect_tokens[
                    "word_found"].str.replace(r"[^\w]*", "", regex=True)

            if self.df_preprocessed is None or self.df_preprocessed.empty:
                self.df_preprocessed = PD.read_csv(self.path +
                                                   preprocessedFilename)
                # pandas read_csv does not read arrays correctly so we need to adjust those
                tqdm.pandas(desc="Applying Datatype Transformations....")
                self.df_preprocessed["tokens"] = self.df_preprocessed[
                    "tokens"].progress_apply(json.loads)

            if self.df_lexicon is None or self.df_lexicon.empty:
                if not os.path.exists(self.path + lexiconFilename):
                    # Download to the file that is read next; the default
                    # name used to be used even for a custom lexiconFilename.
                    self.downloadLexicon(filename=lexiconFilename)
                self.df_lexicon = PD.read_csv(self.path + lexiconFilename)
                self.df_lexicon.drop_duplicates(subset=["word", "qualifier"],
                                                inplace=True)
                self.df_lexicon.set_index("word", inplace=True)
                # drop() returns a new frame; the result used to be discarded.
                # errors="ignore" keeps lexica without a "%%" row loadable.
                self.df_lexicon = self.df_lexicon.drop("%%", errors="ignore")
            return True
        except IOError as e:
            print(e)
            return False

    def loadSpacyModel(
        self,
        model: str = "de_core_news_lg",
        disableList: list[str] = ["ner", "textcat"],
    ) -> bool:
        """
        load the spacy model with required modes; if the model is not
        installed, try to download it once and load it again.

        Args:
            model (str, optional): name of the model. Defaults to "de_core_news_lg".
            disableList (list[str], optional): pipeline components to disable.
                Defaults to ["ner", "textcat"]. The list is only passed on,
                never modified.

        Returns:
            bool: True if the model was loaded, False if the download failed
        """
        try:
            self.nlp = spacy.load(model, disable=disableList)
            return True
        except OSError:
            print("Model not found. Attempting to download..")
            try:
                spacy.cli.download(model)
            except Exception as e:
                print(e)
                return False
            self.nlp = spacy.load(model, disable=disableList)
            return True

    def checkValidChild(self, child, childType: ChildType) -> bool:
        """
        Decide whether a token qualifies as descriptor or intensifier.

        Args:
            child (spacy.Token): token with 'pos_' and 'tag_' set
            childType (ChildType): role to test for

        Returns:
            bool: True if the POS/tag combination fits the role; False
            otherwise (an unknown role is also reported on stdout)
        """
        if childType == ChildType.DESCRIPTOR:
            return (child.tag_ == "ADJA" and child.pos_ == "ADJ") or (
                child.pos_ == "ADV" and child.tag_ == "ADJD")
        if childType == ChildType.INTENSIFIER:
            return child.pos_ == "ADJ" or child.pos_ == "ADV"
        print("Wrong childType.")
        return False

    def _lookupLexEntry(self, child):
        """Return the lexicon entry for a token (exact, lower-cased, then lemma) or None."""
        import re

        # str.replace() treats its pattern literally, so the former
        # ``child.text.replace(r"[^\w]*", "")`` never stripped anything.
        child_normalized = re.sub(r"[^\w]+", "", child.text)
        if not child_normalized:
            return None
        lexEntry = self.checkLexicon(child_normalized)
        if lexEntry is None:
            lexEntry = self.checkLexicon(child_normalized.lower())
        if lexEntry is None:
            lemma = self.lemmatizer.find_lemma(child_normalized, child.pos_)
            lexEntry = self.checkLexicon(lemma)
        return lexEntry

    def checkPolarityAdjective(self, child, rowIdx) -> float:
        """
        check if the given word has an entry in the sentiment lexicon and return given polarity strength

        Args:
            child (spacy.Token): tokenized word with tagged 'pos_' and 'text'
            rowIdx: row label of the aspect token (not used here)

        Returns:
            pol_strength (float): signed polarity strength from the lexicon;
            1 if the word is not in the lexicon; 0 if it has several entries
            but none is qualified "POS" or "NEG"
        """
        lexEntry = self._lookupLexEntry(child)
        if lexEntry is None:
            # NOTE(review): unknown descriptors count as +1 - confirm this
            # default is intended.
            return 1
        qualifiers = lexEntry["qualifier"]
        if isinstance(qualifiers, str):
            # Exactly one lexicon row matched.
            pol_strength = lexEntry["polarity_strength"]
            if qualifiers == "NEG":
                return -pol_strength
            return pol_strength
        # Several rows share the word: take the first POS/NEG one. All rows
        # carry the same index label, so select by position.
        for i, qualifier in enumerate(qualifiers.values):
            if qualifier == "POS":
                return lexEntry["polarity_strength"].iloc[i]
            if qualifier == "NEG":
                return -lexEntry["polarity_strength"].iloc[i]
        return 0

    def checkLexicon(self, word) -> PD.Series:
        """
        Check for valid lexicon entries return None if not found

        Args:
            word (str): word to be use as key

        Returns:
            PD.Series: what ``df_lexicon.loc[word]`` yields for the key (a
            Series for one row, a DataFrame for several) or None
        """
        try:
            return self.df_lexicon.loc[word]
        except KeyError:
            return None

    def checkForIntensifier(self, child, rowIdx) -> float:
        """
        Check whether the given token is an intensifier ("INT") or shifter
        ("SHI") and, if so, record it for the aspect row and return its factor.

        Args:
            child (spacy.Token): tokenized word with tagged 'pos_' and 'text'
            rowIdx: row label in df_aspect_tokens whose "intensifier_words"
                list receives the word

        Returns:
            polarity_multiplier (float): lexicon strength for "INT", -1 for
            "SHI", 1 if the word is neither or unknown
        """
        lexEntry = self._lookupLexEntry(child)
        # catch words that are not in the sentiment lexicon
        if lexEntry is None:
            return 1
        qualifiers = lexEntry["qualifier"]
        if isinstance(qualifiers, str):
            if qualifiers == "INT":
                self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                    child.text)
                return lexEntry["polarity_strength"]
            if qualifiers == "SHI":
                self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                    child.text)
                return -1
            return 1
        for i, qualifier in enumerate(qualifiers.values):
            # TODO currently the first qualifier found is taken, without considering which the most fitting one is
            if qualifier == "INT":
                self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                    child.text)
                return lexEntry["polarity_strength"].iloc[i]
            if qualifier == "SHI":
                self.df_aspect_tokens["intensifier_words"][rowIdx].append(
                    child.text)
                return -1
        return 1

    def calcTotalPolarityStrength(self, child, rowIdx) -> float:
        """
        Calculate the total polarity for a given word

        Args:
            child (spacy.Token): the tokenized word with tagged 'pos_' and 'text'
            rowIdx: row label of the aspect token the word belongs to

        Returns:
            polarity_strength (float): the word's own polarity multiplied by
            the factor of every intensifier among its children
        """
        polarity_strength = self.checkPolarityAdjective(child, rowIdx)
        # find intensifier in children and multiply their strength to the polarity
        for c in child.children:
            if self.checkValidChild(c, ChildType.INTENSIFIER):
                polarity_strength *= self.checkForIntensifier(c, rowIdx)
        return polarity_strength

    def _recordDescriptor(self, child, rowName) -> None:
        """Score one descriptor and append score and word to the aspect row."""
        pol_strength = self.calcTotalPolarityStrength(child, rowName)
        self.df_aspect_tokens["polarity_strength"][rowName].append(
            pol_strength)
        self.df_aspect_tokens["sentiment_words"][rowName].append(child.text)

    def detectSentiment(self, rowDF: PD.Series) -> None:
        """
        Find the first descriptor for one aspect token and record its polarity.

        Direct children of the token are tried first; if none qualifies, the
        children of its verbal/auxiliary ancestors are searched.

        Args:
            rowDF (PD.Series): row of df_aspect_tokens; reads "reviewnumber",
                "sent_idx" and "word_idx"
        """
        doc = self.nlp(" ".join(self.df_preprocessed.iloc[
            rowDF["reviewnumber"]]["tokens"][rowDF["sent_idx"]]))
        aspect_token = doc[rowDF["word_idx"]]
        for child in aspect_token.children:
            if self.checkValidChild(child, ChildType.DESCRIPTOR):
                self._recordDescriptor(child, rowDF.name)
                return
        for token in aspect_token.ancestors:
            if token.pos_ == "AUX" or token.pos_ == "VERB":
                for child in token.children:
                    if self.checkValidChild(child, ChildType.DESCRIPTOR):
                        self._recordDescriptor(child, rowDF.name)
                        return

    def convert_polarity(self, qualifier, polarity):
        """
        Average polarity values after negating those qualified "NEG".

        Args:
            qualifier: sequence of qualifier strings
            polarity: sequence of polarity values, indexable like qualifier

        Returns:
            mean of the signed values (NaN for empty input)
        """
        sentiment_polarity = []
        for i, elem in enumerate(qualifier):
            if elem == "NEG":
                sentiment_polarity.append(polarity[i] * -1)
            else:
                sentiment_polarity.append(polarity[i])
        return NP.mean(NP.array(sentiment_polarity))

    def createReadableOutput(self, rowDF):
        """
        Append one (review_number, sentiment) row to self.overall_sentiment.

        Args:
            rowDF (PD.Series): row with "reviewnumber", "qualifier" and
                "polarity_strength"
        """
        # NOTE(review): "qualifier" is not created by this class - presumably
        # it comes from the token CSV; verify against the data.
        appenddict = {
            "review_number":
            rowDF["reviewnumber"],
            "sentiment":
            self.convert_polarity(rowDF["qualifier"],
                                  rowDF["polarity_strength"]),
        }
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        self.overall_sentiment = PD.concat(
            [self.overall_sentiment,
             PD.DataFrame([appenddict])],
            ignore_index=True)

    def returnSentimentsforReviews(self) -> PD.DataFrame:
        """
        Aggregate the per-token sentiments to one mean value per review.

        Returns:
            PD.DataFrame: columns "review_number", "sentiment" and
            "review_text" (taken from df_preprocessed["text_normalized"])
        """
        self.overall_sentiment = PD.DataFrame(
            columns=["review_text", "sentiment"])
        tqdm.pandas(desc="Calculating Sentiments")
        self.df_aspect_tokens.progress_apply(self.createReadableOutput,
                                             axis=1)
        # Average only the numeric sentiment column; the still-empty
        # review_text column must not take part in mean().
        sentiments = self.overall_sentiment.astype({"sentiment": float})
        self.overall_sentiment = sentiments.groupby(
            "review_number")["sentiment"].mean().reset_index()
        review_numbers = self.overall_sentiment["review_number"].astype(
            int).tolist()
        self.overall_sentiment["review_text"] = self.df_preprocessed[
            "text_normalized"][review_numbers].tolist()
        return self.overall_sentiment

    def run(self) -> bool:
        """
        run all basic functions of the detector

        Returns:
            bool: successful execution of command
        """
        if not self.loadCSVs():
            print("Couldn't load CSV's.")
            return False
        if not self.loadSpacyModel():
            # Used to be a bare ``return`` (None) despite the bool contract.
            return False

        # Attach the gold label: the value stored for the row's aspect column
        # in the corresponding review.
        true_labels = []
        for _, row in self.df_aspect_tokens.iterrows():
            true_labels.append(
                self.df_preprocessed.iloc[row["reviewnumber"]][row["aspect"]])
        self.df_aspect_tokens["true_label"] = true_labels

        tqdm.pandas(desc="Looking up Sentiments...")
        self.df_aspect_tokens.progress_apply(self.detectSentiment, axis=1)
        # Success used to fall off the end and return None, which callers
        # testing ``if not detector.run()`` read as failure.
        return True

    def saveCSV(self, filename: str = "data_aspects_tokens.csv"):
        """
        Write df_aspect_tokens to ``self.path + filename`` with the
        "sentiment_words" lists JSON-encoded.

        Args:
            filename (str, optional): Defaults to "data_aspects_tokens.csv".
        """
        # Encode on a copy: the live frame keeps real lists, so further
        # detection still works and a second save does not double-encode.
        output = self.df_aspect_tokens.copy()
        output["sentiment_words"] = output["sentiment_words"].apply(
            json.dumps)
        output.to_csv(self.path + filename, index=False)
def postprocess_spans(row, cl=None):
    """
    Method for better span detection as a postprocessing step after STWR classification.

    Depending on the class, spans are derived from quotation marks/colons
    (direct) or from reporting words and their dependents (indirect,
    reported). Free indirect instances are returned unchanged.

    :param row: Each row consists of a label (format:"direct_speech,2,10") and a text.
    :param cl: label of the positive class instances.
    :return: The updated label: comma-joined "cl,start_char,end_char" triples, or the
        original label if no span was found.
    """
    label = row.values[0]
    # Only do postprocessing for detected instances
    if label == "":
        return label
    text = row.values[1]
    doc = NLP(text)
    tokens = list(doc)

    # Get lemmata with germalemma as spacy is not good at this, only possible for pos tags N, V, ADJ, ADV
    germalemma_pos = {"VERB": "V", "NOUN": "N", "ADJ": "ADJ", "ADV": "ADV"}
    lemmatizer = GermaLemma()
    token_lemmata = [
        lemmatizer.find_lemma(token.text, germalemma_pos[token.pos_])
        if token.pos_ in germalemma_pos else token.text for token in tokens
    ]

    # Prepare information
    only_opening_quotes = [
        opening for opening, closing in QUOTATION_MARKS.items()
        if opening != closing
    ]
    only_closing_quotes = [
        closing for opening, closing in QUOTATION_MARKS.items()
        if opening != closing
    ]
    # Do not treat apostrophes as possible quotation marks -> too risky
    both_quotes = [
        opening for opening, closing in QUOTATION_MARKS.items()
        if opening == closing and opening != '\u0027'
    ]
    # Find quotation marks that can either be an opening or a closing quote but that don't have the same form as their counter part
    both = [qu for qu in only_opening_quotes if qu in only_closing_quotes]
    only_opening_quotes = [qu for qu in only_opening_quotes if qu not in both]
    # The opening list used to be filtered a second time here, so ambiguous
    # marks stayed in the closing list and were never tagged BOTH_QUOTES.
    only_closing_quotes = [qu for qu in only_closing_quotes if qu not in both]
    both_quotes = both_quotes + both

    # Load reporting word list
    stw_words_all = pd.read_excel("data/stw_words/stw_words_brunner2015.xls")
    # Only use words with penalty value up to 3
    stw_words_all = stw_words_all[stw_words_all['Penalty'] <= 3]
    # Some words are only usable for reported class
    stw_words = stw_words_all[stw_words_all['Marker'] != 'rep']

    def is_reporting_word(lemma, word_list):
        # na=False: a missing word must not count as a match (NaN is truthy).
        return any(word_list["Word"].str.contains(
            r'\b{}\b'.format(re.escape(lemma)), na=False))

    spans = []
    if cl == 'direct':
        # Search for quotation marks and try to decide whether they signify quoted STWR. Use conservative heuristics.
        for token in tokens:
            # Mark different candidates for quotation marks
            if token.text in only_opening_quotes:
                token.tag_ = "ONLY_OPENING_QUOTE"
            elif token.text in only_closing_quotes:
                token.tag_ = "ONLY_CLOSING_QUOTE"
            elif token.text in both_quotes:
                token.tag_ = "BOTH_QUOTES"

        stack = []
        for idx, token in enumerate(tokens):
            if token.tag_ == "ONLY_OPENING_QUOTE":
                stack.append((idx, token.text, token.tag_))
            elif token.tag_ in ["ONLY_CLOSING_QUOTE", "BOTH_QUOTES"]:
                # Check whether there is a matching opening quote on the stack
                found = False
                for i in range(len(stack) - 1, -1, -1):
                    top = stack[i]
                    if QUOTATION_MARKS[top[1]] == token.text:
                        found = True
                        # Closing quotes are usually preceded by sentence ending punctuation
                        if tokens[idx - 1].tag_ == '$.':
                            spans.append((top[0], idx))
                        stack = stack[:i]
                        break
                if not found:
                    # If no opening quotes were found and clear closing quotes are preceded by sentence ending punctuation,
                    # assume everything before is quoted
                    if token.tag_ == "ONLY_CLOSING_QUOTE" and idx > 0 and tokens[
                            idx - 1].tag_ == '$.':
                        spans.append((0, idx))
                    # If ambiguous quotation mark is found, decide whether it's opening or closing
                    elif token.tag_ == "BOTH_QUOTES":
                        if idx > 0 and tokens[idx - 1].tag_ == '$.':
                            spans.append((0, idx))
                        else:
                            stack.append((idx, token.text, token.tag_))

        # Check for open quotes in the stack
        if len(stack) > 0:
            # Choose first open quote in stack
            # Opening quotes are usually followed by capital letters (except continuing quotations, these are ignored here)
            opening = stack[0]
            if opening[0] < len(tokens) - 2:
                if tokens[opening[0] + 1].text.istitle():
                    spans.append((opening[0], len(tokens) - 1))

        # In case no quotation marks are there, look for colon
        if len(spans) == 0:
            for idx, token in enumerate(tokens):
                if ":" == token.text:
                    spans.append((idx, len(tokens) - 1))

    elif cl == 'indirect':
        # Following A.B.s directions for annotating indirect representations
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)
        # Pattern 1: verbal framing phrase + dependent clause - assume max. one of these patterns per segment
        stw_verb_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if not lemma.istitle() and is_reporting_word(lemma, stw_words)
        ]
        # Only use this pattern if there is a clear candidate
        if len(stw_verb_segment) == 1:
            verb = stw_verb_segment[0]
            dependent_clause = get_children(verb, exception=['sb'])
            start = None
            end = None
            for i, token in enumerate(tokens):
                if token == verb:
                    start = i
                elif token in dependent_clause:
                    # Dependents only extend the span once the verb was seen.
                    if start is not None:
                        end = i
            if start is not None and end is not None:
                spans.append((start, end))

        # Pattern 2: nominal phrase includ. modificators + dependent clause - several of these patterns per segment are possible
        stw_noun_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if lemma.istitle() and is_reporting_word(lemma, stw_words)
        ]
        for noun in stw_noun_segment:
            dependent_clause_modif = get_children(noun, exception=[])
            span = _first_last_index(tokens, dependent_clause_modif + [noun])
            if span is not None:
                spans.append(span)

        # Merge spans: keep only spans not contained in another, different span
        merged_spans = []
        if len(spans) > 1:
            for span in spans:
                for other in spans:
                    if other == span:
                        continue
                    if span[0] >= other[0] and span[1] <= other[1]:
                        break
                else:
                    merged_spans.append(span)
            spans = merged_spans

    elif cl == 'free_indirect':
        # Free indirect instances are almost always complete sentences -> leave as is
        pass

    elif cl == 'reported':
        # Guideline (translated): "For reported representation the aim is to mark the whole sentence or
        # clause that renders an act of speech, thought or writing.
        # - If several distinct acts can be identified, each of them is marked separately.
        # - If a noun phrase is used with a verb so that both together form such an act, the whole verb
        #   phrase should be marked, as for indirect representation (i.e. 'Plaene entwerfen', not just 'Plaene')."
        # Following A.B.s directions for annotating reported representations try to annotate the whole clause for reported instances
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)
        stw_segment = [
            tokens[i] for i, lemma in enumerate(token_lemmata)
            if is_reporting_word(lemma, stw_words_all)
        ]
        for word in stw_segment:
            dependent_clause = get_children(word, exception=[])
            span = _first_last_index(tokens, dependent_clause + [word])
            if span is not None:
                spans.append(span)
        # Don't merge spans as several different reported instance should be labeled separately following A.B.s directions for annotating reported representations
        # (Annelen Brunner. Automatische Erkennung von Redewiedergabe: ein Beitrag zur quantitativen Narratologie. Vol. 47. Walter de Gruyter, 2015.)

    # Get character based spans
    if len(spans) > 0:
        labels = [
            "{},{},{}".format(cl, tokens[first].idx,
                              tokens[last].idx + len(tokens[last].text))
            for first, last in spans
        ]
        label = ",".join(labels)
    return label


def _first_last_index(tokens, members):
    """Return (first, last) positions in tokens of items contained in members, or None if fewer than two match."""
    start = None
    end = None
    for i, token in enumerate(tokens):
        if token in members:
            if start is None:
                start = i
            else:
                end = i
    if start is not None and end is not None:
        return (start, end)
    return None
def __init__(self, pos_prereq):
    """Store the POS prerequisite and build a GermaLemma lemmatizer.

    :param pos_prereq: Value kept unchanged on ``self.pos_prereq``.
    """
    # CoNLL09 export of the TIGER corpus handed to GermaLemma.
    corpus_file = 'resources/tiger_release_aug07.corrected.16012013.conll09'
    self.pos_prereq = pos_prereq
    self.lemmatizer = GermaLemma(tiger_corpus=corpus_file)
def __init__(self, sequence_features=True):
    """
    Set up feature names, tag inventories, the lemmatizer, RFTagger and word vectors.

    Besides assigning attributes, construction runs ``make`` in
    ``RFTagger/src`` and loads ``data/word_vecs/kolimo.model``, so it depends
    on the working directory and, as the printed message says, may take a while.

    :param sequence_features: If true, use the sequence features (trained on gold labels).
        If false, the 13 names from "last_direct" to "num_last_10_reported" are removed
        from ``self.feature_names``.
    """
    # Number of features
    # NOTE(review): equals the length of the full list below and is not
    # reduced when sequence features are switched off - confirm intended.
    self.num_features = 243
    # Names of features - needed for feature inspection
    self.feature_names = [
        # perc_pos_* entries
        "perc_pos_NNE", "perc_pos_TRUNC", "perc_pos_APPO", "perc_pos_VVPP",
        "perc_pos_FM", "perc_pos_KOUI", "perc_pos_ITJ", "perc_pos_PTKANT",
        "perc_pos_$.", "perc_pos_ADJA", "perc_pos_ADJD", "perc_pos_PTKNEG",
        "perc_pos_PWS", "perc_pos_PRF", "perc_pos_KOUS", "perc_pos_PDS",
        "perc_pos_VMINF", "perc_pos_VVIZU", "perc_pos_PPOSS", "perc_pos_VVFIN",
        "perc_pos_VMFIN", "perc_pos_PROAV", "perc_pos_PRELS", "perc_pos_APPR",
        "perc_pos_PPOSAT", "perc_pos_APZR", "perc_pos_$,", "perc_pos_PIAT",
        "perc_pos_VMPP", "perc_pos_NE", "perc_pos__SP", "perc_pos_VAPP",
        "perc_pos_VAIMP", "perc_pos_CARD", "perc_pos_APPRART", "perc_pos_NN",
        "perc_pos_KOKOM", "perc_pos_PWAT", "perc_pos_PPER", "perc_pos_XY",
        "perc_pos_ART", "perc_pos_PWAV", "perc_pos_KON", "perc_pos_PTKA",
        "perc_pos_VVINF", "perc_pos_$(", "perc_pos_PDAT", "perc_pos_PTKZU",
        "perc_pos_PRELAT", "perc_pos_PIS", "perc_pos_PTKVZ", "perc_pos_VAINF",
        "perc_pos_ADV", "perc_pos_VAFIN", "perc_pos_VVIMP", "perc_pos_",
        "perc_pos_SCONJ", "perc_pos_SYM", "perc_pos_VERB", "perc_pos_X",
        "perc_pos_EOL", "perc_pos_SPACE", "perc_pos_PUNCT", "perc_pos_ADJ",
        "perc_pos_ADP", "perc_pos_ADV", "perc_pos_AUX", "perc_pos_CONJ",
        "perc_pos_CCONJ", "perc_pos_DET", "perc_pos_INTJ", "perc_pos_NOUN",
        "perc_pos_NUM", "perc_pos_PART", "perc_pos_PRON", "perc_pos_PROPN",
        "num_ents", "num_PER", "num_LOC", "num_ORG", "num_MISC",
        "colon", "colon_prev", "comma_end", "perc_emph", "question",
        "open_quote", "close_quote", "in_quotes", "num_prev_in_quotes",
        "punct_close_quote", "close_quote_comma",
        "perc_per1", "perc_per2", "perc_per12", "perc_per3",
        "only_3_prev_5", "only_1_prev_5", "3_1_prev_5",
        "has_ind", "has_subj", "no_subj", "no_ind",
        "has_pres", "has_past", "no_past", "no_pres",
        "embedded", "wuerden_inf", "wuerden",
        "has_prep_noun_comp", "has_claus_inf_comp",
        "subj_cand_speaker", "num_cand_speaker",
        "prev_subj_cand_speaker", "prev_num_cand_speaker",
        # has_* reporting word entries
        "has_rep_word_0", "has_rep_word_1", "has_rep_word_2",
        "has_rep_word_3", "has_rep_word_4", "has_rep_word_5",
        "has_rep_word_le_1", "has_rep_word_le_2", "has_rep_word_le_3",
        "has_rep_word_le_4", "has_rep_word_le_5",
        "has_rep_word_noun", "has_rep_word_verb",
        "has_spec_rep_word_0", "has_spec_rep_word_1", "has_spec_rep_word_2",
        "has_spec_rep_word_3", "has_spec_rep_word_4", "has_spec_rep_word_5",
        "has_spec_rep_word_le_1", "has_spec_rep_word_le_2",
        "has_spec_rep_word_le_3", "has_spec_rep_word_le_4",
        "has_spec_rep_word_le_5",
        # num_* reporting word entries
        "num_rep_word_0", "num_rep_word_1", "num_rep_word_2",
        "num_rep_word_3", "num_rep_word_4", "num_rep_word_5",
        "num_rep_word_le_1", "num_rep_word_le_2", "num_rep_word_le_3",
        "num_rep_word_le_4", "num_rep_word_le_5",
        "num_rep_word_noun", "num_rep_word_verb",
        "num_spec_rep_word_0", "num_spec_rep_word_1", "num_spec_rep_word_2",
        "num_spec_rep_word_3", "num_spec_rep_word_4", "num_spec_rep_word_5",
        "num_spec_rep_word_le_1", "num_spec_rep_word_le_2",
        "num_spec_rep_word_le_3", "num_spec_rep_word_le_4",
        "num_spec_rep_word_le_5",
        # prev_has_* reporting word entries
        "prev_has_rep_word_0", "prev_has_rep_word_1", "prev_has_rep_word_2",
        "prev_has_rep_word_3", "prev_has_rep_word_4", "prev_has_rep_word_5",
        "prev_has_rep_word_le_1", "prev_has_rep_word_le_2",
        "prev_has_rep_word_le_3", "prev_has_rep_word_le_4",
        "prev_has_rep_word_le_5",
        "prev_has_rep_word_noun", "prev_has_rep_word_verb",
        "prev_has_spec_rep_word_0", "prev_has_spec_rep_word_1",
        "prev_has_spec_rep_word_2", "prev_has_spec_rep_word_3",
        "prev_has_spec_rep_word_4", "prev_has_spec_rep_word_5",
        "prev_has_spec_rep_word_le_1", "prev_has_spec_rep_word_le_2",
        "prev_has_spec_rep_word_le_3", "prev_has_spec_rep_word_le_4",
        "prev_has_spec_rep_word_le_5",
        # prev_num_* reporting word entries
        "prev_num_rep_word_0", "prev_num_rep_word_1", "prev_num_rep_word_2",
        "prev_num_rep_word_3", "prev_num_rep_word_4", "prev_num_rep_word_5",
        "prev_num_rep_word_le_1", "prev_num_rep_word_le_2",
        "prev_num_rep_word_le_3", "prev_num_rep_word_le_4",
        "prev_num_rep_word_le_5",
        "prev_num_rep_word_noun", "prev_num_rep_word_verb",
        "prev_num_spec_rep_word_0", "prev_num_spec_rep_word_1",
        "prev_num_spec_rep_word_2", "prev_num_spec_rep_word_3",
        "prev_num_spec_rep_word_4", "prev_num_spec_rep_word_5",
        "prev_num_spec_rep_word_le_1", "prev_num_spec_rep_word_le_2",
        "prev_num_spec_rep_word_le_3", "prev_num_spec_rep_word_le_4",
        "prev_num_spec_rep_word_le_5",
        "max_sim", "max_sim_rep", "perc_deictic", "spec_conjunct",
        "perc_modal", "perc_neg", "has_facial", "has_gesture", "has_voice",
        "repetition",
        # The next 13 names are the ones removed when sequence_features is false
        "last_direct", "last_indirect", "last_free_indirect", "last_reported",
        "last_5_direct", "last_5_indirect", "last_5_free_indirect",
        "last_5_reported",
        "last_10_direct", "last_10_indirect", "last_10_free_indirect",
        "last_10_reported", "num_last_10_reported",
        # The final 8 names are always kept
        "len_tokens", "len_chars", "prev_len_tokens", "prev_len_chars",
        "sum_len_tokens", "sum_len_chars", "paragraph", "prev_paragraph"]

    # Switch to turn off sequence features
    self.sequence_features = sequence_features
    if not self.sequence_features:
        # Keep everything before the last 21 names plus the last 8 names,
        # i.e. cut out the 13 names in between.
        self.feature_names = self.feature_names[:-21] + self.feature_names[-8:]

    # Get all possible tags
    self.tag_map = sorted(NLP.vocab.morphology.tag_map.keys())
    self.pos_map = sorted(spacy.parts_of_speech.NAMES.values())

    # Set up lemmatizer
    self.lemmatizer = GermaLemma()

    # Set up RFTagger
    # NOTE(review): the result of the build call is not checked - a failed
    # build would presumably only surface when the tagger is used.
    call(["make"], cwd="RFTagger/src")

    # Load word vectors
    print("Loading word-vectors. This may take a while ...")
    self.wordvecs = KeyedVectors.load_word2vec_format("data/word_vecs/kolimo.model", binary=True)
    print("Done.\n")
class SentiDep:
    """Sentiment analyzer for German texts.

    Gets the polarity values of words depending on the polarity values of
    associated descriptive words, e.g. 'das schöne Wetter' -> polarity of
    'Wetter' == polarity of 'schöne'.

    Purpose: find out in which sentiment context your keywords appear in a
    text.

    Note: works with spacy, nltk and germalemma.
    """

    # SentiWS POS tags and the spaCy coarse POS tags they are mapped to.
    _SENTIWS_TO_SPACY = {
        "ADJX": "ADJ",
        "ADV": "ADV",
        "NN": "NOUN",
        "VVINF": "VERB",
    }
    # Reverse lookup, built once instead of on every mapper call.
    _SPACY_TO_SENTIWS = {
        spacy_tag: senti_tag
        for senti_tag, spacy_tag in _SENTIWS_TO_SPACY.items()
    }

    def __init__(self, **kwargs):
        """Load the lexicon pickles and the NLP helpers.

        Optional keyword arguments override the pickle locations; each one
        defaults to a file below the ``data`` directory next to this module:

        :param sentiws_file: pickle assigned to ``self.sentiws``
        :param polarity_modifiers_file: pickle assigned to
            ``self.polarity_modifications``
        :param negations_file: pickle assigned to ``self.negations``
        :param stts_file: pickle assigned to ``self.stts``
        """
        base_dir = os.path.dirname(os.path.abspath(__file__))

        def resource(option, default_name):
            # An explicitly passed path wins over the bundled default.
            return kwargs.get(option, os.path.join(base_dir, default_name))

        sentiws_path = resource('sentiws_file', "data/sentiws.pickle")
        polarity_mod_path = resource('polarity_modifiers_file',
                                     "data/polarity_modifiers.pickle")
        negations_path = resource('negations_file',
                                  "data/negationen_lexicon.pickle")
        stts_path = resource('stts_file', "data/stts.pickle")

        self.sentiws = self._load_pickle(sentiws_path)
        self.polarity_modifications = self._load_pickle(polarity_mod_path)
        self.negations = self._load_pickle(negations_path)
        self.nlp = spacy.load("de_core_news_md")
        self.germalemmatizer = GermaLemma()
        self.stts = self._load_pickle(stts_path)
        self.german_stops = stopwords.words('german')

    @staticmethod
    def _load_pickle(path):
        """Unpickle the file at ``path`` and close the file handle again."""
        # NOTE: unpickling can execute arbitrary code, so only point the
        # ``*_file`` options at files from a trusted source.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def tokenize(self, text):
        """
        Tokenize a string using spacy's tokenizer.

        Input: text/string
        Output: spacy_doc
        """
        return self.nlp(text)

    def sentiws_spacy_tag_mapper(self, pos_tag, **kwargs):
        """
        Map SentiWS POS-tags to spacy POS-tags and reverse.

        Input: pos_tag, optional: direction
            -> values: 1 (sentiws to spacy), -1 (spacy to sentiws)
            -> default: 1
        Output: python str (None when direction is 0)
        Raises: KeyError when pos_tag is unknown for the chosen direction
        """
        direction = kwargs.get('direction', 1)
        if direction > 0:
            return self._SENTIWS_TO_SPACY[pos_tag]
        if direction < 0:
            return self._SPACY_TO_SENTIWS[pos_tag]
        return None

    def get_polarity(self, word, pos_tag):
        """
        Look up the SentiWS polarity entry for a word with a spacy POS-tag.

        When several entries match, the one with the largest absolute
        polarity value is returned (the first of them on ties).

        Input: word, pos_tag (spacy)
        Output: the matching lexicon entry (word, polarity-value, pos_tag),
                or an empty list when nothing matches
        """
        tag_map = self._SENTIWS_TO_SPACY
        # Entries whose tag is not in the map are skipped instead of raising.
        candidates = [
            entry for entry in self.sentiws
            if entry[0] == word
            and entry[2] in tag_map
            and tag_map[entry[2]] == pos_tag
        ]
        if not candidates:
            return []
        return max(candidates, key=lambda entry: abs(entry[1]))

    def modify_polarity(self, child, polarity):
        """
        Apply a polarity enhancer or reducer to a polarity tuple.

        Input: child -> text of a dependent word (an object with a ``text``
                        attribute is accepted as well),
               polarity -> tuple(word, polarity-value, pos_tag)
        Output: tuple(word, polarity-value, pos_tag) scaled by 1.5 for an
                enhancer or by 0.5 for a reducer; None when ``polarity`` is
                empty or ``child`` is neither of both
        """
        if not polarity:
            return None
        child_text = getattr(child, "text", child)
        if child_text in self.polarity_modifications["polarity_enhancer"]:
            return (polarity[0], polarity[1] * 1.5, polarity[2])
        if child_text in self.polarity_modifications["polarity_reducer"]:
            return (polarity[0], polarity[1] * 0.5, polarity[2])
        return None

    def easy_switch(self, word):
        """
        Check whether a word matches any negation pattern, without dealing
        with complex issues.

        Input: token/word
        Output: True/False
        """
        return any(
            re.search(pattern, word) is not None
            for pattern in self.negations["negation_regex"])

    def add_polarities(self, list_of_polarity_tuples):
        """
        Sum up the polarity values of a list of polarity-tuples.

        :param list_of_polarity_tuples: tuples with the value at index 1
        :return: polarity value (0 for an empty list)
        """
        return sum(entry[1] for entry in list_of_polarity_tuples)

    def calc_parent_polarity(self, spacy_token, token_polarity,
                             children_polarities):
        """
        Calculate the parent polarity value depending on the children
        polarities.

        :param spacy_token: token providing ``text`` and ``pos_``
        :param token_polarity: polarity tuple of the token itself (or empty)
        :param children_polarities: polarity tuples of its children
        :return: parent_polarity -> tuple(word, polarity, POS-tag);
                 ``token_polarity`` unchanged when there are no children
                 polarities or when they sum up to exactly 0
        """
        if not children_polarities:
            return token_polarity

        children_sum = self.add_polarities(children_polarities)
        if not token_polarity:
            return (spacy_token.text, children_sum, spacy_token.pos_)

        if children_sum > 0:
            return (spacy_token.text, token_polarity[1] + children_sum,
                    spacy_token.pos_)
        if children_sum < 0:
            # NOTE(review): for a negative token polarity this yields
            # -(token - children_sum), which can become positive; kept as is,
            # confirm that this is intended.
            return (spacy_token.text,
                    (token_polarity[1] + (-1 * children_sum)) * (-1),
                    spacy_token.pos_)
        return token_polarity

    def switch_polarity(self, polarity, spacy_doc_sent):
        """
        Switch the polarity value depending on the negation context of the
        whole sentence.

        The sentence is scanned from left to right. Every negation pattern
        matching a token (case-insensitive) and every token listed as
        polarity switch toggles the negation state; the state reached at the
        last token whose text equals ``polarity[0]`` decides the sign.

        :param polarity: tuple(word, polarity, POS-tag)
        :param spacy_doc_sent: iterable of tokens of one sentence
        :return: tuple(word, polarity, POS-tag, "negation: True"/"negation:
                 False"); the unchanged value with "negation: False" when the
                 word does not occur in the sentence
        """
        negation_trigger = False
        # Fallback result; previously a missing word ended in an
        # UnboundLocalError.
        negated_polarity = (polarity[0], polarity[1], polarity[2],
                            "negation: " + str(False))
        for token in spacy_doc_sent:
            # NOTE(review): a token matched by two patterns toggles twice.
            for negex in self.negations['negation_regex']:
                if re.search(negex, token.text, re.I):
                    negation_trigger = not negation_trigger
            if token.lower_ in self.negations['polarity_switches']:
                # A '.' only counts as switch when tagged as punctuation;
                # otherwise the token is skipped entirely.
                if token.text == '.' and token.pos_ != 'PUNCT':
                    continue
                negation_trigger = not negation_trigger
            if token.text == polarity[0]:
                value = -polarity[1] if negation_trigger else polarity[1]
                negated_polarity = (polarity[0], value, polarity[2],
                                    "negation: " + str(negation_trigger))
        return negated_polarity

    def get_depending_polarities(self, text, keywords):
        """
        Get keyword associated polarity values of german texts.

        Polarity analysis including polarity reducer/enhancer and negations.

        :param text: text to analyze
        :param keywords: words to look for (compared case-insensitively)
        :return: context-polarity values of the keywords -> list of tuples
                 (lower-cased word, polarity, POS-tag, negation info)
        """
        spacy_doc = self.nlp(text, disable=['ner', 'textcat'])
        wanted = {keyword.lower() for keyword in keywords}
        parent_polarities = []
        for sent in spacy_doc.sents:
            for token in sent:
                if token.lower_ not in wanted:
                    continue
                # ``token.children`` can only be iterated once; materialize
                # it because the children are needed twice below.
                children = list(token.children)
                token_polarity = self.get_polarity(token.text, token.pos_)
                children_polarities = []
                for child in children:
                    child_polarity = self.get_polarity(child.text, child.pos_)
                    if child_polarity:
                        children_polarities.append(child_polarity)
                parent_polarity = self.calc_parent_polarity(
                    token, token_polarity, children_polarities)
                if not parent_polarity:
                    continue

                # Children that are neither enhancer nor reducer give None
                # and must not reach the summation.
                modified = []
                for child in children:
                    candidate = self.modify_polarity(child.text,
                                                     parent_polarity)
                    if candidate:
                        modified.append(candidate)
                modified_value = self.add_polarities(modified)
                if modified_value:
                    parent_polarity = (token.text, modified_value,
                                       token.pos_ + "_modified")
                parent_polarities.append(
                    self.switch_polarity(parent_polarity, sent))
        return [(term.lower(), t_pol, t_pos, neg)
                for term, t_pol, t_pos, neg in parent_polarities]

    def lemmatize(self, spacy_token):
        """
        Lemmatizer using stts-tagset, spacy-token and GermaLemma.

        Tokens whose fine-grained tag starts with N, V, ADJ or ADV and is
        contained in ``self.stts`` are lemmatized; all others are returned
        as their plain text.

        Input: spacy token -> german model
        Output: python str
        """
        tag = spacy_token.tag_
        if tag.startswith(('N', 'V', 'ADJ', 'ADV')) and tag in self.stts:
            return self.germalemmatizer.find_lemma(spacy_token.text, tag)
        return spacy_token.text

    def generate_topics(self, texts, num_topics=10):
        """
        Derive topic terms from the nouns of several texts via TF-IDF.

        Each text is reduced to its lemmatized nouns (stop words removed).
        For every text the 10 highest-scoring terms are collected, the
        collected pairs are sorted by score in ascending order and the first
        ``num_topics`` of them are returned.

        Input: texts -> iterable of strings, num_topics
        Output: dict {term: tf-idf score}
        """
        stop_words = set(self.german_stops)
        docs = [
            " ".join(
                self.lemmatize(token)
                for token in self.tokenize(text)
                if token.pos_ == 'NOUN' and token.lower_ not in stop_words)
            for text in texts
        ]
        cv = CountVectorizer(max_df=0.85, max_features=10000)
        word_count_vector = cv.fit_transform(docs)
        tf = TfidfTransformer(smooth_idf=True, use_idf=True)
        tf.fit(word_count_vector)
        # ``get_feature_names`` is gone in newer scikit-learn releases;
        # prefer its replacement and fall back for old installations.
        if hasattr(cv, "get_feature_names_out"):
            feature_names = cv.get_feature_names_out()
        else:
            feature_names = cv.get_feature_names()

        tf_idf_scores = []
        for doc in docs:
            tf_idf_vector = tf.transform(cv.transform([doc]))
            sorted_items = self.sort_coo(tf_idf_vector.tocoo())
            keywords, scores = self.extract_topn_from_vector(
                feature_names, sorted_items, 10)
            tf_idf_scores += list(zip(keywords, scores))
        # NOTE(review): ascending order keeps the lowest scores; confirm
        # that this is intended.
        tfidf_topics = sorted(tf_idf_scores, key=lambda x: x[1],
                              reverse=False)
        return dict(tfidf_topics[:num_topics])

    def sort_coo(self, coo_matrix):
        """Return (column, value) pairs of a COO matrix, highest value first
        (ties: higher column index first)."""
        pairs = zip(coo_matrix.col, coo_matrix.data)
        return sorted(pairs, key=lambda pair: (pair[1], pair[0]),
                      reverse=True)

    def extract_topn_from_vector(self, feature_names, sorted_items, topn=10):
        """
        Pick the names and scores of the first ``topn`` sorted items.

        :param feature_names: sequence indexed by feature index
        :param sorted_items: (feature index, score) pairs, already sorted
        :param topn: number of leading items to keep
        :return: tuple({feature name: rounded score}, [rounded scores])
        """
        top_items = sorted_items[:topn]
        score_vals = [round(score, 3) for _, score in top_items]
        feature_vals = [feature_names[idx] for idx, _ in top_items]
        return dict(zip(feature_vals, score_vals)), score_vals

    def create_clinic_polarity_dict(self, key_list, topics):
        """
        Compute polarity scores document-wise.

        :param key_list: list of texts and document-keys
            -> form: [[text_1, document-key_1] ...]
            -> hint: simple pandas dump with
               df[[text-column, document]].values.tolist()
        :param topics: list of keywords associated with a certain topic
        :return: polarities_dict in form {"<document-key>_<n>": polarities},
                 where n counts the entries per document-key starting at 1
        """
        polarities = {}
        clinic_counter = {}
        for rl in tqdm(key_list):
            text, clinic = rl[0], rl[1]
            occurrence = clinic_counter.get(clinic, 1)
            polarities[f'{clinic}_{occurrence}'] = \
                self.get_depending_polarities(text, topics)
            clinic_counter[clinic] = occurrence + 1
        return polarities

    def _collect_polarity_columns(self, documents, topics):
        """Turn (document-key, polarity tuples) pairs into an index list and
        one value column per topic (np.nan where a topic has no score)."""
        index = []
        columns = {topic: [] for topic in topics}
        for clinic, polarity in documents:
            index.append(clinic)
            # Later tuples for the same word overwrite earlier ones.
            scores = {pol[0].lower(): pol[1] for pol in polarity}
            for topic, values in columns.items():
                # Scores are keyed by lower-cased word, so topics are matched
                # case-insensitively; a score of 0.0 is kept as a value.
                score = scores.get(topic.lower())
                values.append(np.nan if score is None else score)
        return index, columns

    def create_polarity_df(self, polarities, topics):
        """
        Transform the polarity-scores from 'create_clinic_polarity_dict'
        into a formatted pandas dataframe.

        :param polarities: polarities-dict (output from
            'create_clinic_polarity_dict'); entries without polarities are
            dropped
        :param topics: list of keywords associated with a certain topic
        :return: polarity_df (formatted pandas dataframe) of form:
                columns: keywords/topics
                rows: document-keys
                values: polarity-scores or np.nan
        :raises ValueError: when a column length differs from the row count
        """
        filtered_polarities = [(clinic, polarity)
                               for clinic, polarity in polarities.items()
                               if polarity]
        index, columns = self._collect_polarity_columns(
            tqdm(filtered_polarities), topics)
        for values in columns.values():
            if len(values) != len(index):
                raise ValueError("Values in dict must have same length!")
        return pd.DataFrame(data=columns, index=index)