def clf_sentiment(res):
    """Debugging helper: print sentence- and span-level sentiment predictions.

    Assumes a global sentiment-analysis pipeline `clf` and the `gendocs`
    document loader.
    """
    res_viz = []
    for doc, doc_vizs in zip(res.spans, res.viz):
        viz = []
        for span in res.spans[doc]:
            text = gendocs(doc)[span[0]:span[1]]
            print(text)
            viz.extend([v for v in doc_vizs if v['span_start'] == span[0]])
            for v in viz:
                v['SPAN_SENT'] = 0
            # sentence-level predictions
            sents = text.strip().split('.')
            for sent in [x for x in sents if x]:
                s = clf(sent.lower())
                print(sent)
                print(s)
                print('-----------')
            # span-level prediction
            t = clf(text)
            print(t)
            print()
            # span coding from the span-level label (currently disabled):
            # result = (s['label'], s['score'])
            # if result[0] == 'negative':
            #     for v in viz:
            #         v['SPAN_SENT'] = -1
            # elif result[0] == 'neutral':
            #     for v in viz:
            #         v['SPAN_SENT'] = 0
            # elif result[0] == 'positive':
            #     for v in viz:
            #         v['SPAN_SENT'] = 1
            # print(s['label'], s['score'])
        res_viz.append(viz)
    return res_viz
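# The global `clf` used by clf_sentiment is not defined in this file. A minimal
# sketch of a compatible pipeline, assuming a transformers sentiment model that
# emits the 'positive'/'neutral'/'negative' labels referenced above (the model
# name is an assumption, not taken from this codebase):
#
# from transformers import pipeline
# clf = pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert")
# clf("die regierung betrügt das volk.")
# # -> [{'label': 'negative', 'score': ...}]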
def content_analysis(directory, party="all", sample=None, window_size=25, debug=False):
    """Run the spaCy content-analysis pipeline over a document sample and
    pickle the results under res_ca/<directory>/."""
    if directory != 'test':
        Path(f"res_ca/{directory}/").mkdir(parents=False, exist_ok=False)
    doc_labels = load_data(party)
    # `sample` selects the corpus: int -> random sample, str -> analyze the
    # string itself, list -> explicit document labels, None -> full corpus.
    if isinstance(sample, int):
        doc_labels = random.sample(doc_labels, sample)
        text = None
    elif isinstance(sample, str):
        doc_labels = ['test']
        text = sample
    elif isinstance(sample, list):
        doc_labels = sample
        text = None
    else:
        text = None
    print(f"Number of documents: {len(doc_labels)}")
    print(
        f"Beginning Content Analysis with parameters: \n"
        f"party: {party} | samplesize: {sample} | windowsize: {window_size}"
    )
    nlp = spacy.load("de_core_news_lg")
    ca = ContentAnalysis(nlp, window_size=window_size)
    entity_recognizer = EntityRecognizer(nlp)
    sentiment_recognizer = SentimentRecognizer(nlp)
    sentiws = spaCySentiWS(sentiws_path='sentiws/')
    # clf = TextClassification(nlp)
    # nlp.add_pipe(custom_lemma, last=True)
    nlp.add_pipe(custom_extensions, last=True)
    nlp.add_pipe(sentiment_recognizer, last=True)
    nlp.add_pipe(sentiws, last=True)
    nlp.add_pipe(entity_recognizer, last=True)
    nlp.add_pipe(ca, last=True)
    # nlp.add_pipe(clf, last=True)
    nlp.remove_pipe("ner")  # the custom EntityRecognizer replaces spaCy's NER
    labels = []
    for label in tqdm(doc_labels):
        labels.append(label)
        if text:
            doc = nlp(text)
            if debug:
                for token in doc:
                    print(token.text, token.ent_type_, token._.is_elite_neg,
                          token._.is_attr, token._.is_negated, 'lemma', token._.lemma)
        else:
            doc = nlp(gendocs(label))
        ca.results.labels.append(label)
    with open(f'res_ca/{directory}/labels.pkl', 'wb') as f:
        pickle.dump(labels, f)
    with open(f'res_ca/{directory}/results_all.pkl', 'wb') as f:
        pickle.dump(ca.results, f)
    print(f"Content Analysis complete. \nResults saved in {directory}/...")
    return ca.results
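# Usage sketch (the run name "run_01" is illustrative; assumes the corpus,
# the SentiWS data under sentiws/, and the custom pipeline components are
# importable):
#
# res = content_analysis("run_01", party="all", sample=100, window_size=25)
# res = content_analysis("test", sample="Die Eliten betrügen das Volk.", debug=True)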
def visualize(self, label, span=None):
    """Visualize a document (or one scored span of it) with displacy."""
    row = self.df.loc[self.df['doc'] == label].copy()
    text = gendocs(label)
    viz = self.viz[self.labels.index(label)].copy()
    title = (f"{row['doc'].iloc[0]} | {row.name_res.iloc[0]} "
             f"({row['party'].iloc[0]}) | {row['date'].iloc[0].strftime('%d/%m/%Y')}")
    if span:
        viz_span = []
        for hit in viz:
            if hit['span_start'] == span[0]:
                print(hit)
                # shift offsets so they are relative to the sliced text
                hit['start'] -= span[0]
                hit['end'] -= span[0]
                hit['label'] = f"{hit['label']} | {hit['score']:.2f}"
                viz_span.append(hit)
        ex = [{
            "text": text[span[0]:span[1]],
            "ents": viz_span,
            "title": title,
        }]
        all_ents = {i["label"] for i in viz_span}
    else:
        for hit in viz:
            hit['label'] = f"{hit['label']} | {hit['score']:.2f}"
        ex = [{
            "text": text,
            "ents": viz,
            "title": title,
        }]
        # find unique labels for coloring options
        all_ents = {i["label"] for i in viz}
    options = {"ents": all_ents, "colors": dict()}
    for ent in all_ents:
        if ent.startswith("E"):
            options["colors"][ent] = "coral"
        if ent.startswith("V"):
            options["colors"][ent] = "lightgrey"
        if ent.startswith("P"):
            options["colors"][ent] = "yellow"
    displacy.render(ex, style="ent", manual=True, jupyter=True, options=options)
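# Usage sketch (document id and span taken from the commented example in the
# analysis cell further below):
#
# res.visualize('plenar_029688')                     # whole document
# res.visualize('plenar_029688', span=(3788, 4288))  # one scored span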
def coding(res):
    """Final coding pass: mark tokens and spans as populist.

    A token counts as populist if its span passed the Germany check (RLY_GER)
    and it was confirmed as a people ('V') or elite ('E') reference. A span
    counts as populist only if it contains at least one confirmed people hit
    and one confirmed elite hit.
    """
    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        doc_vizs = Results.filter_viz(doc_vizs, on='start')
        for span in res.spans[doc]:
            viz = [v for v in doc_vizs if v['span_start'] == span[0]]
            # final coding
            pop_hits_v = 0
            pop_hits_e = 0
            for v in viz:
                v['TOK_IS_POP'] = False
                v['SPAN_IS_POP'] = False
                if v['RLY_GER'] and (v['RLY_V'] or v['RLY_E']):
                    v['TOK_IS_POP'] = True
                if v['TOK_IS_POP'] and v['coding'] == 'V':
                    pop_hits_v += 1
                    # propagate the people hit to its attributes
                    for attr in viz:
                        if attr['attr_of'] == v['start']:
                            attr['RLY_V'] = True
                            attr['TOK_IS_POP'] = True
                if v['TOK_IS_POP'] and (v['coding'] == 'E' or
                                        (v['coding'] == 'EA' and v['pos'] == 'NOUN')):
                    pop_hits_e += 1
                    # propagate the elite hit to its attributes
                    for attr in viz:
                        if attr['attr_of'] == v['start']:
                            attr['RLY_E'] = True
                            attr['TOK_IS_POP'] = True
            if pop_hits_v > 0 and pop_hits_e > 0:
                for v in viz:
                    v['SPAN_IS_POP'] = True
            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
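# The span-level rule above, restated as a standalone sketch for clarity
# (illustrative helper, not part of the pipeline): a span is populist only if
# it contains both a confirmed people reference and a confirmed elite reference.
def span_is_pop(tokens):
    has_v = any(t['TOK_IS_POP'] and t['coding'] == 'V' for t in tokens)
    has_e = any(t['TOK_IS_POP'] and
                (t['coding'] == 'E' or (t['coding'] == 'EA' and t['pos'] == 'NOUN'))
                for t in tokens)
    return has_v and has_e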
def viz_id(df, id):
    """Visualize a single document selected by its id."""
    ContentAnalysis.viz(gendocs(id), df.loc[df['doc'] == id])
def clf_pop(res):
    """Monolithic zero-shot pass (redefined in split form below): checks
    Germany relevance, people disadvantage, and elite blame in one function.
    Assumes a global zero-shot classification pipeline `clf`.
    """
    # clf = pipeline("zero-shot-classification", model='joeddav/xlm-roberta-large-xnli', device=-1)
    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        doc_vizs = Results.filter_viz(doc_vizs, on='start')
        for span in res.spans[doc]:
            text = gendocs(doc)[span[0]:span[1]]
            viz = [v for v in doc_vizs if v['span_start'] == span[0]]
            for v in viz:
                v['RLY_GER'] = True
                v['RLY_V'] = False
                v['RLY_E'] = False
                v['RLY_REASON'] = set()
            # 1. check whether the span is about Germany
            hypothesis_template = 'Der Text handelt von {}'
            candidate_labels = ['Deutschland', 'Europa', 'Ausland']
            s = clf(text, candidate_labels, hypothesis_template, multi_class=False)
            if s['labels'][0] == 'Ausland' and s['scores'][0] >= 0.9:
                for v in viz:
                    v['RLY_GER'] = False
            # 2. check whether the people ('Volk') references are disadvantaged
            hypothesis_template = '{} hat Nachteile'
            candidate_labels = [v['lemma'] for v in viz if v['coding'] == 'V']
            candidates_people = []  # initialized here so step 3 works without V hits
            if candidate_labels:
                s = clf(text, candidate_labels, hypothesis_template, multi_class=True)
                for j, label in enumerate(s['labels']):
                    if s['scores'][j] >= 0.75:
                        candidates_people.append(label)
                        for v in viz:
                            if v['lemma'] == label:
                                v['RLY_V'] = True
            # 3. check whether an elite disadvantages the confirmed people
            for volk in candidates_people:
                h0 = '{} benachteiligt ' + volk
                h1 = '{} entmachtet ' + volk
                h2 = '{} betrügt ' + volk
                # h3 = '{} belügt ' + volk
                candidate_labels = [v['lemma'] for v in viz if v['coding'] == 'E']
                for h, hypothesis_template in enumerate([h0, h1, h2]):
                    if candidate_labels:
                        s = clf(text, candidate_labels, hypothesis_template,
                                multi_class=True)
                        for j, label in enumerate(s['labels']):
                            if s['scores'][j] >= 0.75:
                                for v in viz:
                                    if v['lemma'] == label:
                                        v['RLY_E'] = True
                                        v['RLY_REASON'].add(h)
            # final coding
            pop_hits_v = 0
            pop_hits_e = 0
            for v in viz:
                v['TOK_IS_POP'] = False
                v['SPAN_IS_POP'] = False
                if v['RLY_GER'] and (v['RLY_V'] or v['RLY_E']):
                    v['TOK_IS_POP'] = True
                if v['TOK_IS_POP'] and v['coding'] == 'V':
                    pop_hits_v += 1
                    for attr in viz:
                        if attr['attr_of'] == v['start']:
                            attr['RLY_V'] = True
                            attr['TOK_IS_POP'] = True
                if v['TOK_IS_POP'] and (v['coding'] == 'E' or
                                        (v['coding'] == 'EA' and v['pos'] == 'NOUN')):
                    pop_hits_e += 1
                    for attr in viz:
                        if attr['attr_of'] == v['start']:
                            attr['RLY_E'] = True
                            attr['TOK_IS_POP'] = True
            if pop_hits_v > 0 and pop_hits_e > 0:
                for v in viz:
                    v['SPAN_IS_POP'] = True
            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
import pickle

with open("res_ca/test/results_all.pkl", "rb") as f:
    res = pickle.load(f)
res.set_entities()
res.compute_score()
res.create_df()
res.add_meta_plenar()
# display(res.df.groupby('party').mean())
res.compute_score_spans()
# res.visualize('plenar_029688', span=(3788, 4288))

#%%
# attach transformer sentiment decisions to each scored span
# (`clf` is the sentiment pipeline discussed above)
for i, (doc, _) in enumerate(zip(res.spans, res.viz)):
    # print(doc, _)
    for span in res.spans[doc]:
        s = clf(gendocs(doc)[span[0]:span[1]])[0]
        print(gendocs(doc)[span[0]:span[1]])
        print(s['label'], s['score'])
        for v in res.viz[i]:
            if v['span_start'] == span[0]:
                v['span_sent'] = s['label']
                v['span_sent_score'] = s['score']

# %%
class ContentAnalysis():
    def __init__(self, model):
        self.nlp = spacy.load(model)
        self.clf = pipeline("zero-shot-classification",
                            model='joeddav/xlm-roberta-large-xnli', device=-1)
def clf_demo(clf, res, debug=False):
    """Zero-shot check for system critique: if a span mentions a democratic
    principle and asserts that it is absent in Germany, mark the span's
    elite/people hits with reason 'S'. Spans judged to be about history or
    National Socialism are skipped to avoid false positives.
    """
    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        # doc_vizs = Results.filter_viz(doc_vizs, on='start')
        for span in res.spans[doc]:
            text = gendocs(doc)[span[0]:span[1]]
            # alternative span filters kept from earlier experiments:
            # viz = [v for v in doc_vizs if v['span_start'] == span[0] and v['span_end'] == span[1]]
            # viz = [v for v in doc_vizs if v['span_start'] == span[0] and v['start'] - v['span_start'] <= 2_400]
            viz = [
                v for v in doc_vizs
                if v['span_start'] == span[0] and v['span_end'] == span[1]
                and v['RLY_GER']
            ]
            checked_history = False
            is_present = True
            demo = ['Demokratie', 'Gewaltenteilung', 'Gerechtigkeit',
                    'Meinungsfreiheit']
            for w in demo:
                if w in text:
                    if not checked_history:
                        hypothesis_template = 'Der Text beschreibt {}'
                        candidate_labels = ['Geschichte', 'Nationalsozialismus']
                        s = clf(text, candidate_labels, hypothesis_template,
                                multi_class=True)
                        if debug:
                            print(s)
                        if any(score > 0.75 for score in s['scores']):
                            is_present = False
                        checked_history = True
                    if is_present:
                        # REASON IS S
                        hypothesis_template = 'In Deutschland herrscht keine {}'
                        candidate_labels = [w]
                        s = clf(text, candidate_labels, hypothesis_template,
                                multi_class=True)
                        if s['scores'][0] > 0.75:
                            for v in viz:
                                if v['coding'].startswith('E'):
                                    v['RLY_E'] = True
                                    v['RLY_REASON'].add('S')
                                elif v['coding'].startswith('V'):
                                    v['RLY_V'] = True
                                    v['RLY_REASON'].add('S')
                        if debug:
                            pprint(hypothesis_template)
                            pprint(s)
            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
def clf_pop(clf, res, debug=False):
    """Zero-shot populism checks on spans that passed the Germany filter
    (clf_ger): step 2 tests whether a people reference is disadvantaged,
    step 3 tests whether an elite reference disadvantages, disempowers, or
    betrays the confirmed people references. Takes the zero-shot pipeline
    `clf` as a parameter; supersedes the earlier clf_pop above.
    """
    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        # doc_vizs = Results.filter_viz(doc_vizs, on='start')
        for span in res.spans[doc]:
            text = gendocs(doc)[span[0]:span[1]]
            # alternative filter from an earlier experiment:
            # viz = [v for v in doc_vizs if v['span_start'] == span[0] and v['start'] - v['span_start'] <= 2_400 and v['RLY_GER']]
            viz = [
                v for v in doc_vizs
                if v['span_start'] == span[0] and v['span_end'] == span[1]
                and v['RLY_GER']
            ]
            for v in viz:
                v['RLY_V'] = False
                v['RLY_E'] = False
                v['RLY_REASON'] = set()
            # 2. check whether the people ('Volk') references are disadvantaged
            h0 = '{} hat Nachteile'
            # h1 = 'ungerecht für {}'
            candidate_labels = list({v['lemma'] for v in viz if v['coding'] == 'V'})
            candidates_people = []  # initialized here so step 3 works without V hits
            for h, hypothesis_template in enumerate([h0]):
                if candidate_labels:
                    s = clf(text, candidate_labels, hypothesis_template,
                            multi_class=True)
                    for j, label in enumerate(s['labels']):
                        if s['scores'][j] >= 0.75:
                            candidates_people.append(label)
                            for v in viz:
                                if v['lemma'] == label:
                                    v['RLY_V'] = True
                                    v['RLY_REASON'].add(h)
                    if debug:
                        pprint(hypothesis_template)
                        pprint(s)
            # 3. check whether an elite disadvantages the confirmed people
            for volk in candidates_people:
                h0 = '{} benachteiligt ' + volk
                h1 = '{} entmachtet ' + volk
                h2 = '{} betrügt ' + volk
                # h3 = '{} belügt ' + volk
                candidate_labels = list({
                    v['lemma'] for v in viz
                    if v['coding'] == 'E' or (v['coding'] == 'EA' and v['pos'] == 'NOUN')
                })
                for h, hypothesis_template in enumerate([h0, h1, h2]):
                    if candidate_labels:
                        s = clf(text, candidate_labels, hypothesis_template,
                                multi_class=True)
                        for j, label in enumerate(s['labels']):
                            if s['scores'][j] >= 0.75:
                                for v in viz:
                                    if v['lemma'] == label:
                                        v['RLY_E'] = True
                                        v['RLY_REASON'].add(h)
                        if debug:
                            pprint(hypothesis_template)
                            pprint(s)
            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
def clf_ger(clf, res, debug=False):
    """Zero-shot Germany filter: marks every token of a span with RLY_GER,
    then revokes it if the span is judged to be about foreign affairs
    ('Ausland') rather than Germany.
    """
    res_viz = []
    for i, (doc, doc_vizs) in enumerate(zip(res.spans, res.viz)):
        if i % 500 == 0:
            print(i, f'/{len(res.spans)}')
        doc_viz = []
        seen_span = set()
        # doc_vizs = Results.filter_viz(doc_vizs, on='start')
        for span in res.spans[doc]:
            span_id = (span[0], span[1])
            text = gendocs(doc)[span[0]:span[1]]
            # alternative filter from an earlier experiment:
            # viz = [v for v in doc_vizs if v['span_start'] == span[0] and v['start'] - v['span_start'] <= 2_400]
            viz = []
            if span_id not in seen_span:
                viz = [
                    v for v in doc_vizs
                    if v['span_start'] == span[0] and v['span_end'] == span[1]
                ]
                seen_span.add(span_id)
            for v in viz:
                v['RLY_GER'] = True
            # 1. check whether the span is about Germany
            hypothesis_template = 'Der Text handelt von {}'
            candidate_labels = ['Deutschland', 'Europa', 'Ausland']
            s = clf(text, candidate_labels, hypothesis_template, multi_class=True)
            id_ausland = s['labels'].index('Ausland')
            id_ger = s['labels'].index('Deutschland')
            # revoke RLY_GER if 'Deutschland' ranks last while 'Ausland' is
            # confident, or if 'Ausland' ranks first with twice the score
            if s['labels'][-1] == 'Deutschland' and s['scores'][id_ausland] > 0.5:
                for v in viz:
                    v['RLY_GER'] = False
            elif (s['labels'][0] == 'Ausland'
                  and s['scores'][id_ausland] / s['scores'][id_ger] > 2):
                for v in viz:
                    v['RLY_GER'] = False
            # earlier v2 of the Germany check:
            # hypothesis_template = 'Der Text beschreibt {}'
            # candidate_labels = ['Deutschland', 'Ausland']
            # s = clf(text, candidate_labels, hypothesis_template, multi_class=False)
            # if s['labels'][0] == 'Ausland' and s['scores'][0] >= 0.9:
            #     for v in viz:
            #         v['RLY_GER'] = False
            if debug:
                pprint(span_id)
                pprint(hypothesis_template)
                pprint(s)
            doc_viz.extend(viz)
        res_viz.append(doc_viz)
    return res_viz
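# A plausible end-to-end ordering of the stages above, written as a hedged
# sketch (the assignments back onto res.viz are an assumption about how the
# per-document results are meant to be chained, not taken from this file):
#
# from transformers import pipeline
# clf = pipeline("zero-shot-classification",
#                model='joeddav/xlm-roberta-large-xnli', device=-1)
# res.viz = clf_ger(clf, res)    # 1. Germany filter sets RLY_GER
# res.viz = clf_pop(clf, res)    # 2./3. people/elite checks set RLY_V, RLY_E
# res.viz = clf_demo(clf, res)   # system-critique check adds reason 'S'
# res.viz = coding(res)          # final TOK_IS_POP / SPAN_IS_POP coding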