def update_mdbs_from_crawler(file: Path):
    """Synchronize the MDB storage with person records from the crawler.

    Reads person records either from a local JSON dump (if *file* is given)
    or from the remote crawler database, converts each person's faction
    timeframes into membership tuples and upserts them into MDB storage.

    Args:
        file: optional path to a JSON dump acting as a fallback for the
            remote crawler database (useful for local development).

    Raises:
        ConnectionError: if neither the file nor the remote crawler
            database could be read.
    """
    try:
        if file:
            # use a context manager so the file handle is closed again
            # (the original open(...).read() leaked the handle)
            with open(file.absolute(), "r") as fh:
                persons = json.load(fh)
        else:
            persons = database.get_crawler_db()["person"].find({})
    except Exception as e:
        # narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; chain the cause for debuggability
        raise ConnectionError(
            "Can't connect to remote crawler db. If you're developing locally you must specify a equivalent "
            "json with --file as fallback.") from e

    for p in persons:
        memberships = []
        for timeframe in p["fraktionen"]:
            # 'austrittsDatum' only exists when the person left the faction
            austrittsdatum = None
            if 'austrittsDatum' in timeframe:
                austrittsdatum = get_safe_datetime(timeframe['austrittsDatum'])
            eintrittsdatum = get_safe_datetime(timeframe['eintrittsDatum'])
            membership = (
                eintrittsdatum,
                austrittsdatum,
                Faction.from_mdb_description(timeframe["beschreibung"]))
            memberships.append(membership)

        # will auto create MDB if not yet existent
        MDB.find_and_add_in_storage(
            p['vorname'], p['nachname'], memberships, p['_id'],
            get_safe_datetime(p['geburtsdatum']), p['geburtsort'],
            p['titel'], p['beruf'], initial=True, created_by="init")
def retrieve_paragraph_keymap(add_debug_obj: bool = False):
    """Build a surname -> MDB keymap for referencing speakers in paragraphs.

    Surnames appearing more than once among known MDBs are discarded as
    ambiguous. For each remaining unique surname the matching MDB document
    is loaded from the database.

    Args:
        add_debug_obj: currently unused; kept for interface compatibility.

    Returns:
        dict mapping unique surnames to MDB instances (entries whose DB
        lookup does not yield exactly one document keep the surname string).
    """
    # fetch person list from mdb database
    person_keymap = {}
    mdb_list = MDB.find_known_mdbs()
    ignored_mdbs = []
    if len(mdb_list) > 0:
        for mdb in mdb_list:
            keyword = mdb['surname']
            # TODO disambiguation improvement
            # for now, we opt to look for
            # mdb references only by their surname, as we have no method
            # to contextualize role- or forename references enough to tell
            # who's been adressed. Even in this solution, we discard any names
            # that appear multiple times in our database, as we again have no
            # system in place to figure out which entity is meant.
            if keyword not in person_keymap:
                if keyword not in ignored_mdbs:
                    person_keymap[keyword] = keyword
            else:
                person_keymap.pop(keyword)
                ignored_mdbs.append(keyword)

        for k in person_keymap.keys():
            people = database.find_many("mdb", {"surname": k})
            # fixed: the original `not len(people) > 1` also let an empty
            # result through and crashed with an IndexError on people[0];
            # only materialize the MDB when exactly one document matched
            if len(people) == 1:
                person_keymap[k] = MDB(**people[0])
    return person_keymap
def manual_import(args):
    """Import transcript files given on the command line into the database.

    Accepts files and directories (expanded one level deep). JSON files may
    contain multiple transcripts; XML files contain exactly one. In dry-run
    mode nothing is written to the database; instead the converted
    communication model and the runtime MDB storage are dumped next to the
    input file.

    Args:
        args: parsed CLI namespace; uses args.files, args.dry_run,
            args.add_debug_objects and args.notify.
    """
    if args.dry_run:
        # keep MDBs in memory only so a dry run has no DB side effects
        MDB.set_storage_mode("runtime")

    # expand directories into their directly contained files
    files = []
    for file in args.files:
        if file.is_file():
            files.append(file)
        elif file.is_dir():
            for sub_file in list(file.iterdir()):
                if sub_file.is_file():
                    files.append(sub_file)

    for file in files:
        logger.info("reading \"{}\" now...".format(file.as_posix()))

        transcripts = list()
        if file.suffix.lower() == ".json":
            logger.info("reading json based transcript file now...")
            file_content = read_transcripts_json_file(file)
        else:
            logger.info("reading xml based transcript file now...")
            file_content = [read_transcript_xml_file(file)]

        # fixed: the message had no placeholder, so the trailing
        # .format(file.as_posix()) was a silent no-op
        logger.info("extracting communication model now...")
        for metadata, inter_candidates in file_content:
            transcript = Transcript.from_interactions(
                metadata=metadata,
                interactions=extract_communication_model(
                    candidates=inter_candidates,
                    add_debug_objects=args.add_debug_objects))

            # insert into DB
            if not args.dry_run:
                transcript_dict = transcript.dict(exclude_none=True, exclude_unset=True)
                logger.info(f"writing transcript with '{len(transcript_dict['interactions'])}' interactions into db.")
                database.update_one(
                    "session",
                    {"session_id": transcript.session_no},
                    transcript_dict)

            transcripts.append(transcript)

            # notify sentiment group
            if args.notify and transcript:
                utils.notify_sentiment_analysis_group([str(transcript.session_no)])

        cm = CommunicationModel(transcripts=transcripts)
        if args.dry_run:
            out_file: Path = file.with_suffix(".converted.json")
            logger.info("writing transcripts into {}.".format(out_file.absolute().as_posix()))
            with open(out_file, "w", encoding="utf-8") as o:
                o.write(cm.json(exclude_none=True, indent=4, ensure_ascii=False))
            with open(out_file.parent / "mdb.json", "w", encoding="utf-8") as o:
                safe_json_dump(MDB._mdb_runtime_storage, o)
def _build_candidate(comment: str) -> InteractionCandidate:
    """Wrap *comment* in an InteractionCandidate spoken by a dummy test MDB."""
    dummy_speaker = MDB.find_and_add_in_storage(
        forename="Likey",
        surname="McUnittest",
        memberships=[(datetime.min, None, Faction.NONE)])
    return InteractionCandidate(
        speaker=dummy_speaker,
        paragraph="Unittest",
        comment=comment)
def test_extract_funny_sample_4(self):
    """Laughter attributed to one named MDB yields a single interaction."""
    raw = "(Heiterkeit des Abg. Manfred Grund [CDU/CSU])"
    result = extract_communication_model([_build_candidate(raw)])

    expected_sender = MDB.find_and_add_in_storage(
        forename="Manfred",
        surname="Grund",
        memberships=[(datetime.min, None, Faction.CDU_AND_CSU)])
    self.assertEqual(result[0].sender, expected_sender)
    self.assertEqual(result[0].message, 'Heiterkeit des Abg. Manfred Grund [CDU/CSU]')
def _get_candidates(topic_points: List[Dict], speaker_map: Dict[str, MDB]) -> List[InteractionCandidate]:
    """Pair speech paragraphs with their trailing comments into candidates.

    Walks the crawler topic points, keeping the last seen paragraph buffered
    so a following "kommentar" part can be attached to it. Speakers are
    resolved via *speaker_map* first and the "mdb" DB collection second;
    unresolvable speaker ids are collected and logged once at the end.

    Args:
        topic_points: raw topic point dicts (only "sitzungsbeginn" and
            "tagesordnungspunkt" entries are processed).
        speaker_map: id -> MDB cache; extended in place with DB lookups.

    Returns:
        list of InteractionCandidate objects in document order.
    """
    candidates = list()
    not_in_speaker_list = list()
    for tp in topic_points:
        if tp["ablaufTyp"].lower() not in ["sitzungsbeginn", "tagesordnungspunkt"]:
            continue

        # buffers the most recent paragraph until a comment (or the next
        # paragraph) decides what happens to it
        last_paragraph = None
        speeches = tp.get("reden", list())
        for sp in speeches:
            # why is this not in all objects?
            if "redeInhalt" not in sp:
                continue

            speaker = speaker_map.get(sp["rednerId"])
            if not speaker:
                # try to get speaker through mdb_number from DB
                speaker = database.find_one('mdb', {'mdb_number': sp["rednerId"]})
                if not speaker:
                    # remember unknown ids (deduplicated) for the summary log
                    if sp['rednerId'] not in not_in_speaker_list:
                        not_in_speaker_list.append(sp['rednerId'])
                    continue
                speaker = MDB(**speaker)
                # cache the DB hit so later speeches skip the lookup
                speaker_map[sp['rednerId']] = speaker

            for sp_part in sp["redeInhalt"]:
                part_type = sp_part["typ"]
                if last_paragraph is not None and part_type.lower() == "paragraf":
                    # a new paragraph closes the buffered one without comment
                    candidates.append(InteractionCandidate(
                        speaker=speaker,
                        paragraph=utils.cleanup_str(last_paragraph),
                        comment=None))
                    last_paragraph = sp_part["text"]
                elif part_type.lower() == "kommentar":
                    # a comment closes the buffered paragraph with itself
                    if last_paragraph and speaker:
                        candidates.append(InteractionCandidate(
                            speaker=speaker,
                            paragraph=utils.cleanup_str(last_paragraph),
                            comment=utils.cleanup_str(sp_part["text"])))
                    last_paragraph = None
                else:
                    last_paragraph = sp_part["text"]

    logger.warning(f"Following speakers were not in the speaker list: {not_in_speaker_list}")
    return candidates
def test_extract_sample3(self):
    """A named heckler plus an anonymous faction shout split into two interactions."""
    raw = "(Carsten Schneider [Erfurt] [SPD]: Was für ein Blödsinn! – Zuruf vom BÜNDNIS90/DIE GRÜNEN: Vielleicht mal lesen! Lesen bildet!)"
    result = extract_communication_model([_build_candidate(raw)])

    schneider = MDB.find_and_add_in_storage(
        forename="Carsten",
        surname="Schneider",
        memberships=[(datetime.min, None, Faction.SPD)])
    self.assertEqual(result[0].sender, schneider)
    self.assertEqual(result[0].message, 'Was für ein Blödsinn!')
    self.assertEqual(result[1].sender, Faction.DIE_GRÜNEN)
    self.assertEqual(result[1].message, 'Vielleicht mal lesen! Lesen bildet!')
def test_extract_sample2(self):
    """Applause by five factions plus one named shout yields six interactions."""
    raw = ("(Beifall bei der CDU/CSU, der SPD, der FDP, der LINKEN und dem "
           "BÜNDNIS 90/DIE GRÜNEN – Zuruf des Abg. Armin-Paulus Hampel [AfD])")
    result = extract_communication_model([_build_candidate(raw)])

    applause_msg = 'Beifall bei der CDU/CSU, der SPD, der FDP, der LINKEN und dem BÜNDNIS 90/DIE GRÜNEN'
    expected_factions = [
        Faction.CDU_AND_CSU,
        Faction.SPD,
        Faction.FDP,
        Faction.DIE_LINKE,
        Faction.DIE_GRÜNEN,
    ]
    # the applause is fanned out to one interaction per faction,
    # all carrying the same message text
    for idx, faction in enumerate(expected_factions):
        self.assertEqual(result[idx].sender, faction)
        self.assertEqual(result[idx].message, applause_msg)

    hampel = MDB.find_and_add_in_storage(
        forename="Armin-Paulus",
        surname="Hampel",
        memberships=[(datetime.min, None, Faction.AFD)])
    self.assertEqual(result[5].sender, hampel)
    self.assertEqual(result[5].message, 'Zuruf des Abg. Armin-Paulus Hampel [AfD]')
def _convert_speaker(speaker_map: Dict[str, Dict]):
    """Convert raw crawler speaker dicts into MDB instances.

    Args:
        speaker_map: mapping of speaker ids to raw person dicts as delivered
            by the crawler JSON (keys like "vorname", "fraktionen", ...).

    Returns:
        dict mapping each person's "_id" to the MDB created/found in storage.
    """
    def _fix_factions(factions) -> List[Tuple[datetime, datetime, Faction]]:
        # normalize raw faction dicts into (start, end, faction-value) tuples,
        # parsing ISO date strings where the value is not a datetime yet
        fixed_factions = list()
        for f in factions:
            austrittsdatum = None
            if 'austrittsDatum' in f:
                austrittsdatum = f["austrittsDatum"]
                if not isinstance(f["austrittsDatum"], datetime):
                    austrittsdatum = datetime.fromisoformat(f["austrittsDatum"])
            eintrittsdatum = f["eintrittsDatum"]
            if not isinstance(f["eintrittsDatum"], datetime):
                eintrittsdatum = datetime.fromisoformat(f["eintrittsDatum"])
            fixed_factions.append((
                eintrittsdatum,
                austrittsdatum,
                Faction.from_mdb_description(f["beschreibung"]).value))
        return fixed_factions

    conv_map = dict()
    for k, v in speaker_map.items():
        birthday = v.get("geburtsdatum")
        # fixed: the original tested isinstance(v, str), which is never true
        # for a dict, so ISO formatted birthdays were never parsed
        if isinstance(birthday, str):
            birthday = datetime.fromisoformat(birthday)
        conv_map[v["_id"]] = MDB.find_and_add_in_storage(
            mdb_number=v["_id"],
            forename=utils.cleanup_str(v["vorname"]),
            surname=utils.cleanup_str(v["nachname"]),
            memberships=_fix_factions(v.get("fraktionen", list())),
            birthday=birthday,
            birthplace=utils.cleanup_str(v.get("geburtsort")),
            title=utils.cleanup_str(v.get("title")),
            job_title=utils.cleanup_str(v.get("beruf", "")),
            created_by="jsonParse")
    return conv_map
def test_extract_funny_sample_3(self):
    """Cheer/applause by a faction and one extra MDB produce two interactions."""
    raw = "(Heiterkeit und Beifall bei der CDU/CSU sowie des Abg. Jens Beeck [FDP])"
    result = extract_communication_model([_build_candidate(raw)])

    # both senders carry the full original comment text as the message
    shared_msg = 'Heiterkeit und Beifall bei der CDU/CSU sowie des Abg. Jens Beeck [FDP]'
    self.assertEqual(result[0].sender, Faction.CDU_AND_CSU)
    self.assertEqual(result[0].message, shared_msg)

    beeck = MDB.find_and_add_in_storage(
        forename="Jens",
        surname="Beeck",
        memberships=[(datetime.min, None, Faction.FDP)])
    self.assertEqual(result[1].sender, beeck)
    self.assertEqual(result[1].message, shared_msg)
def test_extract_funny_sample2(self):
    """Faction applause plus a named interjection splits into three interactions."""
    raw = "(Beifall bei der SPD sowie bei Abgeordneten der LINKEN – Matthias W. Birkwald [DIE LINKE]: Ich mich auch!)"
    result = extract_communication_model([_build_candidate(raw)])

    applause_msg = 'Beifall bei der SPD sowie bei Abgeordneten der LINKEN'
    for idx, faction in enumerate([Faction.SPD, Faction.DIE_LINKE]):
        self.assertEqual(result[idx].sender, faction)
        self.assertEqual(result[idx].message, applause_msg)

    birkwald = MDB.find_and_add_in_storage(
        forename="Matthias W.",
        surname="Birkwald",
        memberships=[(datetime.min, None, Faction.DIE_LINKE)])
    self.assertEqual(result[2].sender, birkwald)
    self.assertEqual(result[2].message, 'Ich mich auch!')
def test_extract_sample1(self):
    """Applause by four factions plus a named heckle yields five interactions."""
    raw = "(Beifall bei der FDP sowie bei Abgeordneten der CDU/CSU, der SPD und des BÜNDNISSES 90/DIE GRÜNEN – Dr. Eberhardt Alexander Gauland [AfD]: Ha, ha, ha!)"
    result = extract_communication_model([_build_candidate(raw)])

    applause_msg = ('Beifall bei der FDP sowie bei Abgeordneten der CDU/CSU, der SPD und '
                    'des BÜNDNISSES 90/DIE GRÜNEN')
    expected_factions = [
        Faction.FDP,
        Faction.CDU_AND_CSU,
        Faction.SPD,
        Faction.DIE_GRÜNEN,
    ]
    # the applause part is fanned out to one interaction per faction
    for idx, faction in enumerate(expected_factions):
        self.assertEqual(result[idx].sender, faction)
        self.assertEqual(result[idx].message, applause_msg)

    gauland = MDB.find_and_add_in_storage(
        forename="Eberhardt Alexander",
        surname="Gauland",
        memberships=[(datetime.min, None, Faction.AFD)])
    self.assertEqual(result[4].sender, gauland)
    self.assertEqual(result[4].message, 'Ha, ha, ha!')
def _build_mdb(person_str, add_debug_obj):
    """Parse a person reference string like "Abg. Jens Beeck [FDP]" into an MDB.

    Strips a leading "Abg." token, normalizes parentheses to brackets,
    separates name parts from bracketed metadata, derives a faction
    membership from the metadata and runs a set of heuristics to reject
    malformed extractions.

    Args:
        person_str: raw person reference text from a comment.
        add_debug_obj: if True, attach the creation string as debug info.

    Returns:
        an MDB from storage, or a MalformedMDB when the heuristics decide
        the extraction is unusable.
    """
    # the following lines are a workaround for the somehow not working
    # optional matching group for the Abg. string. If someone finds a way to
    # get this optional matching group working feel free to remove also
    # remove the following lines
    cut_idx = person_str.find("Abg.")
    if cut_idx >= 0:
        cut_idx = person_str.find(" ", cut_idx)
        person_str = person_str[cut_idx:].strip()

    # normalize parentheses so only one bracket style has to be handled below
    person_str = person_str.replace("(", "[")
    person_str = person_str.replace(")", "]")

    num_opening_brackets = person_str.count("[")
    num_closing_brackets = person_str.count("]")
    if num_opening_brackets != num_closing_brackets:
        logger.warning(
            "the received person_str \"{}\" contains not the same amount of "
            "opening brackets as closing brackets. this might become a "
            "problem shortly after this...".format(person_str))
        if num_opening_brackets > num_closing_brackets:
            person_str = person_str.lstrip("[")

    # split the string into name fragments and bracketed metadata fragments
    work_str = person_str
    person_parts = list()
    metadata_parts = list()
    while "[" in work_str:
        start_idx = work_str.find("[")
        end_idx = work_str.find("]", start_idx) + 1
        person_parts.append(work_str[:start_idx].strip())
        metadata_parts.append(work_str[start_idx:end_idx].strip().strip("[]"))
        work_str = work_str[end_idx:]

    full_name = " ".join(person_parts)

    # take the first metadata fragment that names exactly one faction
    faction = ""
    for mp in metadata_parts:
        found_factions = Faction.in_text(mp)
        if found_factions:
            if len(found_factions) != 1:
                logger.info(f"Found factions != 1: {found_factions}")
            assert len(found_factions) == 1
            faction = found_factions[0]
            break

    membership = list()
    if faction:
        membership = [(datetime.min, None, faction)]

    # re-join hyphenated names that got split around the hyphen
    full_name = full_name.replace("- ", "-")
    full_name = full_name.replace(" -", "-")
    role, title, forename, surname = split_name_str(full_name)

    # detection of malformed extractions (will later remove interactions with malformed MDB)
    # check that forename and surname are filled and have more than one char in them
    malformed = not forename or len(forename) <= 1
    malformed = malformed or not surname or len(surname) <= 1
    # check if forename starts with a small char
    # (safe: `or` short-circuits when forename is empty and malformed is already True)
    malformed = malformed or forename[0].islower()

    if not malformed:
        # reject names that contain common filler words / stray tokens,
        # which indicates the extraction grabbed non-name text
        extended_keywords = keywords.copy()
        extended_keywords.update([
            "am", "um", "ne", "wo", "Wo", ".", "-", "der", "die", "das",
            "des", "von", "an", "h", "h."
        ])
        for k in extended_keywords:
            malformed = malformed or k in full_name.split(" ")
            malformed = malformed or k == forename.lower(
            ) or k == surname.lower()
            if malformed:
                break

    if malformed:
        return MalformedMDB(person_str, forename, surname, membership)

    debug_info = None
    if add_debug_obj:
        debug_info = {
            "constructed_from_text": True,
            "creation_person_str": person_str
        }

    return MDB.find_and_add_in_storage(forename=forename,
                                       surname=surname,
                                       memberships=membership,
                                       job_title=role,
                                       debug_info=debug_info,
                                       created_by="_buildMdb")
import unittest from datetime import datetime from cme.domain import InteractionCandidate, MDB, Faction from cme.extraction import extract_communication_model MDB.set_storage_mode("runtime") def _build_candidate(comment: str) -> InteractionCandidate: return InteractionCandidate(speaker=MDB.find_and_add_in_storage( forename="Likey", surname="McUnittest", memberships=[(datetime.min, None, Faction.NONE)]), paragraph="Unittest", comment=comment) class TestExtraction(unittest.TestCase): def test_extract_sample1(self): comment = "(Beifall bei der FDP sowie bei Abgeordneten der CDU/CSU, der SPD und des BÜNDNISSES 90/DIE GRÜNEN – Dr. Eberhardt Alexander Gauland [AfD]: Ha, ha, ha!)" cm = extract_communication_model([_build_candidate(comment)]) interaction_0 = cm[0] interaction_1 = cm[1] interaction_2 = cm[2] interaction_3 = cm[3] interaction_4 = cm[4] self.assertEqual(interaction_0.sender, Faction.FDP) self.assertEqual( interaction_0.message,
def _extract(
        block_el: bs4e.Tag,
        curr_speaker: MDB = None,
        curr_paragraph: str = None) \
        -> List[InteractionCandidate]:
    """Recursively walk a transcript XML block and emit interaction candidates.

    Tracks the current speaker and buffers the current paragraph; a
    "kommentar" element closes the buffered paragraph with its text, a new
    paragraph closes it without a comment.

    Args:
        block_el: the BeautifulSoup tag to walk.
        curr_speaker: speaker carried over from the enclosing block
            (either an MDB or a raw kwargs dict for find_and_add_in_storage).
        curr_paragraph: paragraph carried over from the enclosing block.

    Returns:
        list of InteractionCandidate objects in document order.
    """
    pms = list()
    for el in block_el:
        # there are random line breaks in the file which BeautifulSoup
        # makes accessible but we don't need
        if isinstance(el, bs4e.NavigableString):
            continue
        elif el.name == "name" or (el.name == "p" and el.get("klasse") == "N"):
            # a name tag (or a p tag of class "N") introduces a new speaker
            role, title, first_name, last_name = split_name_str(
                cleanup_str(el.getText().rstrip(":")))
            curr_speaker = {
                "forename": cleanup_str(first_name),
                "surname": cleanup_str(last_name),
                "memberships": [(datetime.min, None, Faction.NONE)],
                "job_title": role,
                "title": title
            }
        elif el.name == "rede":
            # recurse into speech blocks, carrying speaker/paragraph state
            pms += _extract(el, curr_speaker, curr_paragraph)
        elif el.name == "p":
            category = el.get("klasse")
            if category == "redner":
                # workaround for the situation in which the fraktion tags in
                # the xml somehow contain a direct speech formatted like this "SPD: ja."
                faction_txt = _safe_get_text(el.redner, "fraktion")
                if ":" in faction_txt:
                    faction_txt = faction_txt.split(":")[0].strip()

                # TODO: Proper name and integrate into find_in_storage
                curr_speaker = {
                    "mdb_number": el.redner.get("id"),
                    "forename": _safe_get_text(el.redner, "vorname"),
                    "surname": _safe_get_text(el.redner, "nachname"),
                    "memberships": [(datetime.min, None, Faction.from_name(faction_txt))],
                    "job_title": _safe_get_text(el.redner, "rolle_lang")}
            elif category in ["J", "J_1", "O", "Z"]:
                # a new paragraph: flush the buffered one (without comment)
                # if we have a speaker for it, otherwise drop it silently
                new_para_str = cleanup_str(el.getText())
                if curr_paragraph is not None:
                    if not curr_speaker:
                        # logger.warning(
                        #     "found a new paragraph but couldn't finish "
                        #     "the old one as there has been no speaker so "
                        #     "far! dropping the old one (\"{}\") now...".format(curr_paragraph))
                        curr_paragraph = new_para_str
                        continue
                    speaker = curr_speaker if isinstance(curr_speaker, MDB) \
                        else MDB.find_and_add_in_storage(
                            **curr_speaker, created_by="manualXmlParser")
                    pms.append(InteractionCandidate(
                        speaker=speaker,
                        paragraph=curr_paragraph,
                        comment=None))
                curr_paragraph = new_para_str
            else:
                logger.debug("Ignoring unhandled category \"{}\" of tag "
                             "p.".format(category))
        elif el.name == "kommentar":
            # a comment needs both a speaker and a paragraph to attach to
            if not curr_speaker:
                if logging_is_needed(el.getText()):
                    logger.warning(
                        "found a comment but there has been no speaker so far"
                        "! skipping it (\"{}\") until we find a speaker...".format(
                            cleanup_str(el.getText())))
                continue
            if not curr_paragraph:
                logger.warning(
                    "found a comment but there has been no paragraph so far"
                    "! skipping it (\"{}\") until we find a paragraph...".format(
                        cleanup_str(el.getText())))
                continue
            speaker = curr_speaker if isinstance(curr_speaker, MDB) \
                else MDB.find_and_add_in_storage(
                    **curr_speaker, created_by="manualXmlParser")
            pms.append(InteractionCandidate(
                speaker=speaker,
                paragraph=curr_paragraph,
                comment=cleanup_str(el.getText())))
            curr_paragraph = None

    # finish still open curr_paragraph
    if curr_paragraph is not None:
        if not curr_speaker:
            logger.warning(
                "found a open paragraph but there has been no speaker so far"
                "! skipping it (\"{}\"), but this should be investigated as it "
                "means no speaker in the whole block has been found".format(
                    cleanup_str(curr_paragraph)))
            return pms
        speaker = curr_speaker if isinstance(curr_speaker, MDB) \
            else MDB.find_and_add_in_storage(
                **curr_speaker, created_by="manualXmlParser")
        pms.append(InteractionCandidate(
            speaker=speaker,
            paragraph=curr_paragraph,
            comment=None))
    return pms