def test_people_with_role(self):
        """Names with a leading role are split into role, title and name parts.

        Each candidate is ``(input, role, title, forename, surname)``; the
        last four entries are compared against the tuple returned by
        ``split_name_str`` for the input.
        """
        candidates = [
            ("Präsident Dr. Manfred Jürgenson von Kuchenhausen", "Präsident", "Dr.", "Manfred", "von Kuchenhausen")
        ]

        for name_str, *expected in candidates:
            # subTest labels a failure with the offending input and lets the
            # remaining candidates run instead of aborting on the first one.
            with self.subTest(name_str=name_str):
                self.assertTupleEqual(split_name_str(name_str), tuple(expected))
    def test_people_with_default_structure(self):
        """A plain "forename surname" string yields empty role and title.

        Each candidate is ``(input, role, title, forename, surname)``; the
        last four entries are compared against the tuple returned by
        ``split_name_str`` for the input.
        """
        candidates = [
            ("Vorname Nachname", "", "", "Vorname", "Nachname")
        ]

        for name_str, *expected in candidates:
            # subTest labels a failure with the offending input and lets the
            # remaining candidates run instead of aborting on the first one.
            with self.subTest(name_str=name_str):
                self.assertTupleEqual(split_name_str(name_str), tuple(expected))
    def test_people_with_titles(self):
        """Academic titles, including multi-part ones, end up in the title slot.

        Each candidate is ``(input, role, title, forename, surname)``; the
        last four entries are compared against the tuple returned by
        ``split_name_str`` for the input.
        """
        candidates = [
            ("Dr. Test Senior", "", "Dr.", "Test", "Senior"),
            ("Dr. h. c. Thomas Sattelberger", "", "Dr. h. c.", "Thomas", "Sattelberger"),
            ("Dr. Dr. h. c. Karl A. Lamers", "", "Dr. Dr. h. c.", "Karl", "Lamers"),
            ("B.Sc. Vorname Nachname", "", "B.Sc.", "Vorname", "Nachname"),
            ("Dr. h. c. Dr. Ing. e. h. Vorname Nachname", "", "Dr. h. c. Dr. Ing. e. h.", "Vorname", "Nachname"),
        ]

        for name_str, *expected in candidates:
            # subTest labels a failure with the offending input and lets the
            # remaining candidates run instead of aborting on the first one.
            with self.subTest(name_str=name_str):
                self.assertTupleEqual(split_name_str(name_str), tuple(expected))
    def test_people_with_multiple_forenames(self):
        """Only the first forename is kept; middle names and initials are dropped.

        Each candidate is ``(input, role, title, forename, surname)``; the
        last four entries are compared against the tuple returned by
        ``split_name_str`` for the input.
        """
        candidates = [
            ("Axel E. Fischer", "", "", "Axel", "Fischer"),
            ("Dr. Johann David Wadephul", "", "Dr.", "Johann", "Wadephul"),
            ("Bettina Margarethe Wiesmann", "", "", "Bettina", "Wiesmann"),
            ("Dr. Ernst Dieter Rossmann", "", "Dr.", "Ernst", "Rossmann"),
            ("Mariana Iris Harder-Kühnel", "", "", "Mariana", "Harder-Kühnel"),
            ("Tobias Matthias Peterka", "", "", "Tobias", "Peterka"),
            ("Eberhardt Alexander Gauland", "", "", "Eberhardt", "Gauland")
        ]

        for name_str, *expected in candidates:
            # subTest labels a failure with the offending input and lets the
            # remaining candidates run instead of aborting on the first one.
            with self.subTest(name_str=name_str):
                self.assertTupleEqual(split_name_str(name_str), tuple(expected))
    def test_people_with_noble_titles(self):
        """Nobility particles ("von", "de", "Frhr. von", ...) stay in the surname.

        Each candidate is ``(input, role, title, forename, surname)``; the
        last four entries are compared against the tuple returned by
        ``split_name_str`` for the input.
        """
        candidates = [
            ("Beatrix von Storch", "", "", "Beatrix", "von Storch"),
            ("Dr. Konstantin von Notz", "", "Dr.", "Konstantin", "von Notz"),
            ("Berengar Elsner von Gronow", "", "", "Berengar", "von Gronow"),
            ("Dr. Daniela De Ridder", "", "Dr.", "Daniela", "De Ridder"),
            ("Christian Frhr. von Stetten", "", "", "Christian", "Frhr. von Stetten"),
            ("Hans-Georg von der Marwitz", "", "", "Hans-Georg", "von der Marwitz"),
            ("Dr. Thomas de Maizière", "", "Dr.", "Thomas", "de Maizière")
        ]

        for name_str, *expected in candidates:
            # subTest labels a failure with the offending input and lets the
            # remaining candidates run instead of aborting on the first one.
            with self.subTest(name_str=name_str):
                self.assertTupleEqual(split_name_str(name_str), tuple(expected))
def _build_mdb(person_str, add_debug_obj):
    """Build an MDB (or a MalformedMDB) from a free-text person string.

    Text outside of brackets is joined into the person's name, text inside
    ``[...]`` / ``(...)`` groups is treated as metadata and searched for a
    faction. Anything following the last bracket group is ignored.

    NOTE(review): a string without any bracket group leaves the name empty;
    presumably that ends up as a MalformedMDB - confirm against
    ``split_name_str``.

    Args:
        person_str: Raw text describing the person, optionally prefixed with
            "Abg." and followed by bracketed metadata.
        add_debug_obj: When truthy, a debug dict recording the (normalized)
            source string is attached to the created MDB.

    Returns:
        A ``MalformedMDB`` when the extracted name looks implausible,
        otherwise whatever ``MDB.find_and_add_in_storage`` returns.

    Raises:
        AssertionError: If a metadata group mentions more than one faction.
    """
    # The following lines are a workaround for the optional matching group
    # for the "Abg." string that does not work. If someone finds a way to get
    # this optional matching group working, feel free to remove them.
    abg_marker = "Abg."
    marker_idx = person_str.find(abg_marker)
    if marker_idx >= 0:
        cut_idx = person_str.find(" ", marker_idx)
        if cut_idx < 0:
            # No space after the marker: str.find returned -1, which as a
            # slice start would keep only the last character of the string.
            # Cut directly behind the marker instead.
            cut_idx = marker_idx + len(abg_marker)
        person_str = person_str[cut_idx:].strip()

    # Normalize round brackets so only one bracket type has to be handled.
    person_str = person_str.replace("(", "[")
    person_str = person_str.replace(")", "]")

    num_opening_brackets = person_str.count("[")
    num_closing_brackets = person_str.count("]")

    if num_opening_brackets != num_closing_brackets:
        logger.warning(
            "the received person_str \"{}\" contains not the same amount of "
            "opening brackets as closing brackets. this might become a "
            "problem shortly after this...".format(person_str))

    if num_opening_brackets > num_closing_brackets:
        person_str = person_str.lstrip("[")

    # Alternately split the string into name parts (outside brackets) and
    # metadata parts (inside brackets).
    work_str = person_str
    person_parts = []
    metadata_parts = []
    while "[" in work_str:
        start_idx = work_str.find("[")
        close_idx = work_str.find("]", start_idx)
        # An unclosed "[" used to yield end_idx == 0, so work_str never got
        # shorter and the loop never terminated. Treat the remainder of the
        # string as the metadata of the unclosed group instead.
        end_idx = close_idx + 1 if close_idx >= 0 else len(work_str)

        person_parts.append(work_str[:start_idx].strip())
        metadata_parts.append(work_str[start_idx:end_idx].strip().strip("[]"))
        work_str = work_str[end_idx:]

    full_name = " ".join(person_parts)

    # The first metadata group that mentions a faction decides the faction.
    faction = ""
    for mp in metadata_parts:
        found_factions = Faction.in_text(mp)
        if found_factions:
            if len(found_factions) != 1:
                logger.info(f"Found factions != 1: {found_factions}")
            # NOTE(review): this assert is skipped under "python -O".
            assert len(found_factions) == 1
            faction = found_factions[0]
            break

    membership = []
    if faction:
        membership = [(datetime.min, None, faction)]

    # Remove stray spaces around hyphens.
    full_name = full_name.replace("- ", "-")
    full_name = full_name.replace(" -", "-")

    role, title, forename, surname = split_name_str(full_name)

    # detection of malformed extractions (will later remove interactions with malformed MDB)
    # check that forename and surname are filled and have more than one char in them
    malformed = not forename or len(forename) <= 1
    malformed = malformed or not surname or len(surname) <= 1
    # check if forename starts with a small char
    malformed = malformed or forename[0].islower()
    if not malformed:
        extended_keywords = keywords.copy()
        extended_keywords.update([
            "am", "um", "ne", "wo", "Wo", ".", "-", "der", "die", "das", "des",
            "von", "an", "h", "h."
        ])
        # The name is malformed if any of its space-separated tokens, or the
        # lower-cased forename/surname, is one of the keywords.
        name_candidates = full_name.split(" ")
        name_candidates += [forename.lower(), surname.lower()]
        malformed = any(c in extended_keywords for c in name_candidates)

    if malformed:
        return MalformedMDB(person_str, forename, surname, membership)

    debug_info = None
    if add_debug_obj:
        debug_info = {
            "constructed_from_text": True,
            "creation_person_str": person_str
        }

    return MDB.find_and_add_in_storage(forename=forename,
                                       surname=surname,
                                       memberships=membership,
                                       job_title=role,
                                       debug_info=debug_info,
                                       created_by="_buildMdb")
Esempio n. 7
0
    def _extract(
            block_el: bs4e.Tag,
            curr_speaker: MDB = None,
            curr_paragraph: str = None) \
            -> List[InteractionCandidate]:
        """Collect interaction candidates from the children of ``block_el``.

        The children are walked in document order while tracking the current
        speaker and the currently open paragraph. A candidate is emitted
        whenever an open paragraph is closed, either by a following paragraph
        (``comment=None``) or by a ``kommentar`` element (comment attached).
        ``rede`` elements are processed recursively; only the candidates of
        the recursive call are used, its speaker/paragraph state is not
        carried back into this level.

        Args:
            block_el: Element whose children are scanned.
            curr_speaker: Speaker active on entry; either an ``MDB`` or a
                dict of keyword arguments for ``MDB.find_and_add_in_storage``.
            curr_paragraph: Paragraph text that is still open on entry.

        Returns:
            The interaction candidates found, in document order.
        """

        def _as_mdb(speaker):
            # The speaker is tracked either as a ready MDB or as a dict of
            # keyword arguments that is only resolved once it is needed.
            if isinstance(speaker, MDB):
                return speaker
            return MDB.find_and_add_in_storage(**speaker, created_by="manualXmlParser")

        found = []
        for node in block_el:
            # there are random line breaks in the file which BeautifulSoup
            # makes accessible but we don't need
            if isinstance(node, bs4e.NavigableString):
                continue

            # Speaker given as plain text (name tag or paragraph of class "N").
            if node.name == "name" or (node.name == "p" and node.get("klasse") == "N"):
                speaker_text = cleanup_str(node.getText().rstrip(":"))
                role, title, first_name, last_name = split_name_str(speaker_text)
                curr_speaker = {
                    "forename": cleanup_str(first_name),
                    "surname": cleanup_str(last_name),
                    "memberships": [(datetime.min, None, Faction.NONE)],
                    "job_title": role,
                    "title": title
                }
                continue

            # Nested speech: descend with the current state.
            if node.name == "rede":
                found.extend(_extract(node, curr_speaker, curr_paragraph))
                continue

            if node.name == "p":
                category = node.get("klasse")

                if category == "redner":
                    # workaround for the situation in which the fraktion tags in
                    # the xml somehow contain a direct speech formatted like this "SPD: ja."
                    faction_txt = _safe_get_text(node.redner, "fraktion")
                    if ":" in faction_txt:
                        faction_txt = faction_txt.split(":")[0].strip()

                    # TODO: Proper name and integrate into find_in_storage
                    curr_speaker = {
                        "mdb_number": node.redner.get("id"),
                        "forename": _safe_get_text(node.redner, "vorname"),
                        "surname": _safe_get_text(node.redner, "nachname"),
                        "memberships": [(datetime.min, None, Faction.from_name(faction_txt))],
                        "job_title": _safe_get_text(node.redner, "rolle_lang")}

                elif category in ("J", "J_1", "O", "Z"):
                    next_paragraph = cleanup_str(node.getText())
                    # A previously open paragraph is closed without a comment.
                    # If no speaker is known yet, it is silently dropped.
                    if curr_paragraph is not None and curr_speaker:
                        speaker = _as_mdb(curr_speaker)
                        found.append(InteractionCandidate(
                            speaker=speaker,
                            paragraph=curr_paragraph,
                            comment=None))
                    curr_paragraph = next_paragraph

                else:
                    logger.debug("Ignoring unhandled category \"{}\" of tag "
                                 "p.".format(category))
                continue

            if node.name != "kommentar":
                # Any other tag is ignored.
                continue

            if not curr_speaker:
                if logging_is_needed(node.getText()):
                    logger.warning(
                        "found a comment but there has been no speaker so far"
                        "! skipping it (\"{}\") until we find a speaker...".format(
                            cleanup_str(node.getText())))
                continue

            if not curr_paragraph:
                logger.warning(
                    "found a comment but there has been no paragraph so far"
                    "! skipping it (\"{}\") until we find a paragraph...".format(
                        cleanup_str(node.getText())))
                continue

            # The comment closes the open paragraph.
            speaker = _as_mdb(curr_speaker)
            found.append(InteractionCandidate(
                speaker=speaker,
                paragraph=curr_paragraph,
                comment=cleanup_str(node.getText())))
            curr_paragraph = None

        # finish still open curr_paragraph
        if curr_paragraph is None:
            return found

        if not curr_speaker:
            logger.warning(
                "found a open paragraph but there has been no speaker so far"
                "! skipping it (\"{}\"), but this should be investigated as it "
                "means no speaker in the whole block has been found".format(
                    cleanup_str(curr_paragraph)))
            return found

        speaker = _as_mdb(curr_speaker)
        found.append(InteractionCandidate(
            speaker=speaker,
            paragraph=curr_paragraph,
            comment=None))

        return found