Ejemplos de clean_text en Python, ejemplos de scrc.utils.main_utils.clean_text en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: judgment_extracting_functions.py Proyecto: JoelNiklaus/SwissCourtRulingCorpus

def CH_BGer(rulings: str, namespace: dict) -> Optional[List[Judgment]]:
    """
    Determine the judgment outcomes expressed in the rulings of a decision
    :param rulings:     the rulings text of the decision
    :param namespace:   metadata of the court decision ('language' and 'html_url' are read)
    :return:            the judgments found by get_judgments, without the full
                        approval/dismissal when the partial variant is present
    :raises ValueError: if the language has no judgment markers or no judgment is found
    """
    language = namespace['language']
    if language not in all_judgment_markers:
        supported_languages = list(all_judgment_markers.keys())
        raise ValueError(f"This function is only implemented for the languages {supported_languages} so far.")

    # normalise the text up front so unicode oddities cannot break the matching
    cleaned_rulings = clean_text(rulings)
    found = get_judgments(cleaned_rulings, namespace)

    if not found:
        raise ValueError(
            f"Found no judgment for the rulings \"{cleaned_rulings}\" in the case {namespace['html_url']}. Please check!"
        )

    if len(found) > 1:
        # a partial approval is always accompanied by a plain approval match, so drop the latter
        if Judgment.PARTIAL_APPROVAL in found:
            found.discard(Judgment.APPROVAL)
        # same reasoning for partial dismissal vs. dismissal
        if Judgment.PARTIAL_DISMISSAL in found:
            found.discard(Judgment.DISMISSAL)

    return found

Ejemplo n.º 2

0

Mostrar archivo

def get_pdf_paragraphs(soup: str) -> list:
    """
    Split the text extracted from a pdf decision into cleaned paragraphs
    :param soup:    the string extracted from the pdf
    :return:        the non-empty paragraphs, each one passed through clean_text
    """
    # a line holding only spaces counts as an empty line
    normalized = re.sub('\\n +\\n', '\\n\\n', soup)
    # an empty line separates two paragraphs; double spaces are reduced before cleaning
    cleaned_chunks = (
        clean_text(chunk.replace('  ', ' '))
        for chunk in normalized.split('\n\n')
    )
    # keep only paragraphs that still carry content after cleaning
    return [chunk for chunk in cleaned_chunks if chunk not in ['', ' ', None]]

Ejemplo n.º 3

0

Mostrar archivo

Archivo: judgment_extracting_functions.py Proyecto: JoelNiklaus/SwissCourtRulingCorpus

def SZ_Gerichte(rulings: str, namespace: dict) -> Optional[List[Judgment]]:
    """
    Determine the judgment outcomes expressed in the rulings of a decision
    :param rulings:     the rulings text of the decision
    :param namespace:   metadata of the court decision ('language' and 'html_url' are read)
    :return:            the `.value` of every judgment that remains after discard_judgment
    :raises ValueError: if the language has no judgment markers or no judgment is found
    """
    language = namespace['language']
    if language not in all_judgment_markers:
        supported_languages = list(all_judgment_markers.keys())
        raise ValueError(f"This function is only implemented for the languages {supported_languages} so far.")

    # normalise the text up front so unicode oddities cannot break the matching
    cleaned_rulings = clean_text(rulings)
    found = get_judgments(cleaned_rulings, namespace)

    if not found:
        raise ValueError(
            f"Found no judgment for the rulings \"{cleaned_rulings}\" in the case {namespace['html_url']}. Please check!"
        )

    # several matches: let discard_judgment decide which ones to keep
    if len(found) > 1:
        found = discard_judgment(found)

    return [outcome.value for outcome in found]

Ejemplo n.º 4

0

Mostrar archivo

def CH_BGer(header: str, namespace: dict) -> Optional[dict]:
    """
    Extract lower court information from the header of a decision of the Federal Supreme Court of Switzerland
    :param header:      the string containing the header
    :param namespace:   the namespace containing some metadata of the court decision
                        (passed through to the helpers, currently not read)
    :return:            a dict holding whatever could be extracted: the raw regex matches
                        ('court_string', 'canton', 'date', 'chamber_string', 'file_number')
                        plus 'court' and 'chamber' when they could be resolved against
                        legal_info/court_chambers.json; None if nothing was found or the
                        resolution failed
    """
    # Only the part of the header starting at one of these markers is searched
    information_start_regex = r'Vorinstanz|Beschwerden?\sgegen|gegen\sden\s(Entscheid|Beschluss)|gegen\sdas\sUrteil|Gegenstand|Instance précédente|recours|révision de|ricorso|ricorrente|rettifica'

    # Per category a list of alternatives; they are joined with '|' before searching.
    # A group named 'high_prio' marks a match that wins over all other alternatives.
    information_regex = {
        'court_string': [
            r'(\w*gericht(?=s?[^\w]))',
            r'(?P<high_prio>Tribunal .*?(?=[,\.]| du| de la République et canton))',
            r'(?<![Rr]e)[Cc]our .*?(?=[,\.]| du| de la République et canton)',
            r'Tribunale .*?(?=[,\.]| del Cantone)'
        ],
        'canton': [
            r'((?<=des\s(?:Kantons\s))|((?<=des\s(?:Kantonsgerichts\s))))(Appenzell Innerrhoden|Appenzell Rhodes-Intérieures|Appenzello Interno)',
            r'((?<=des\s(?:Kantons\s))|((?<=des\s(?:Kantonsgerichts\s))))(Appenzell Ausserrhoden|Appenzell Rhodes-Extérieures|Appenzello Esterno)',
            r'((?<=des\s(?:Kantons\s))|((?<=des\s(?:Kantonsgerichts\s))))Basel-Land',
            r'((?<=des\s(?:Kantons\s))|((?<=des\s(?:Kantonsgerichts\s))))(St(\.)?\s?Gallen|San Gallo)',
            r'((?<=des\s(?:Kantons\s))|((?<=des\s(?:Kantonsgerichts\svon\s))))[\wäöü-]*',
            r'((?<=des\s(?:Kantons\s))|((?<=des\s(?:Kantonsgerichts\s))))[\wäöü-]*',
            r'(?<=canton d[eu] )Bâle-(Ville|Campagne)',
            r'(?<=canton d[eu] )[\wéè]*', r'(?<=de l\'Etat de )[\wéè]*',
            r'((?<=del Cantone )|(?<=del Cantone di )|(?<=del Cantone dei ))(San Gallo)',
            r'((?<=del Cantone )|(?<=del Cantone di )|(?<=del Cantone dei ))(Appenzello (Interno|Esterno))',
            r'((?<=del Cantone )|(?<=del Cantone di )|(?<=del Cantone dei ))(Basilea (Città|Campagna))',
            r'(?<=del Cantone dei )[\wéè]*', r'(?<=del Cantone di )[\wéè]*',
            r'(?<=del Cantone del )[\wéè]*', r'(?<=del Cantone )[\wéè]*'
        ],
        'date': [
            r'(?P<DATE>(?P<DAY>\d?\d|1(re|er)|2e|3e|premier|première|deuxième|troisième|1°)\.?\s(?P<MONTH>\w{2,12})\s(?P<YEAR>\d{4}))'
        ],
        'chamber_string': [
            r'[IVX\d]+.\s\w*ammer', r'\w*ammer', r'[IVX\d]+.\s\w*our',
            r'(?P<high_prio>[Cc]hambre.*?(?=[,\.]| du| de la [Cc]our))',
            r'(?<![Rr]e)[Cc]our.*?(?=[,\.]| du| de la [Cc]our)',
            r'[Cc]orte.*?(?=[,\.]| del Tribunale| del Cantone)',
            r'[Cc]amera.*?(?=[,\.]| del Tribunale| del Cantone)',
            r'Abteilung\s[\dIVX]+', r'[IVX\d]+.\s(\w+\s)?Abteilung'
        ],
        'file_number': [
            r'(?P<ID>[A-Z0-9]{2,6})[\.\s\-]?(?P<YEAR>\d{2,4})[\.\s\-]?(?P<NUMBER>[\dA-Z\-]{2,8})(?=\))',
            # ex: AB12.2021.13
            r'[A-Z0-9]{1,4}([\.\-_\/\s])\d{1,8}[\.\/\-]?(\d{4}|[A-Z\/]+(\d+)?)',  # ex: AB-12/2021
            r'[A-Z0-9]{1,3}(\s|\.)?((([\d]{3,6})|\/)\s??){2,6}(-[A-Z])?'  # ex: 720 16 328 / 176
        ]
    }

    def load_court_chambers_data() -> dict:
        """ Read the canton / court / chamber lookup table from disk """
        return json.loads(Path("legal_info/court_chambers.json").read_text())

    def prepareCantonForQuery(canton: str, court_chambers_data) -> Optional[str]:
        """ Try to match the canton as text with its corresponding abbreviation """
        for canton_short, current_canton in court_chambers_data.items():
            if current_canton['de'] == canton or \
                    current_canton['fr'] == canton or \
                    current_canton['it'] == canton:
                return canton_short  # If the canton is found, return the corresponding abbreviation
        # No abbreviation matched: print the unmatched name and report failure
        print(canton)
        return None

    def prepareCourtForQuery(court: str, canton: str,
                             court_chambers_data) -> Optional[str]:
        """ Try to match the court as text with its corresponding abbreviation """
        courts = court_chambers_data[canton]['gerichte']
        for current_court_short, current_court in courts.items():
            if current_court['de'] == court or \
                    current_court['fr'] == court or \
                    current_court['it'] == court:
                return current_court_short  # If the court is found, return the corresponding abbreviation
        return None

    def prepareChamberForQuery(chamber: str, court: str, canton: str,
                               court_chambers_data) -> Optional[str]:
        """ Try to match the chamber as text with its corresponding abbreviation """
        courts = court_chambers_data[canton]['gerichte']
        if court not in courts:
            # The court is not found in the list, so no chance of finding the chamber
            return chamber
        # Second attempt uses the chamber name with a leading number (roman or arabic) removed
        chamber_without_number = re.sub(r'[IV0-9]*.\s', '', chamber)
        for current_short, labels in courts[court]['kammern'].items():
            # Entries lacking one of the three languages are skipped
            if not {'de', 'fr', 'it'} <= labels.keys():
                continue
            translations = (labels['de'], labels['fr'], labels['it'])
            if any(chamber in label for label in translations) or \
                    any(chamber_without_number in label for label in translations):
                return current_short
        return None

    def prepareDateForQuery(date: str) -> str:
        """ Turn a German / French / Italian date string into 'YYYY-MM-DD' """
        # Replace month names and ordinal day forms with tokens the date parser understands
        translation_dict = {
            "Januar": "Jan",
            "Februar": "Feb",
            "März": "Mar",
            "Mai": "May",
            "Juni": "June",
            "Juli": "July",
            "Oktober": "Oct",
            "Dezember": "Dec",
            "Janvier": "Jan",
            "Février": "Feb",
            "Mars": "Mar",
            "Avril": "April",
            "Juin": "june",
            "Juillet": "July",
            "Août": "Aug",
            "Septembre": "Sept",
            "Octobre": "Oct",
            "Novembre": "Nov",
            "Décembre": "Dec",
            "Gennaio": "Jan",
            "Febbraio": "Feb",
            "Marzo": "Mar",
            "Aprile": "Apr",
            "Maggio": "May",
            "Giugno": "June",
            "Luglio": "July",
            "Agosto": "Aug",
            "Settembre": "Sept",
            "Ottobre": "Oct",
            "Dicembre": "Dec",
            "1er": "01",
            "1re": "01",
            "2e": "02",
            "3e": "03",
            "premier": "01",
            "première": "01",
            "deuxième": "02",
            "troisième": "03",
            "1°": "01"
        }

        for k, v in translation_dict.items():
            date = date.replace(k, v)
            date = date.replace(k.lower(), v)

        # An unparsable date raises here; the caller treats any failure as "no result"
        return pd.to_datetime(date, dayfirst=True).strftime('%Y-%m-%d')

    def get_lower_court_by_date_and_court(
            lower_court_information) -> Optional[dict]:
        """ Resolve the raw matches (canton, court, chamber, date) in place and return the dict """
        court_chambers_data = None

        if 'canton' in lower_court_information:
            court_chambers_data = load_court_chambers_data()
            # There was information of the canton in the text, tries to match the canton with its abbreviation
            lower_court_information['canton'] = prepareCantonForQuery(
                lower_court_information['canton'], court_chambers_data)
            # If the canton is found, tries to match the court with its abbreviation
            if 'court_string' in lower_court_information and \
                    lower_court_information['canton'] is not None:
                lower_court_information['court'] = prepareCourtForQuery(
                    lower_court_information['court_string'],
                    lower_court_information['canton'], court_chambers_data)
        elif 'court_string' in lower_court_information:
            # No canton information was found in the text, tries to match the court for the federal level
            court_chambers_data = load_court_chambers_data()
            federal_court = prepareCourtForQuery(
                lower_court_information['court_string'], 'CH',
                court_chambers_data)
            lower_court_information['court'] = federal_court
            # An unresolved court (None) must not abort the extraction of the remaining fields
            if federal_court is not None and re.match(r'CH_', federal_court):
                lower_court_information['canton'] = 'CH'

        needed_for_chamber = ('chamber_string', 'court', 'canton')
        if all(lower_court_information.get(key) is not None
               for key in needed_for_chamber):
            # Try to find the chamber if canton, chamber_string and court are all present.
            # 'canton' can only be set after one of the branches above loaded the data.
            lower_court_information['chamber'] = prepareChamberForQuery(
                lower_court_information['chamber_string'],
                lower_court_information['court'],
                lower_court_information['canton'],
                court_chambers_data)

        if 'date' in lower_court_information:
            # Include the normalised date in the returned information
            lower_court_information['date'] = prepareDateForQuery(
                lower_court_information['date'])

        return lower_court_information

    def get_court_information(header, namespace):
        """ Run every regex category over the relevant part of the header and collect the matches """
        result = {}
        start_pos = re.search(information_start_regex, header) or re.search(
            r', gegen|Beschwerdeführer', header)
        if start_pos:
            header = header[start_pos.start():]

        for information_key, alternatives in information_regex.items():
            regex = '|'.join(alternatives)
            # Walk over all matches: stop at the first 'high_prio' match,
            # otherwise end up with the last occurrence
            regex_result = None
            for regex_result in re.finditer(regex, header):
                if regex_result.groupdict().get('high_prio') is not None:
                    break
            if regex_result is None:
                continue
            high_prio = regex_result.groupdict().get('high_prio')
            if high_prio is not None:
                result[information_key] = high_prio
            else:
                result[information_key] = regex_result.group()

        return result

    # make sure we don't have any nasty unicode problems
    header = clean_text(header)
    # Bring spelling variants of canton names into a single form before matching
    for variant, replacement in (
            ('Appenzell I.Rh.', 'Appenzell Innerrhoden'),
            ('Appenzell A.Rh.', 'Appenzell Ausserrhoden'),
            ('Appenzell I. Rh.', 'Appenzell Innerrhoden'),
            ('Appenzell A. Rh.', 'Appenzell Ausserrhoden'),
    ):
        header = header.replace(variant, replacement)
    # The lookahead keeps an already complete 'Waadtland' from becoming 'Waadtlandland'
    header = re.sub(r'Waadt(?!land)', 'Waadtland', header)
    header = header.replace('Basilea-Città', 'Basilea Città')
    header = header.replace('St. Gallen', 'St.Gallen')

    # NOTE: a lookup of the lower court via its file number is not active;
    # only the date / court based resolution below is used.
    lower_court_information = get_court_information(header, namespace)
    try:
        lower_court = get_lower_court_by_date_and_court(
            lower_court_information)
    except Exception:
        # Best effort: any failure while resolving (missing lookup file, unknown keys,
        # unparsable date, ...) means no lower court information for this decision
        return None
    return lower_court or None

Ejemplo n.º 5

0

Mostrar archivo

Archivo: cleaner.py Proyecto: JoelNiklaus/SwissCourtRulingCorpus

 def clean_html(self, spider: str, soup: bs4.BeautifulSoup,
                namespace: dict) -> str:
     """Clean the html of a decision: court specific cleaning first, then the general clean_text pass"""
     # court specific step first, its result is fed straight into the general cleaner
     return clean_text(self.clean_with_functions(spider, soup, namespace))

Ejemplo n.º 6

0

Mostrar archivo

Archivo: cleaner.py Proyecto: JoelNiklaus/SwissCourtRulingCorpus

 def clean_pdf(self, spider: str, text: str, namespace: dict) -> str:
     """Clean the text of a pdf decision: court specific regexes first, then the general clean_text pass"""
     # court specific step first, its result is fed straight into the general cleaner
     return clean_text(self.clean_with_regexes(spider, text, namespace))