コード例 #1
0
    def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
        if not AbstractPreprocessor._check_write_privilege(engine):
            AbstractPreprocessor.create_dir(self.output_dir, os.getlogin())
            path = Path.joinpath(self.output_dir, os.getlogin(),
                                 datetime.now().isoformat() + '.json')

            with path.open("a") as f:
                df.to_json(f)
            return
        for idx, row in df.iterrows():
            with engine.connect() as conn:
                t = Table('judgment_map', MetaData(), autoload_with=engine)
                if row['judgments']:  # only insert, when we find judgments
                    # Delete and reinsert as no upsert command is available
                    stmt = t.delete().where(delete_stmt_decisions_with_df(df))
                    conn.execute(stmt)
                    for k in row['judgments']:
                        judgment_type_id = Judgment(k).value
                        stmt = t.insert().values([{
                            "decision_id":
                            str(row['decision_id']),
                            "judgment_id":
                            judgment_type_id
                        }])
                        conn.execute(stmt)
                else:
                    self.logger.warning(
                        f"No judgments found for {row['html_url']}")
コード例 #2
0
    def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
        with engine.connect() as conn:
            t = Table('lower_court', MetaData(), autoload_with=conn)
            # Delete and reinsert as no upsert command is available
            stmt = t.delete().where(delete_stmt_decisions_with_df(df))
            conn.execute(stmt)
            for _, row in df.iterrows():
                if not 'lower_court' in row or row['lower_court'] is None:
                    continue
                lower_court = row["lower_court"]
                res = {}

                if 'court' in lower_court and lower_court['court'] is not None:
                    res['court_id'] = list(
                        self.select(
                            engine, 'court', 'court_id',
                            f"court_string = '{lower_court['court']}'")
                    )[0]['court_id'][0]
                    res['court_id'] = int(
                        res['court_id']
                    ) if res['court_id'] is not None else None
                if 'canton' in lower_court and lower_court[
                        'canton'] is not None:
                    res['canton_id'] = list(
                        self.select(engine, 'canton', 'canton_id',
                                    f"short_code = '{lower_court['canton']}'")
                    )[0]['canton_id'][0]
                    res['canton_id'] = int(
                        res['canton_id']
                    ) if res['canton_id'] is not None else None
                if 'chamber' in lower_court and lower_court[
                        'chamber'] is not None:
                    res['chamber_id'] = list(
                        self.select(
                            engine, 'chamber', 'chamber_id',
                            f"chamber_string = '{lower_court['chamber']}'")
                    )[0]['chamber_id'][0]
                    res['chamber_id'] = int(
                        res['chamber_id']
                    ) if res['chamber_id'] is not None else None

                    stmt = t.insert().values([{
                        "decision_id":
                        str(row['decision_id']),
                        "court_id":
                        res.get('court_id'),
                        "canton_id":
                        res.get('canton_id'),
                        "chamber_id":
                        res.get('chamber_id'),
                        "date":
                        lower_court.get('date'),
                        "file_number":
                        lower_court.get('file_number')
                    }])
                    conn.execute(stmt)
コード例 #3
0
 def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
     for idx, row in df.iterrows():
         with engine.connect() as conn:
             t = Table('citation', MetaData(), autoload_with=engine)
             # Delete and reinsert as no upsert command is available
             stmt = t.delete().where(delete_stmt_decisions_with_df(df))
             engine.execute(stmt)
             for k in row['citations'].keys():
                 citation_type_id = CitationType(k).value
                 for citation in row['citations'][k]:
                     stmt = t.insert().values([{
                         "decision_id":
                         str(row['decision_id']),
                         "citation_type_id":
                         citation_type_id,
                         "url":
                         citation.get("url"),
                         "text":
                         citation["text"]
                     }])
                     engine.execute(stmt)
コード例 #4
0
 def save_the_file_numbers(series: pd.DataFrame) -> pd.DataFrame:
     """
     Saves the file_number for each of the decision ids
     :param series:
     :return:
     """
     query = f"SELECT decision_id FROM decision WHERE file_id = '{series['file_id']}'"
     series['decision_id'] = pd.read_sql(query,
                                         engine.connect())["decision_id"][0]
     with engine.connect() as conn:
         t = Table('file_number', MetaData(), autoload_with=engine)
         # Delete and reinsert as no upsert command is available
         stmt = t.delete().where(delete_stmt_decisions_with_df(series))
         conn.execute(stmt)
     series['text'] = series['file_number'].strip(
     )  # .map(lambda x: x.strip())
     save_to_db(series[['decision_id', 'text']], 'file_number')
     if ('file_number_additional' in series
             and series['file_number_additional'] is not None
             and len(series['file_number_additional']) > 0):
         series['text'] = series['file_number_additional'].strip(
         )  # .map(lambda x: x.strip())
         save_to_db(series[['decision_id', 'text']], 'file_number')
     return series
コード例 #5
0
    def save_data_to_database(self, df: pd.DataFrame, engine: Engine):

        if not AbstractPreprocessor._check_write_privilege(engine):
            AbstractPreprocessor.create_dir(self.output_dir, os.getlogin())
            path = Path.joinpath(self.output_dir, os.getlogin(),
                                 datetime.now().isoformat() + '.json')

            with path.open("a") as f:
                df.to_json(f)
            return

        df = df.loc[df['language'] != '--']
        if df.empty:
            return

        df = self.run_tokenizer(df)

        with self.get_engine(self.db_scrc).connect() as conn:
            # Load the different tables
            t = Table('section', MetaData(), autoload_with=engine)
            t_paragraph = Table('paragraph', MetaData(), autoload_with=engine)
            t_num_tokens = Table('num_tokens',
                                 MetaData(),
                                 autoload_with=engine)

            # Delete and reinsert as no upsert command is available. This pattern is used multiple times in this method
            if not isinstance(df, pd.DataFrame) or df.empty:
                # empty dfs are given as dicts, so no need to save
                return
            stmt = t.delete().where(delete_stmt_decisions_with_df(df))
            conn.execute(stmt)
            stmt = t_paragraph.delete().where(
                delete_stmt_decisions_with_df(df))
            conn.execute(stmt)

            for idx, row in df.iterrows():
                if idx % 50 == 0:
                    self.logger.info(f'Saving decision {idx + 1} from chunk')
                if row['sections'] is None or row['sections'].keys is None:
                    continue

                for k in row['sections'].keys():
                    decision_id_str = str(row['decision_id'])
                    if decision_id_str == '':
                        continue
                    section_type_id = k.value
                    # insert section
                    section_dict = {
                        "decision_id": decision_id_str,
                        "section_type_id": section_type_id,
                        "section_text": row['sections'][k]
                    }
                    stmt = t.insert().returning(text("section_id")).values(
                        [section_dict])
                    section_id = conn.execute(stmt).fetchone()['section_id']

                    # Add num tokens
                    tokens_per_section = {
                        'section_id': str(section_id),
                        'num_tokens_spacy': row[k.name + '_spacy'],
                        'num_tokens_bert': row[k.name + '_bert']
                    }

                    stmt = t_num_tokens.insert().values([tokens_per_section])
                    conn.execute(stmt)

                    # Add all paragraphs
                    paragraph_dicts = []
                    for paragraph in row['sections'][k]:
                        paragraph = paragraph.strip()
                        if len(paragraph) == 0: continue
                        paragraph_dict = {
                            'section_id': str(section_id),
                            "decision_id": decision_id_str,
                            'paragraph_text': paragraph,
                            'first_level': None,
                            'second_level': None,
                            'third_level': None
                        }
                        paragraph_dicts.append(paragraph_dict)
                    if len(paragraph_dicts) > 0:
                        stmt = t_paragraph.insert().values(paragraph_dicts)
                        conn.execute(stmt)
コード例 #6
0
def save_from_text_to_database(engine: Engine, df: pd.DataFrame):
    """ Saving these fields
            Column('language', String),
            Column('chamber', String),
            Column('date', Date),
            Column('file_name', String),
            Column('file_number', String),
            Column('file_number_additional', String),
            Column('html_url', String),
            Column('html_raw', String),
            Column('pdf_url', String),
            Column('pdf_raw', String),
    """
    def save_to_db(df: pd.DataFrame, table: str):
        # If the returned df is not a DataFrame but a Series, then convert it into a dataframe and Transpose it to correct the variable. (Not needed for most courts, but edge case needs it)
        if not isinstance(df, pd.DataFrame):
            df = df.to_frame()
            df = df.T
        df.to_sql(table, engine, if_exists="append", index=False)

    def add_ids_to_df_for_decision(series: pd.DataFrame) -> pd.DataFrame:
        query = f"SELECT file_id FROM file WHERE file_name = '{series['file_name']}'"
        series['file_id'] = pd.read_sql(query, engine.connect())["file_id"][0]
        series['language_id'] = -1
        query = f"SELECT chamber_id FROM chamber WHERE chamber_string = '{series['chamber']}'"
        chamber_id = pd.read_sql(query, engine.connect())['chamber_id']
        if len(chamber_id) == 0:
            print(
                f"The chamber {series['chamber']} was not found in the database. "
                f"Add it with the respective court and spider")
            raise ValueError
        else:
            series['chamber_id'] = chamber_id[0]

        series['decision_id'] = uuid.uuid5(uuid.UUID(int=0),
                                           series['file_name'])
        # TODO: Add topic recognition, similar to the title of the court decision
        series['topic'] = ''
        return series

    def save_the_file_numbers(series: pd.DataFrame) -> pd.DataFrame:
        """
        Saves the file_number for each of the decision ids
        :param series:
        :return:
        """
        query = f"SELECT decision_id FROM decision WHERE file_id = '{series['file_id']}'"
        series['decision_id'] = pd.read_sql(query,
                                            engine.connect())["decision_id"][0]
        with engine.connect() as conn:
            t = Table('file_number', MetaData(), autoload_with=engine)
            # Delete and reinsert as no upsert command is available
            stmt = t.delete().where(delete_stmt_decisions_with_df(series))
            conn.execute(stmt)
        series['text'] = series['file_number'].strip(
        )  # .map(lambda x: x.strip())
        save_to_db(series[['decision_id', 'text']], 'file_number')
        if ('file_number_additional' in series
                and series['file_number_additional'] is not None
                and len(series['file_number_additional']) > 0):
            series['text'] = series['file_number_additional'].strip(
            )  # .map(lambda x: x.strip())
            save_to_db(series[['decision_id', 'text']], 'file_number')
        return series

    if df.empty:
        return

    # Delete old decision and file entries
    with engine.connect() as conn:
        t_fil = Table('file', MetaData(), autoload_with=engine)
        t_dec = Table('decision', MetaData(), autoload_with=engine)
        file_name_list = ','.join(
            ["'" + str(item) + "'" for item in df['file_name'].tolist()])
        stmt = t_fil.select().where(text(f"file_name in ({file_name_list})"))
        file_ids = [item['file_id'] for item in conn.execute(stmt).all()]
        if len(file_ids) > 0:
            file_ids_list = ','.join(
                ["'" + str(item) + "'" for item in file_ids])
            # decision_ids = [item['decision_id'] for item in conn.execute(t_dec.select().where(text(f"file_id in ({file_ids_list})"))).all()]

            stmt = t_dec.delete().where(text(f"file_id in ({file_ids_list})"))
            conn.execute(stmt)
            stmt = t_fil.delete().where(text(f"file_id in ({file_ids_list})"))
            conn.execute(stmt)

    save_to_db(df[['file_name', 'html_url', 'pdf_url', 'html_raw', 'pdf_raw']],
               'file')

    df = df.apply(add_ids_to_df_for_decision, 1)

    df = df.replace(
        {np.NaN: None}
    )  # Convert pandas NaT values (Non-Type for Datetime) to None using np as np recognizes these types
    df['date'] = df['date'].replace(r'^\s*$', None, regex=True)
    df['date'] = df['date'].astype('datetime64[ns]')
    save_to_db(df[['language_id', 'chamber_id', 'file_id', 'date', 'topic']],
               'decision')
    df.apply(save_the_file_numbers, 1)
    def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
        """
            'plaintiff': 1,
            'defendant': 2,
            'representation_plaintiff': 3,
            'representation_defendant': 4
        """
        for idx, row in df.iterrows():
            with engine.connect() as conn:
                t_person = Table('person', MetaData(), autoload_with=conn)
                t_party = Table('party', MetaData(), autoload_with=conn)

                # Delete person
                # Delete and reinsert as no upsert command is available
                stmt = t_party.delete().where(delete_stmt_decisions_with_df(df))
                conn.execute(stmt)

                parties = json.loads(row['parties'])

                # create all plaintiffs
                offset = 0
                for plaintiff in parties.get('plaintiffs'):
                    # Insert person into the person table
                    plaintiff_dict = {"name": plaintiff['name'].strip(),
                                      "is_natural_person": plaintiff['legal_type'] == 'natural person',
                                      "gender": plaintiff['gender']}
                    stmt = t_person.insert().returning(column("person_id")).values([plaintiff_dict])
                    person_id = conn.execute(stmt).fetchone()['person_id']
                    # Then insert the person into the party table
                    stmt = t_party.insert().values(
                        [{"decision_id": str(row['decision_id']), "person_id": person_id, "party_type_id": 1 + offset}])
                    conn.execute(stmt)
                    for representant in plaintiff.get('legal_counsel'):
                        # Insert their representation into the person and party tables
                        representant_dict = {"name": representant['name'].strip(),
                                             "is_natural_person": representant['legal_type'] == 'natural person',
                                             "gender": representant['gender']}
                        stmt = t_person.insert().returning(text("person_id")).values([representant_dict])
                        person_id = conn.execute(stmt).fetchone()['person_id']
                        stmt = t_party.insert().values([{"decision_id": str(row['decision_id']), "person_id": person_id,
                                                         "party_type_id": 3 + offset}])
                        conn.execute(stmt)

                # create all defendants
                offset = 1
                for defendant in parties.get('defendants'):
                    # Insert person into the person table
                    defendant_dict = {"name": plaintiff['name'].strip(),
                                      "is_natural_person": plaintiff['legal_type'] == 'natural person',
                                      "gender": plaintiff['gender']}
                    stmt = t_person.insert().returning(text("person_id")).values([defendant_dict])
                    person_id = conn.execute(stmt).fetchone()['person_id']
                    # Then insert the person into the party table
                    stmt = t_party.insert().values(
                        [{"decision_id": str(row['decision_id']), "person_id": person_id, "party_type_id": 1 + offset}])
                    conn.execute(stmt)
                    for representant in defendant.get('legal_counsel'):
                        # Insert their representation into the person and party tables
                        representant_dict = {"name": representant['name'].strip(),
                                             "is_natural_person": representant['legal_type'] == 'natural person',
                                             "gender": representant['gender']}
                        stmt = t_person.insert().returning(text("person_id")).values([representant_dict])
                        person_id = conn.execute(stmt).fetchone()['person_id']
                        stmt = t_party.insert().values([{"decision_id": str(row['decision_id']), "person_id": person_id,
                                                         "party_type_id": 3 + offset}])
                        conn.execute(stmt)

        with engine.connect() as conn:
            stmt = t_person.delete().where(text(
                f"NOT EXISTS (SELECT FROM judicial_person jp WHERE jp.person_id = person.person_id UNION SELECT FROM party WHERE party.person_id = person.person_id)"))
            conn.execute(stmt)
コード例 #8
0
    def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
        with engine.connect() as conn:
            t_person = Table('person', MetaData(), autoload_with=conn)
            t_jud_person = Table('judicial_person',
                                 MetaData(),
                                 autoload_with=conn)

            # Delete and reinsert as no upsert command is available
            stmt = t_jud_person.delete().where(
                delete_stmt_decisions_with_df(df))
            conn.execute(stmt)

            for _, row in df.iterrows():
                if not 'court_composition' in row or row[
                        'court_composition'] is None:
                    continue
                court_composition: CourtComposition = row['court_composition']
                president = court_composition.president
                if president:
                    # create president person
                    president_dict = {
                        "name":
                        president.name.strip(),
                        "is_natural_person":
                        True,
                        "gender":
                        president.gender.value[0] if president.gender else None
                    }
                    stmt = t_person.insert().returning(
                        text("person_id")).values([president_dict])
                    person_id = conn.execute(stmt).fetchone()['person_id']
                    person_dict = {
                        "decision_id": str(row['decision_id']),
                        "person_id": person_id,
                        "judicial_person_type_id": 1,
                        "is_president": True
                    }
                    stmt = t_jud_person.insert().values([person_dict])
                    conn.execute(stmt)

                # create all judges
                for judge in court_composition.judges:
                    if judge == president:  # President is already created above
                        continue
                    judge_dict = {
                        "name": judge.name.strip(),
                        "is_natural_person": True,
                        "gender":
                        judge.gender.value[0] if judge.gender else None
                    }
                    stmt = t_person.insert().returning(
                        text("person_id")).values([judge_dict])
                    person_id = conn.execute(stmt).fetchone()['person_id']
                    person_dict = {
                        "decision_id": str(row['decision_id']),
                        "person_id": person_id,
                        "judicial_person_type_id": 1,
                        "is_president": False
                    }
                    stmt = t_jud_person.insert().values([person_dict])
                    conn.execute(stmt)

                # create all clerks
                for clerk in court_composition.clerks:
                    clerk_dict = {
                        "name": clerk.name.strip(),
                        "is_natural_person": True,
                        "gender":
                        clerk.gender.value[0] if clerk.gender else None
                    }
                    stmt = t_person.insert().returning(
                        text("person_id")).values([clerk_dict])
                    person_id = conn.execute(stmt).fetchone()['person_id']
                    person_dict = {
                        "decision_id": str(row['decision_id']),
                        "person_id": person_id,
                        "judicial_person_type_id": 2,
                        "is_president": False
                    }
                    stmt = t_jud_person.insert().values([person_dict])
                    conn.execute(stmt)

            where = f"WHERE NOT EXISTS (SELECT 1 FROM judicial_person b WHERE a.person_id = b.person_id) " \
                    f"AND NOT EXISTS (SELECT 1 FROM party c WHERE a.person_id = c.person_id)"
            stmt = t_person.delete().where(text(where))