def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
    """Persist the judgment outcomes of each decision into ``judgment_map``.

    Falls back to appending the dataframe as JSON in the user's output
    directory when the connected role lacks write privilege.

    :param df: decisions to save; rows must carry ``decision_id``,
        ``judgments`` and ``html_url``
    :param engine: SQLAlchemy engine connected to the target database
    """
    if not AbstractPreprocessor._check_write_privilege(engine):
        # No write access: archive the chunk locally as JSON instead.
        AbstractPreprocessor.create_dir(self.output_dir, os.getlogin())
        path = Path.joinpath(self.output_dir, os.getlogin(),
                             datetime.now().isoformat() + '.json')
        with path.open("a") as f:
            df.to_json(f)
        return
    with engine.connect() as conn:
        t = Table('judgment_map', MetaData(), autoload_with=engine)
        # Delete and reinsert as no upsert command is available.
        # BUGFIX: the delete (which covers every decision in df) used to run
        # inside the per-row loop, wiping the judgments already inserted for
        # previous rows. It now runs exactly once, before any insert. The
        # connection is also opened once instead of once per row.
        stmt = t.delete().where(delete_stmt_decisions_with_df(df))
        conn.execute(stmt)
        for _, row in df.iterrows():
            if not row['judgments']:  # only insert, when we find judgments
                self.logger.warning(
                    f"No judgments found for {row['html_url']}")
                continue
            for k in row['judgments']:
                judgment_type_id = Judgment(k).value
                stmt = t.insert().values([{
                    "decision_id": str(row['decision_id']),
                    "judgment_id": judgment_type_id
                }])
                conn.execute(stmt)
def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
    """Persist the lower-court information of each decision into ``lower_court``.

    :param df: decisions to save; rows may carry a ``lower_court`` dict with
        optional ``court``, ``canton``, ``chamber``, ``date`` and
        ``file_number`` entries
    :param engine: SQLAlchemy engine connected to the target database
    """

    def _lookup_id(table: str, id_column: str, where: str):
        """Resolve a foreign key via ``self.select``; None when the id is NULL.

        Replaces three verbatim copies of the same lookup expression.
        """
        value = list(
            self.select(engine, table, id_column, where))[0][id_column][0]
        return int(value) if value is not None else None

    with engine.connect() as conn:
        t = Table('lower_court', MetaData(), autoload_with=conn)
        # Delete and reinsert as no upsert command is available
        stmt = t.delete().where(delete_stmt_decisions_with_df(df))
        conn.execute(stmt)
        for _, row in df.iterrows():
            if 'lower_court' not in row or row['lower_court'] is None:
                continue
            lower_court = row["lower_court"]
            res = {}
            # NOTE(review): the WHERE clauses interpolate scraped values
            # directly into SQL; a value containing a quote breaks the query.
            # Consider parameterized queries in self.select.
            if lower_court.get('court') is not None:
                res['court_id'] = _lookup_id(
                    'court', 'court_id',
                    f"court_string = '{lower_court['court']}'")
            if lower_court.get('canton') is not None:
                res['canton_id'] = _lookup_id(
                    'canton', 'canton_id',
                    f"short_code = '{lower_court['canton']}'")
            if lower_court.get('chamber') is not None:
                res['chamber_id'] = _lookup_id(
                    'chamber', 'chamber_id',
                    f"chamber_string = '{lower_court['chamber']}'")
            stmt = t.insert().values([{
                "decision_id": str(row['decision_id']),
                "court_id": res.get('court_id'),
                "canton_id": res.get('canton_id'),
                "chamber_id": res.get('chamber_id'),
                "date": lower_court.get('date'),
                "file_number": lower_court.get('file_number')
            }])
            conn.execute(stmt)
def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
    """Persist the citations of each decision into ``citation``.

    :param df: decisions to save; rows must carry ``decision_id`` and a
        ``citations`` dict mapping citation-type keys to lists of citations
    :param engine: SQLAlchemy engine connected to the target database
    """
    with engine.connect() as conn:
        t = Table('citation', MetaData(), autoload_with=engine)
        # Delete and reinsert as no upsert command is available.
        # BUGFIX: the delete (covering every decision in df) used to run once
        # per row inside the loop, removing citations already inserted for
        # previous rows; it now runs once. Statements now execute on the open
        # connection instead of Engine.execute (removed in SQLAlchemy 2.0).
        stmt = t.delete().where(delete_stmt_decisions_with_df(df))
        conn.execute(stmt)
        for _, row in df.iterrows():
            for citation_key, citations in row['citations'].items():
                citation_type_id = CitationType(citation_key).value
                for citation in citations:
                    stmt = t.insert().values([{
                        "decision_id": str(row['decision_id']),
                        "citation_type_id": citation_type_id,
                        "url": citation.get("url"),
                        "text": citation["text"]
                    }])
                    conn.execute(stmt)
def save_the_file_numbers(series: pd.DataFrame) -> pd.DataFrame:
    """ Saves the file_number for each of the decision ids
    :param series:
    :return:
    """
    # Resolve the decision_id belonging to this row's file_id.
    lookup = f"SELECT decision_id FROM decision WHERE file_id = '{series['file_id']}'"
    series['decision_id'] = pd.read_sql(lookup, engine.connect())["decision_id"][0]
    # Delete and reinsert as no upsert command is available
    with engine.connect() as conn:
        file_number_table = Table('file_number', MetaData(), autoload_with=engine)
        conn.execute(
            file_number_table.delete().where(
                delete_stmt_decisions_with_df(series)))
    series['text'] = series['file_number'].strip()  # .map(lambda x: x.strip())
    save_to_db(series[['decision_id', 'text']], 'file_number')
    # A secondary file number is optional; store it under the same decision.
    additional = series['file_number_additional'] if 'file_number_additional' in series else None
    if additional is not None and len(additional) > 0:
        series['text'] = additional.strip()  # .map(lambda x: x.strip())
        save_to_db(series[['decision_id', 'text']], 'file_number')
    return series
def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
    """Persist section texts, their paragraphs and their token counts.

    Writes to the ``section``, ``paragraph`` and ``num_tokens`` tables.
    Falls back to a local JSON dump when the connected role lacks write
    privilege.

    :param df: decisions to save; rows need ``decision_id``, ``language`` and
        a ``sections`` mapping of section-type enum -> list of paragraphs
    :param engine: SQLAlchemy engine connected to the target database
    """
    if not AbstractPreprocessor._check_write_privilege(engine):
        AbstractPreprocessor.create_dir(self.output_dir, os.getlogin())
        path = Path.joinpath(self.output_dir, os.getlogin(),
                             datetime.now().isoformat() + '.json')
        with path.open("a") as f:
            df.to_json(f)
        return
    df = df.loc[df['language'] != '--']
    if df.empty:
        return
    df = self.run_tokenizer(df)
    # BUGFIX: this guard used to sit after the connection was opened and the
    # tables reflected; checking before connecting avoids the wasted work.
    if not isinstance(df, pd.DataFrame) or df.empty:
        # empty dfs are given as dicts, so no need to save
        return
    with self.get_engine(self.db_scrc).connect() as conn:
        # Load the different tables
        t = Table('section', MetaData(), autoload_with=engine)
        t_paragraph = Table('paragraph', MetaData(), autoload_with=engine)
        t_num_tokens = Table('num_tokens', MetaData(), autoload_with=engine)
        # Delete and reinsert as no upsert command is available.
        # This pattern is used multiple times in this method.
        stmt = t.delete().where(delete_stmt_decisions_with_df(df))
        conn.execute(stmt)
        stmt = t_paragraph.delete().where(delete_stmt_decisions_with_df(df))
        conn.execute(stmt)
        for idx, row in df.iterrows():
            if idx % 50 == 0:
                self.logger.info(f'Saving decision {idx + 1} from chunk')
            # BUGFIX: the old guard also tested `row['sections'].keys is None`,
            # which compares the bound method object to None and can never be
            # true; only the None check on the mapping itself is meaningful.
            if row['sections'] is None:
                continue
            decision_id_str = str(row['decision_id'])
            if decision_id_str == '':
                continue
            for k in row['sections'].keys():
                section_type_id = k.value
                # insert section
                section_dict = {
                    "decision_id": decision_id_str,
                    "section_type_id": section_type_id,
                    "section_text": row['sections'][k]
                }
                stmt = t.insert().returning(text("section_id")).values(
                    [section_dict])
                section_id = conn.execute(stmt).fetchone()['section_id']
                # Add num tokens
                tokens_per_section = {
                    'section_id': str(section_id),
                    'num_tokens_spacy': row[k.name + '_spacy'],
                    'num_tokens_bert': row[k.name + '_bert']
                }
                stmt = t_num_tokens.insert().values([tokens_per_section])
                conn.execute(stmt)
                # Add all non-empty paragraphs in one bulk insert
                paragraph_dicts = []
                for paragraph in row['sections'][k]:
                    paragraph = paragraph.strip()
                    if len(paragraph) == 0:
                        continue
                    paragraph_dicts.append({
                        'section_id': str(section_id),
                        "decision_id": decision_id_str,
                        'paragraph_text': paragraph,
                        'first_level': None,
                        'second_level': None,
                        'third_level': None
                    })
                if len(paragraph_dicts) > 0:
                    stmt = t_paragraph.insert().values(paragraph_dicts)
                    conn.execute(stmt)
def save_from_text_to_database(engine: Engine, df: pd.DataFrame):
    """Save scraped decisions into the ``file``, ``decision`` and
    ``file_number`` tables.

    Saving these fields
        Column('language', String),
        Column('chamber', String),
        Column('date', Date),
        Column('file_name', String),
        Column('file_number', String),
        Column('file_number_additional', String),
        Column('html_url', String),
        Column('html_raw', String),
        Column('pdf_url', String),
        Column('pdf_raw', String),

    :param engine: SQLAlchemy engine connected to the target database
    :param df: one chunk of scraped decisions
    """

    def save_to_db(df: pd.DataFrame, table: str):
        # If the returned df is not a DataFrame but a Series, then convert it
        # into a dataframe and transpose it. (Not needed for most courts, but
        # an edge case needs it.)
        if not isinstance(df, pd.DataFrame):
            df = df.to_frame()
            df = df.T
        df.to_sql(table, engine, if_exists="append", index=False)

    def add_ids_to_df_for_decision(series: pd.DataFrame) -> pd.DataFrame:
        # BUGFIX: queries are now parameterized — file names and chamber
        # strings are scraped data, and the previous f-string SQL broke on
        # (or could be abused via) values containing quotes.
        series['file_id'] = pd.read_sql(
            text("SELECT file_id FROM file WHERE file_name = :file_name"),
            engine.connect(),
            params={"file_name": series['file_name']})["file_id"][0]
        series['language_id'] = -1
        chamber_id = pd.read_sql(
            text("SELECT chamber_id FROM chamber WHERE chamber_string = :chamber"),
            engine.connect(),
            params={"chamber": series['chamber']})['chamber_id']
        if len(chamber_id) == 0:
            # BUGFIX: the explanation is now carried by the exception itself
            # instead of a bare print() followed by an argument-less ValueError.
            raise ValueError(
                f"The chamber {series['chamber']} was not found in the database. "
                f"Add it with the respective court and spider")
        series['chamber_id'] = chamber_id[0]
        # Deterministic UUID derived from the file name, so re-scraping the
        # same file yields the same decision_id.
        series['decision_id'] = uuid.uuid5(uuid.UUID(int=0),
                                           series['file_name'])
        # TODO: Add topic recognition, similar to the title of the court decision
        series['topic'] = ''
        return series

    def save_the_file_numbers(series: pd.DataFrame) -> pd.DataFrame:
        """ Saves the file_number for each of the decision ids
        :param series:
        :return:
        """
        series['decision_id'] = pd.read_sql(
            text("SELECT decision_id FROM decision WHERE file_id = :file_id"),
            engine.connect(),
            params={"file_id": str(series['file_id'])})["decision_id"][0]
        with engine.connect() as conn:
            t = Table('file_number', MetaData(), autoload_with=engine)
            # Delete and reinsert as no upsert command is available
            stmt = t.delete().where(delete_stmt_decisions_with_df(series))
            conn.execute(stmt)
        series['text'] = series['file_number'].strip()  # .map(lambda x: x.strip())
        save_to_db(series[['decision_id', 'text']], 'file_number')
        if ('file_number_additional' in series
                and series['file_number_additional'] is not None
                and len(series['file_number_additional']) > 0):
            series['text'] = series['file_number_additional'].strip()
            save_to_db(series[['decision_id', 'text']], 'file_number')
        return series

    if df.empty:
        return

    # Delete old decision and file entries before reinserting
    with engine.connect() as conn:
        t_fil = Table('file', MetaData(), autoload_with=engine)
        t_dec = Table('decision', MetaData(), autoload_with=engine)
        file_name_list = ','.join(
            ["'" + str(item) + "'" for item in df['file_name'].tolist()])
        stmt = t_fil.select().where(text(f"file_name in ({file_name_list})"))
        file_ids = [item['file_id'] for item in conn.execute(stmt).all()]
        if len(file_ids) > 0:
            file_ids_list = ','.join(
                ["'" + str(item) + "'" for item in file_ids])
            stmt = t_dec.delete().where(text(f"file_id in ({file_ids_list})"))
            conn.execute(stmt)
            stmt = t_fil.delete().where(text(f"file_id in ({file_ids_list})"))
            conn.execute(stmt)

    save_to_db(df[['file_name', 'html_url', 'pdf_url', 'html_raw', 'pdf_raw']],
               'file')
    df = df.apply(add_ids_to_df_for_decision, 1)
    # Convert pandas NaT values (Non-Type for Datetime) to None using np as np
    # recognizes these types. (np.nan replaces the np.NaN alias removed in
    # NumPy 2.0.)
    df = df.replace({np.nan: None})
    df['date'] = df['date'].replace(r'^\s*$', None, regex=True)
    df['date'] = df['date'].astype('datetime64[ns]')
    save_to_db(df[['language_id', 'chamber_id', 'file_id', 'date', 'topic']],
               'decision')
    df.apply(save_the_file_numbers, 1)
def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
    """Persist the parties of each decision into ``person`` and ``party``.

    party_type_id mapping:
        'plaintiff': 1,
        'defendant': 2,
        'representation_plaintiff': 3,
        'representation_defendant': 4

    :param df: decisions to save; rows need ``decision_id`` and a JSON
        ``parties`` string with ``plaintiffs`` / ``defendants`` lists
    :param engine: SQLAlchemy engine connected to the target database
    """

    def _insert_person(conn, t_person, person) -> int:
        """Insert one person row and return the generated person_id."""
        person_dict = {
            "name": person['name'].strip(),
            "is_natural_person": person['legal_type'] == 'natural person',
            "gender": person['gender']
        }
        stmt = t_person.insert().returning(
            text("person_id")).values([person_dict])
        return conn.execute(stmt).fetchone()['person_id']

    with engine.connect() as conn:
        t_person = Table('person', MetaData(), autoload_with=conn)
        t_party = Table('party', MetaData(), autoload_with=conn)
        # Delete and reinsert as no upsert command is available.
        # BUGFIX: the delete (covering every decision in df) used to run per
        # row inside the loop, removing parties already inserted for previous
        # rows; it now runs once before any insert.
        stmt = t_party.delete().where(delete_stmt_decisions_with_df(df))
        conn.execute(stmt)
        for _, row in df.iterrows():
            decision_id = str(row['decision_id'])
            parties = json.loads(row['parties'])
            # offset 0 -> plaintiff side (party types 1/3),
            # offset 1 -> defendant side (party types 2/4)
            for offset, side in enumerate(('plaintiffs', 'defendants')):
                # BUGFIX: the defendants loop used to build its person dict
                # from the stale `plaintiff` loop variable (copy-paste error);
                # both sides now insert their own data. Missing lists no
                # longer crash the iteration.
                for party in parties.get(side) or []:
                    person_id = _insert_person(conn, t_person, party)
                    stmt = t_party.insert().values([{
                        "decision_id": decision_id,
                        "person_id": person_id,
                        "party_type_id": 1 + offset
                    }])
                    conn.execute(stmt)
                    # Insert their representation into person and party tables
                    for representant in party.get('legal_counsel') or []:
                        person_id = _insert_person(conn, t_person, representant)
                        stmt = t_party.insert().values([{
                            "decision_id": decision_id,
                            "person_id": person_id,
                            "party_type_id": 3 + offset
                        }])
                        conn.execute(stmt)
    # Remove persons no longer referenced by any judicial_person or party row.
    with engine.connect() as conn:
        stmt = t_person.delete().where(text(
            "NOT EXISTS (SELECT FROM judicial_person jp WHERE jp.person_id = person.person_id UNION SELECT FROM party WHERE party.person_id = person.person_id)"))
        conn.execute(stmt)
def save_data_to_database(self, df: pd.DataFrame, engine: Engine):
    """Persist the court composition (president, judges, clerks) of each
    decision into ``person`` and ``judicial_person``.

    judicial_person_type_id: 1 for judges (incl. president), 2 for clerks.

    :param df: decisions to save; rows may carry a ``court_composition``
    :param engine: SQLAlchemy engine connected to the target database
    """

    def _insert_judicial_person(conn, t_person, t_jud_person, decision_id,
                                person, type_id, is_president):
        """Insert one person plus the judicial_person row linking it.

        Replaces three verbatim copies of the same insert sequence.
        """
        person_row = {
            "name": person.name.strip(),
            "is_natural_person": True,
            "gender": person.gender.value[0] if person.gender else None
        }
        stmt = t_person.insert().returning(
            text("person_id")).values([person_row])
        person_id = conn.execute(stmt).fetchone()['person_id']
        stmt = t_jud_person.insert().values([{
            "decision_id": decision_id,
            "person_id": person_id,
            "judicial_person_type_id": type_id,
            "is_president": is_president
        }])
        conn.execute(stmt)

    with engine.connect() as conn:
        t_person = Table('person', MetaData(), autoload_with=conn)
        t_jud_person = Table('judicial_person', MetaData(), autoload_with=conn)
        # Delete and reinsert as no upsert command is available
        stmt = t_jud_person.delete().where(delete_stmt_decisions_with_df(df))
        conn.execute(stmt)
        for _, row in df.iterrows():
            if 'court_composition' not in row or row[
                    'court_composition'] is None:
                continue
            court_composition: CourtComposition = row['court_composition']
            decision_id = str(row['decision_id'])
            president = court_composition.president
            if president:
                # create president person
                _insert_judicial_person(conn, t_person, t_jud_person,
                                        decision_id, president, 1, True)
            # create all judges
            for judge in court_composition.judges:
                if judge == president:
                    # President is already created above
                    continue
                _insert_judicial_person(conn, t_person, t_jud_person,
                                        decision_id, judge, 1, False)
            # create all clerks
            for clerk in court_composition.clerks:
                _insert_judicial_person(conn, t_person, t_jud_person,
                                        decision_id, clerk, 2, False)
        # BUGFIX: this orphan-person cleanup was previously built but never
        # executed; its WHERE text also began with a literal "WHERE" (yielding
        # "WHERE WHERE" in the rendered SQL) and referenced an undefined
        # alias ``a``. It now refers to the person table itself and runs.
        where = (
            "NOT EXISTS (SELECT 1 FROM judicial_person b WHERE person.person_id = b.person_id) "
            "AND NOT EXISTS (SELECT 1 FROM party c WHERE person.person_id = c.person_id)")
        stmt = t_person.delete().where(text(where))
        conn.execute(stmt)