def update_from_cl_data(self):
    # Only jurisdictions listed in include_text_for have their full opinion text ingested.
    include_text_for = set(self.include_text_for)
    for jur_name in self.jurisdictions:
        include_text = jur_name in include_text_for
        Logger.info(
            f"Adding cluster data for jurisdiction {jur_name} to database..."
        )
        self.process_cluster_data(
            self.__get_resource_dir_path(CLUSTER_PATH, jur_name),
            jurisdiction=jur_name,
        )
        Logger.info(
            f"Adding opinion data for jurisdiction {jur_name} to database..."
        )
        self.process_opinion_data(
            self.__get_resource_dir_path(OPINION_PATH, jur_name),
            include_text=include_text,
            jurisdiction=jur_name,
        )
    Logger.info("Adding citation data to database...")
    self.process_citation_data(
        get_full_path(os.path.join(BASE_CL_DIR, CITATIONS_PATH))
    )
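# Minimal usage sketch for the updater above. The class name and constructor
# arguments are hypothetical; only update_from_cl_data(), jurisdictions, and
# include_text_for are taken from the method itself.
#
#   updater = CourtListenerUpdater(
#       jurisdictions=["scotus"],        # jurisdictions to ingest
#       include_text_for=["scotus"],     # jurisdictions whose opinion text is stored
#   )
#   updater.update_from_cl_data()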
def role_dict_from_file() -> RoleDict:
    # Key: role identifier, Value: list of resource IDs with that role
    role_dict = {}
    with open(get_full_path(STRUCTURAL_ROLE_FILE_PATH), "r") as role_file:
        role_csv_reader = csv.reader(role_file)
        next(role_csv_reader)  # First row is a header
        for resource_id, role_id in role_csv_reader:
            if role_id not in role_dict:
                role_dict[role_id] = []
            role_dict[role_id].append(resource_id)
    return role_dict
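# Minimal usage sketch for role_dict_from_file(). It assumes RoleDict maps a
# role identifier to the list of resource IDs carrying that role, which is how
# the function above builds it; the print formatting is illustrative only.
#
#   roles = role_dict_from_file()
#   for role_id, resource_ids in roles.items():
#       print(f"role {role_id}: {len(resource_ids)} resources")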
def get_citation_csv(self):
    Logger.info("Downloading citations CSV...")
    # The remote citations file is a gzip-compressed CSV, so it only needs to be
    # gunzipped before being written to disk.
    gz_file_bytes = urllib.request.urlopen(
        f"{BASE_URL}/{REMOTE_CITATIONS_PATH}"
    ).read()
    Logger.info("Completed citations data download, extracting...")
    decompressed_file_path = get_full_path(
        os.path.join(BASE_CL_DIR, CITATIONS_PATH)
    )
    file_contents = gzip.GzipFile(fileobj=io.BytesIO(gz_file_bytes)).read()
    with open(decompressed_file_path, "wb") as decompressed_file:
        decompressed_file.write(file_contents)
    Logger.info("Completed extraction of citations data...")
def create_citations_csv():
    with get_session() as s:
        citations = s.execute(
            select(
                Citation.citing_opinion_id,
                Citation.cited_opinion_id,
                Citation.depth,
            )
        ).all()
    print("Fetched citations, writing to file...")
    # newline="" keeps csv.writer from emitting blank lines on Windows; the large
    # buffer cuts down on write syscalls for the multi-million-row file.
    with open(
        get_full_path(CITATION_CSV_PATH), "w", buffering=1024 * 1024, newline=""
    ) as citation_file:
        csv_writer = csv.writer(citation_file)
        for i, citation in enumerate(citations):
            csv_writer.writerow(citation)
            if i != 0 and i % 1_000_000 == 0:
                print(f"Completed {i} rows...")
def __get_resource_dir_path(self, resource_type: str, jur_name: str):
    return get_full_path(os.path.join(BASE_CL_DIR, resource_type, jur_name))
def populate_opinion_text(opinions_dir):
    batch = 0
    opinions = Opinion.select().order_by(Opinion.resource_id).limit(1000)
    # Snapshot the directory listing once so each opinion lookup is an in-memory
    # set check rather than a filesystem call.
    dir_files = {
        os.fsdecode(filename) for filename in os.listdir(os.fsencode(opinions_dir))
    }
    while opinions.count() > 0:
        for opinion in opinions:
            json_filename = f"{opinion.resource_id}.json"
            if json_filename not in dir_files:
                print(f"Could not find json for opinion ID {opinion.resource_id}")
                continue
            json_file_path = os.path.join(opinions_dir, json_filename)
            with open(json_file_path, encoding="utf8", mode="r") as opinion_json_file:
                opinion_data = json.load(opinion_json_file)
            opinion.html_text = get_html_text(opinion_data)
            if opinion.html_text is None:
                print(f"{opinion.resource_id} has no html_text")
        Opinion.bulk_update(opinions, fields=[Opinion.html_text], batch_size=100)
        batch += 1
        print(f"Finished adding text for {batch * 1000} opinions...")
        # Page through the table in resource_id order, 1000 opinions at a time.
        opinions = (
            Opinion.select()
            .order_by(Opinion.resource_id)
            .offset(batch * 1000)
            .limit(1000)
        )


if __name__ == "__main__":
    add_opinion_text_col()
    populate_opinion_text(get_full_path(r"data/scotus_opinions"))
def get_citation_iterable():
    # Note: the file handle is left open so the returned reader can be consumed
    # lazily by the caller; it is only closed when the process exits.
    csv_file = open(get_full_path(r"data/citations.csv"), "r")
    csv_reader = csv.reader(csv_file, delimiter=",")
    next(csv_reader)  # Skip header row
    return csv_reader
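# Minimal usage sketch for get_citation_iterable(). The three-column row layout
# (citing opinion ID, cited opinion ID, depth) mirrors how citations.csv is read
# elsewhere in this repo; the variable names are illustrative.
#
#   for citing_id, cited_id, depth in get_citation_iterable():
#       ...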
from db.peewee.models import *
from utils.io import get_full_path
import csv

CITATION_CSV_PATH = "data/citation_list.csv"

if __name__ == "__main__":
    citations = Citation.select(
        Citation.citing_opinion, Citation.cited_opinion, Citation.depth
    )
    # newline="" keeps csv.writer from emitting blank lines on Windows. Writing the
    # *_id attributes avoids lazily fetching the related Opinion row for every citation.
    with open(get_full_path(CITATION_CSV_PATH), "w", newline="") as citation_file:
        csv_writer = csv.writer(citation_file)
        for citation in citations:
            csv_writer.writerow(
                (citation.citing_opinion_id, citation.cited_opinion_id, citation.depth)
            )
def __get_folder_path(self, resource_type: str, jurisdiction: str):
    return get_full_path(
        os.path.join(BASE_CL_DIR, resource_type, jurisdiction))
def ingest_citation_data(citations_file):
    # Since there are only ~65,000 opinions, it's feasible to load all of the IDs into
    # memory and avoid making millions of DB queries.
    opinion_set = {o.resource_id for o in Opinion.select()}
    citation_records = []
    with open(citations_file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        for row in csv_reader:
            try:
                integer_row = [int(cell) for cell in row]
                # Only keep citations where both endpoints are opinions we ingested.
                if integer_row[0] in opinion_set and integer_row[1] in opinion_set:
                    new_record = Citation(
                        citing_opinion=integer_row[0],
                        cited_opinion=integer_row[1],
                        depth=integer_row[2],
                    )
                    citation_records.append(new_record)
            except Exception as e:
                # The header row (and any malformed row) fails the int() conversion
                # and is skipped here.
                print(f"Failure on row {row}: {e}")
    with db.atomic():
        Citation.bulk_create(citation_records, batch_size=100)


if __name__ == "__main__":
    db.connect()
    create_db_tables()
    ingest_cluster_data(get_full_path(r"data/scotus_clusters/"))
    ingest_opinion_data(get_full_path(r"data/scotus_opinions/"))
    ingest_citation_data(get_full_path(r"data/citations.csv"))
    db.close()