def update_from_cl_data(self):
    # Opinion text is only stored for jurisdictions listed in include_text_for.
    include_text_set = set(self.include_text_for)
    for jur_name in self.jurisdictions:
        include_text = jur_name in include_text_set
        Logger.info(
            f"Adding cluster data for jurisdiction {jur_name} to database..."
        )
        self.process_cluster_data(
            self.__get_resource_dir_path(CLUSTER_PATH, jur_name),
            jurisdiction=jur_name,
        )
        Logger.info(
            f"Adding opinion data for jurisdiction {jur_name} to database..."
        )
        self.process_opinion_data(
            self.__get_resource_dir_path(OPINION_PATH, jur_name),
            include_text=include_text,
            jurisdiction=jur_name,
        )
    Logger.info("Adding citation data to database...")
    self.process_citation_data(
        get_full_path(os.path.join(BASE_CL_DIR, CITATIONS_PATH)))
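The methods above reference module-level path constants (BASE_CL_DIR, CLUSTER_PATH, OPINION_PATH, CITATIONS_PATH) that this excerpt does not define. Purely for orientation, a hypothetical set of values consistent with how they are joined might look like:

# Hypothetical values for illustration only; the real constants are defined
# elsewhere in the project.
BASE_CL_DIR = "data/courtlistener"
CLUSTER_PATH = "clusters"
OPINION_PATH = "opinions"
CITATIONS_PATH = "citations.csv"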
Example #2

def role_dict_from_file() -> RoleDict:
    # Key: role identifier, Value: list of resource IDs with that role
    role_dict: RoleDict = {}
    with open(get_full_path(STRUCTURAL_ROLE_FILE_PATH), "r") as role_file:
        role_csv_reader = csv.reader(role_file)
        next(role_csv_reader)  # First row is a header
        for resource_id, role_id in role_csv_reader:
            role_dict.setdefault(role_id, []).append(resource_id)
    return role_dict
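If the reverse lookup (resource ID to role) is also needed, the mapping can be inverted in one pass. A minimal sketch, assuming each resource carries at most one structural role; resource_role_dict is a hypothetical helper, not part of the original code:

def resource_role_dict(role_dict: RoleDict) -> dict:
    # Hypothetical helper: invert role_id -> [resource_ids] into
    # resource_id -> role_id (assumes one role per resource).
    return {
        resource_id: role_id
        for role_id, resource_ids in role_dict.items()
        for resource_id in resource_ids
    }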
Example #3
def get_citation_csv(self):
    Logger.info("Downloading citations CSV...")
    # The remote file is a gzip-compressed CSV; download it in full, then
    # decompress it to disk.
    gzipped_bytes = urllib.request.urlopen(
        f"{BASE_URL}/{REMOTE_CITATIONS_PATH}").read()
    Logger.info("Completed citations data download, extracting...")
    decompressed_file_path = get_full_path(
        os.path.join(BASE_CL_DIR, CITATIONS_PATH))
    file_contents = gzip.GzipFile(
        fileobj=io.BytesIO(gzipped_bytes)).read()
    with open(decompressed_file_path, "wb") as decompressed_file:
        decompressed_file.write(file_contents)
    Logger.info("Completed extraction of citations data...")
Example #4
def create_citations_csv():
    with get_session() as s:
        citations = s.execute(
            select(
                Citation.citing_opinion_id, Citation.cited_opinion_id, Citation.depth
            )
        ).all()
    print("Fetched citations, writing to file...")
    # newline="" stops the csv module from emitting blank rows on Windows;
    # the 1 MiB buffer cuts down on write syscalls for the large output file.
    with open(
        get_full_path(CITATION_CSV_PATH), "w", buffering=1024 * 1024, newline=""
    ) as citation_file:
        csv_writer = csv.writer(citation_file)
        for i, citation in enumerate(citations, start=1):
            csv_writer.writerow(citation)
            if i % 1_000_000 == 0:
                print(f"Completed {i} rows...")
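.all() materializes every citation row in memory before writing begins. If that becomes a problem, the rows can be consumed as they stream from the database. A sketch assuming SQLAlchemy 1.4+ and the same get_session helper; create_citations_csv_streaming is a hypothetical name:

def create_citations_csv_streaming():
    with get_session() as s, open(
        get_full_path(CITATION_CSV_PATH), "w", buffering=1024 * 1024, newline=""
    ) as citation_file:
        csv_writer = csv.writer(citation_file)
        # Iterating the Result directly avoids building one huge list first.
        result = s.execute(
            select(
                Citation.citing_opinion_id, Citation.cited_opinion_id, Citation.depth
            )
        )
        for i, row in enumerate(result, start=1):
            csv_writer.writerow(row)
            if i % 1_000_000 == 0:
                print(f"Completed {i} rows...")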
Example #5

def __get_resource_dir_path(self, resource_type: str, jur_name: str):
    # Resolve the local directory holding one resource type for one jurisdiction.
    return get_full_path(os.path.join(BASE_CL_DIR, resource_type, jur_name))
Example #6
def populate_opinion_text(opinions_dir):
    # Page through opinions 1,000 at a time, filling html_text from the
    # matching JSON file on disk.
    batch = 0
    opinions = Opinion.select().order_by(Opinion.resource_id).limit(1000)
    dir_files = {
        os.fsdecode(filename) for filename in os.listdir(os.fsencode(opinions_dir))
    }
    while opinions.count() > 0:
        for opinion in opinions:
            json_filename = f"{opinion.resource_id}.json"
            if json_filename not in dir_files:
                print(f"Could not find json for opinion ID {opinion.resource_id}")
                continue
            json_file_path = os.path.join(opinions_dir, json_filename)
            with open(json_file_path, encoding="utf8", mode="r") as opinion_json_file:
                opinion_data = json.load(opinion_json_file)
                opinion.html_text = get_html_text(opinion_data)
                if opinion.html_text is None:
                    print(f"{opinion.resource_id} has no html_text")
        Opinion.bulk_update(opinions, fields=[Opinion.html_text], batch_size=100)
        batch += 1
        print(f"Finished adding text for {batch * 1000} opinions...")
        opinions = (
            Opinion.select()
            .order_by(Opinion.resource_id)
            .offset(batch * 1000)
            .limit(1000)
        )


if __name__ == "__main__":
    add_opinion_text_col()
    populate_opinion_text(get_full_path(r"data/scotus_opinions"))
Example #7

def get_citation_iterable():
    # Yield rows lazily so the file handle is closed when iteration finishes,
    # instead of leaking an open file to the caller.
    with open(get_full_path(r"data/citations.csv"), "r") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader)  # Skip header row
        yield from csv_reader
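get_citation_iterable yields one parsed row at a time, so a caller can stream the file without loading it all. Typical consumption, assuming the three-column (citing, cited, depth) layout the other examples use; process_citation is a hypothetical consumer:

# Rows come back as lists of strings; cast them as needed.
for citing_id, cited_id, depth in get_citation_iterable():
    process_citation(int(citing_id), int(cited_id), int(depth))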
Example #8
from db.peewee.models import Citation
from utils.io import get_full_path
import csv

CITATION_CSV_PATH = "data/citation_list.csv"

if __name__ == "__main__":
    # tuples() yields plain (citing_id, cited_id, depth) rows, so writing a
    # row never triggers a lazy foreign-key lookup on the related Opinion.
    citations = Citation.select(Citation.citing_opinion,
                                Citation.cited_opinion, Citation.depth).tuples()
    with open(get_full_path(CITATION_CSV_PATH), "w", newline="") as citation_file:
        csv_writer = csv.writer(citation_file)
        for citation in citations:
            csv_writer.writerow(citation)
Example #9
def __get_folder_path(self, resource_type: str, jurisdiction: str):
    return get_full_path(
        os.path.join(BASE_CL_DIR, resource_type, jurisdiction))
Example #10
def ingest_citation_data(citations_file):
    # Since there are only ~65,000 opinions, it's feasible to load all the IDs
    # into memory and avoid making millions of DB queries.
    opinion_set = {o.resource_id for o in Opinion.select()}

    citation_records = []
    with open(citations_file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader)  # Skip the header row
        for row in csv_reader:
            try:
                integer_row = [int(cell) for cell in row]
                # Keep only citations whose endpoints both exist locally.
                if integer_row[0] in opinion_set and integer_row[1] in opinion_set:
                    new_record = Citation(
                        citing_opinion=integer_row[0],
                        cited_opinion=integer_row[1],
                        depth=integer_row[2],
                    )
                    citation_records.append(new_record)
            except Exception as e:
                print(f"Failure on row {row}: {e}")
        with db.atomic():
            Citation.bulk_create(citation_records, batch_size=100)


if __name__ == "__main__":
    db.connect()
    create_db_tables()
    ingest_cluster_data(get_full_path(r"data/scotus_clusters/"))
    ingest_opinion_data(get_full_path(r"data/scotus_opinions/"))
    ingest_citation_data(get_full_path(r"data/citations.csv"))
    db.close()