def make_and_save(item, skipdupes=False, min_dates=None, testing=True):
    """Associate case data from `parse_opinions` with objects, then save them.

    :param item: Dict of case data produced by `parse_opinions` (keys include
        'dates', 'citations', 'opinions', 'panel', 'court_id', 'docket', ...).
    :param skipdupes: If True, silently skip items detected as duplicates
        instead of raising.
    :param min_dates: Optional dict mapping court IDs to dates; items whose
        main date is on or after the court's entry are skipped. When None,
        duplicate detection runs instead.
    :param testing: If True, build all objects but do not save anything.
    :raises Exception: If no usable date is found, a citation string cannot be
        parsed, or duplicates are found while `skipdupes` is False.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = \
        date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # a "decided" date only stands in for date_filed when we have
                # nothing better
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n"
                          % date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or ''
    )

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely
            # dealing with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a
            # few trivial words) in the citation, then it's not a citation at
            # all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            # Fix: `string.lowercase` is Python 2 only and raises
            # AttributeError on Python 3; `string.ascii_lowercase` exists in
            # both versions and holds the same value.
            num_letters = sum(non_trivial.count(letter)
                              for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation,
            # then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue

            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'."
                            % (c, item['court_id'], item['docket']))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished' if item['unpublished']
                             else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    # resolve panel members to Person objects, dropping names that can't be
    # matched
    panel = [find_person(n, item['court_id'], case_date=panel_date)
             for n in item['panel']]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # only the first opinion may be the lead opinion; demote later "lead"
        # opinions to addenda
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [find_person(n, item['court_id'], case_date=panel_date)
                     for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster, panel, opinions)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s"
                  % (domain, cluster.get_absolute_url()))
        except:  # deliberately broad: always roll back on *any* failure
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:  # best-effort cleanup; the original error is re-raised
                pass
            raise
def make_and_save(item, skipdupes=False, min_dates=None, start_dates=None,
                  testing=True):
    """Associate case data from `parse_opinions` with objects, then save them.

    :param item: Dict of case data produced by `parse_opinions` (keys include
        'dates', 'citations', 'opinions', 'panel', 'court_id', 'docket', ...).
    :param skipdupes: If True, silently skip items detected as duplicates
        instead of raising.
    :param min_dates: Optional dict mapping court IDs to dates; items whose
        main date is on or after the court's entry are skipped. When None,
        duplicate detection runs instead.
    :param start_dates: Optional dict mapping court IDs to founding dates;
        items dated on or before the founding date are skipped.
    :param testing: If True, build all objects but do not save anything.
    :raises Exception: If no usable date is found, a citation string cannot be
        parsed, or duplicates are found while `skipdupes` is False.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = \
        date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item["dates"]:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # a "decided" date only stands in for date_filed when we have
                # nothing better
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n"
                          % date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item["file"])

    # special rule for Kentucky
    if item["court_id"] == "kycourtapp" and main_date <= date(1975, 12, 31):
        item["court_id"] = "kycourtapphigh"

    if min_dates is not None:
        if min_dates.get(item["court_id"]) is not None:
            if main_date >= min_dates[item["court_id"]]:
                print(
                    main_date,
                    "after",
                    min_dates[item["court_id"]],
                    " -- skipping.",
                )
                return

    if start_dates is not None:
        if start_dates.get(item["court_id"]) is not None:
            if main_date <= start_dates[item["court_id"]]:
                print(
                    main_date,
                    "before court founding:",
                    start_dates[item["court_id"]],
                    " -- skipping.",
                )
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item["court_id"],
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        docket_number=item["docket"] or "",
    )

    # get citation objects in a list for addition to the cluster
    found_citations = []
    for c in item["citations"]:
        found = get_citations(clean_text(c, ["html", "inline_whitespace"]))
        if not found:
            # if the docket number --is-- citation string, we're likely
            # dealing with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item["docket"]:
                docket_no = item["docket"].lower()
                if "claim no." in docket_no:
                    docket_no = docket_no.split("claim no.")[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, "")
                docket_no = docket_no.strip(".").strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a
            # few trivial words) in the citation, then it's not a citation at
            # all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, "")
            # Fix: `string.lowercase` is Python 2 only and raises
            # AttributeError on Python 3; `string.ascii_lowercase` exists in
            # both versions and holds the same value.
            num_letters = sum(non_trivial.count(letter)
                              for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation,
            # then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue

            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'."
                            % (c, item["court_id"], item["docket"]))
        else:
            # NOTE(review): `found` comes from get_citations(); this assumes
            # it exposes a `.to_model()` yielding citation model objects —
            # confirm against the citation-lookup helper actually in use.
            found_citations.extend(found.to_model())

    cluster = OpinionCluster(
        judges=item.get("judges", "") or "",
        precedential_status=("Unpublished" if item["unpublished"]
                             else "Published"),
        date_filed=main_date,
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        source="Z",
        attorneys=item["attorneys"] or "",
        posture=item["posture"] or "",
    )
    panel = lookup_judges_by_last_name_list(item["panel"], item["court_id"],
                                            panel_date)

    opinions = []
    for i, opinion_info in enumerate(item["opinions"]):
        if opinion_info["author"] is None:
            author = None
        else:
            author = lookup_judge_by_last_name(opinion_info["author"],
                                               item["court_id"], panel_date)

        converted_text = convert_columbia_html(opinion_info["opinion"])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info["type"]]
        # only the first opinion may be the lead opinion; demote later "lead"
        # opinions to addenda
        if opinion_type == Opinion.LEAD and i > 0:
            opinion_type = Opinion.ADDENDUM

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info["per_curiam"],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info["sha1"],
            # This is surely not updated for the new S3 world. If you're
            # reading this, you'll need to update this code.
            local_path=opinion_info["local_path"],
        )
        # Fix: the joining judges live on the per-opinion dict, not on the
        # item itself (both other versions of this routine read
        # opinion_info['joining']); item["joining"] attached the wrong judges
        # to every opinion.
        joined_by = lookup_judges_by_last_name_list(opinion_info["joining"],
                                                    item["court_id"],
                                                    panel_date)
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print("Duplicate. skipping.")
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for citation in found_citations:
                citation.cluster = cluster
                citation.save()
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s"
                  % (domain, cluster.get_absolute_url()))
        except:  # deliberately broad: always roll back on *any* failure
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:  # best-effort cleanup; the original error is re-raised
                pass
            raise
def make_and_save(item, skipdupes=False, min_dates=None, start_dates=None,
                  testing=True):
    """Associate case data from `parse_opinions` with objects, then save them.

    :param item: Dict of case data produced by `parse_opinions` (keys include
        'dates', 'citations', 'opinions', 'panel', 'court_id', 'docket', ...).
    :param skipdupes: If True, silently skip items detected as duplicates
        instead of raising.
    :param min_dates: Optional dict mapping court IDs to dates; items whose
        main date is on or after the court's entry are skipped. When None,
        duplicate detection runs instead.
    :param start_dates: Optional dict mapping court IDs to founding dates;
        items dated on or before the founding date are skipped.
    :param testing: If True, build all objects but do not save anything.
    :raises Exception: If no usable date is found, a citation string cannot be
        parsed, or duplicates are found while `skipdupes` is False.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = \
        date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # a "decided" date only stands in for date_filed when we have
                # nothing better
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n"
                          % date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    # special rule for Kentucky
    if item['court_id'] == 'kycourtapp' and main_date <= date(1975, 12, 31):
        item['court_id'] = 'kycourtapphigh'

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return

    if start_dates is not None:
        if start_dates.get(item['court_id']) is not None:
            if main_date <= start_dates[item['court_id']]:
                print(main_date, 'before court founding:',
                      start_dates[item['court_id']], ' -- skipping.')
                return

    docket = Docket(source=Docket.COLUMBIA,
                    date_argued=date_argued,
                    date_reargued=date_reargued,
                    date_cert_granted=date_cert_granted,
                    date_cert_denied=date_cert_denied,
                    date_reargument_denied=date_reargument_denied,
                    court_id=item['court_id'],
                    case_name_short=item['case_name_short'] or '',
                    case_name=item['case_name'] or '',
                    case_name_full=item['case_name_full'] or '',
                    docket_number=item['docket'] or '')

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely
            # dealing with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a
            # few trivial words) in the citation, then it's not a citation at
            # all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            # Fix: `string.lowercase` is Python 2 only and raises
            # AttributeError on Python 3; `string.ascii_lowercase` exists in
            # both versions and holds the same value.
            num_letters = sum(non_trivial.count(letter)
                              for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation,
            # then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue

            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'."
                            % (c, item['court_id'], item['docket']))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished' if item['unpublished']
                             else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map)
    # resolve panel members to Person objects, dropping names that can't be
    # matched
    panel = [
        find_person(n, item['court_id'], case_date=panel_date)
        for n in item['panel']
    ]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # only the first opinion may be the lead opinion; demote later "lead"
        # opinions to addenda
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [
            find_person(n, item['court_id'], case_date=panel_date)
            for n in opinion_info['joining']
        ]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s"
                  % (domain, cluster.get_absolute_url()))
        except:  # deliberately broad: always roll back on *any* failure
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:  # best-effort cleanup; the original error is re-raised
                pass
            raise