def test_all_required_keys_no_extra_keys(self):
    """Check each reporter entry for missing and unexpected keys.

    Every entry in REPORTERS must carry all of the required keys, and may
    only carry keys that are either required or optional.
    """
    required = [
        "cite_type",
        "editions",
        "mlz_jurisdiction",
        "name",
        "variations",
    ]
    optional = ["publisher", "notes", "href"]
    allowed = required + optional

    for abbv, entries in REPORTERS.items():
        for entry in entries:
            # Stop at the first required key that is absent.
            for field in required:
                if field not in entry:
                    self.fail(
                        "Reporter '%s' lacks required field '%s'"
                        % (abbv, field)
                    )

            # Anything present must be on the allowed list.
            for key in entry:
                self.assertIn(
                    key,
                    allowed,
                    "Reporter '%s' has an unknown field '%s'" % (abbv, key),
                )
def test_all_required_keys_no_extra_keys(self):
    """Verify the key set of every reporter entry.

    Fails when an entry is missing one of the required keys, or when it
    contains a key that is neither required nor optional.
    """
    must_have = [
        "cite_type",
        "editions",
        "mlz_jurisdiction",
        "name",
        "variations",
    ]
    may_have = ["publisher", "notes", "href"]
    known = must_have + may_have

    # Walk the entries as flat (abbreviation, entry) pairs.
    pairs = (
        (abbv, entry)
        for abbv, entries in REPORTERS.items()
        for entry in entries
    )
    for abbv, entry in pairs:
        # Required keys: report the first one that is missing.
        for name in must_have:
            if name not in entry:
                self.fail(
                    "Reporter '%s' lacks required field '%s'" % (abbv, name)
                )

        # Extra keys: each key present has to be a known one.
        for present in entry:
            self.assertIn(
                present,
                known,
                "Reporter '%s' has an unknown field '%s'" % (abbv, present),
            )
def test_fields_tidy(self):
    """Check reporter strings for stray punctuation and whitespace.

    - Reporter abbreviations, edition keys, and variation keys/values may
      only contain spaces, ASCII letters and digits, and . , - ' & ( )
    - No string emitted by emit_strings(REPORTERS) may start or end with
      whitespace.
    """
    disallowed = r"[^ 0-9a-zA-Z.,\-'&()]"
    template = "Got bad punctuation in: %s"

    def scrub(text):
        # Strip surrounding whitespace, then drop every disallowed character.
        return re.sub(disallowed, "", text.strip())

    for abbv, entries in REPORTERS.items():
        self.assertEqual(abbv, scrub(abbv), msg=template % abbv)

        for entry in entries:
            for edition_key in entry["editions"]:
                self.assertEqual(
                    scrub(edition_key), edition_key, msg=template % edition_key
                )

            for variant, target in entry["variations"].items():
                self.assertEqual(
                    scrub(variant), variant, msg=template % variant
                )
                self.assertEqual(scrub(target), target, msg=template % target)

    for text in emit_strings(REPORTERS):
        stripped = text.strip()
        self.assertEqual(
            stripped,
            text,
            msg="Fields needs whitespace stripped: '%s'" % text,
        )
def test_all_reporters_have_valid_cite_type(self):
    """Check that each reporter entry's cite_type is in VALID_CITE_TYPES."""
    # Flatten REPORTERS into (abbreviation, entry) pairs.
    pairs = (
        (abbv, entry)
        for abbv, entries in REPORTERS.items()
        for entry in entries
    )
    for abbv, entry in pairs:
        cite_type = entry["cite_type"]
        self.assertIn(
            cite_type,
            VALID_CITE_TYPES,
            "%s did not have a valid cite_type value" % abbv,
        )
def test_any_keys_missing_editions(self):
    """Check that each REPORTERS key appears among its entries' editions.

    For every key in REPORTERS, every entry listed under that key must have
    the key itself inside its 'editions' container.
    """
    pairs = (
        (abbv, entry)
        for abbv, entries in REPORTERS.items()
        for entry in entries
    )
    for abbv, entry in pairs:
        editions = entry["editions"]
        self.assertIn(
            abbv,
            editions,
            msg="Could not find edition for key: %s" % abbv,
        )
def test_any_keys_missing_editions(self):
    """Ensure no reporter key is absent from its own entries' editions."""
    for key, books in REPORTERS.items():
        # Every book filed under this key must list the key as an edition.
        for book in books:
            known_editions = book["editions"]
            self.assertIn(
                key,
                known_editions,
                msg="Could not find edition for key: %s" % key,
            )
def make_csv():
    """Write every reporter entry in REPORTERS to ``reporters.csv``.

    The file is created (or overwritten) in the current working directory.
    A header row built from FIELDNAMES is written first, followed by one row
    per entry in each reporter list. A progress line is printed for every
    REPORTERS key.

    Each row starts from the dict returned by ``make_editions_dict`` for the
    entry's editions and is then filled in with the citation key, name,
    cite type, jurisdictions, variation keys, and the optional publisher,
    href and notes (empty string when absent).
    """
    # The csv module writes its own line terminators, so the file has to be
    # opened with newline=''. Without it, platforms that translate "\n" to
    # "\r\n" emit "\r\r\n" and the output shows blank rows between records.
    with open('reporters.csv', 'w', newline='') as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter['editions'])
                d['citation'] = cite
                d['name'] = reporter['name']
                d['publisher'] = reporter.get('publisher', '')
                d['cite_type'] = reporter['cite_type']
                d['mlz_jurisdictions'] = ", ".join(reporter['mlz_jurisdiction'])
                # Only the variation strings (the keys) are exported.
                d['variations'] = ", ".join(reporter['variations'])
                d['href'] = reporter.get('href', '')
                d['notes'] = reporter.get('notes', '')
                out.writerow(d)
def test_all_required_keys_no_extra_keys(self):
    """Validate the keys and string values of every reporter entry.

    For each entry in REPORTERS:
    - all required keys must be present;
    - no key outside the required/optional lists may be present;
    - no string-typed value may be the empty string.
    """
    required = [
        "cite_type",
        "editions",
        "mlz_jurisdiction",
        "name",
        "variations",
    ]
    optional = [
        "cite_format",
        "publisher",
        "notes",
        "href",
        "regexes",
        "examples",
    ]
    allowed = required + optional

    for abbv, entries in REPORTERS.items():
        for entry in entries:
            # Required keys: fail on the first one that is missing.
            for field in required:
                if field not in entry:
                    self.fail(
                        "Reporter '%s' lacks required field '%s'"
                        % (abbv, field)
                    )

            # Unknown keys are not allowed.
            for key in entry:
                self.assertIn(
                    key,
                    allowed,
                    "Reporter '%s' has an unknown field '%s'" % (abbv, key),
                )

            # String values must not be empty; other types are not checked.
            for key, value in entry.items():
                if not isinstance(value, str):
                    continue
                self.assertTrue(
                    value != "",
                    msg="Field '%s' is empty in reporter '%s'" % (key, abbv),
                )
def make_csv():
    """Write every reporter entry in REPORTERS to ``reporters.csv``.

    The file is created (or overwritten) in the current working directory.
    A header row built from FIELDNAMES is written first, followed by one row
    per entry in each reporter list. A progress line is printed for every
    REPORTERS key.

    Each row starts from the dict returned by ``make_editions_dict`` for the
    entry's editions and is then filled in with the citation key, name,
    cite type, jurisdictions, variation keys, and the optional publisher,
    href and notes (empty string when absent).
    """
    # The csv module writes its own line terminators, so the file has to be
    # opened with newline="". Without it, platforms that translate "\n" to
    # "\r\n" emit "\r\r\n" and the output shows blank rows between records.
    with open("reporters.csv", "w", newline="") as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter["editions"])
                d["citation"] = cite
                d["name"] = reporter["name"]
                d["publisher"] = reporter.get("publisher", "")
                d["cite_type"] = reporter["cite_type"]
                d["mlz_jurisdictions"] = ", ".join(
                    reporter["mlz_jurisdiction"]
                )
                # Only the variation strings (the keys) are exported.
                d["variations"] = ", ".join(reporter["variations"])
                d["href"] = reporter.get("href", "")
                d["notes"] = reporter.get("notes", "")
                out.writerow(d)
def make_csv():
    """Write every reporter entry in REPORTERS to ``reporters.csv``.

    The file is created (or overwritten) in the current working directory.
    A header row built from FIELDNAMES is written first, followed by one row
    per entry in each reporter list. A progress line is printed for every
    REPORTERS key.

    Each row starts from the dict returned by ``make_editions_dict`` for the
    entry's editions and is then filled in with the citation key, name,
    cite type, jurisdictions, variation keys, and the optional publisher,
    href and notes (empty string when absent).
    """
    # The csv module writes its own line terminators, so the file has to be
    # opened with newline=''. Without it, platforms that translate "\n" to
    # "\r\n" emit "\r\r\n" and the output shows blank rows between records.
    with open('reporters.csv', 'w', newline='') as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter['editions'])
                d['citation'] = cite
                d['name'] = reporter['name']
                d['publisher'] = reporter.get('publisher', '')
                d['cite_type'] = reporter['cite_type']
                d['mlz_jurisdictions'] = ", ".join(
                    reporter['mlz_jurisdiction']
                )
                # Only the variation strings (the keys) are exported.
                d['variations'] = ", ".join(reporter['variations'])
                d['href'] = reporter.get('href', '')
                d['notes'] = reporter.get('notes', '')
                out.writerow(d)
def _populate_reporter_extractors():
    """Populate EXTRACTORS and EDITIONS_LOOKUP.

    Works in two steps:

    1. For every edition found in REPORTERS, LAWS and JOURNALS, expand the
       edition's regex templates into concrete regexes (one for the exact
       edition name, one for its variations when there are any), group the
       editions by the resulting regex, and append one TokenExtractor per
       distinct regex to EXTRACTORS.
    2. Append a fixed set of extractors for id., supra, paragraph,
       stop-word and section tokens.

    Takes no arguments and has no return statement. Both EXTRACTORS and
    EDITIONS_LOOKUP are defined outside this function and are only ever
    appended to here; nothing in this function clears them first.
    """
    # Set up regex replacement variables from reporters-db.
    # Work on a deep copy so RAW_REGEX_VARIABLES itself is left unmodified,
    # then set the "" entry of "full_cite" and "page" before processing.
    raw_regex_variables = deepcopy(RAW_REGEX_VARIABLES)
    raw_regex_variables["full_cite"][""] = "$volume $reporter,? $page"
    raw_regex_variables["page"][""] = rf"(?P<page>{PAGE_NUMBER_REGEX})"
    regex_variables = process_variables(raw_regex_variables)

    def _substitute_edition(template, *edition_names):
        """Helper to replace $edition in template with edition_names.

        The names are regex-escaped and joined with "|", so the substituted
        text is an alternation of literal edition strings.
        """
        edition = "|".join(re.escape(e) for e in edition_names)
        # safe_substitute is used rather than substitute, so only $edition
        # needs to be supplied here.
        return Template(template).safe_substitute(edition=edition)

    # Extractors step one: add an extractor for each reporter string

    # Build a lookup of regex -> edition.
    # Keys in this dict will be regular expressions to handle a
    # particular reporter string, like (simplified)
    # r"(?P<volume>\d+) (?P<reporter>U\.S\.) (?P<page>\d+)"
    editions_by_regex = defaultdict(
        # Values in this dict will be:
        lambda: {
            # Exact matches. If the regex is "\d+ U.S. \d+",
            # this will be [Edition("U.S.")]
            "editions": [],
            # Variants. If the regex matches "\d+ U. S. \d+",
            # this will be [Edition("U.S.")]
            "variations": [],
            # Strings a text must contain for this regex to match.
            # If the regex is "\d+ S.E. 2d \d+",
            # this will be {"S.E. 2d"}
            "strings": set(),
            # Whether this regex results in a short cite:
            "short": False,
        }
    )

    def _add_regex(
        kind: str,
        reporters: List[str],
        edition: Edition,
        regex: str,
    ):
        """Helper to generate citations for a reporter
        and insert into editions_by_regex.

        kind is the key of the editions_by_regex entry to append to; the
        callers in this function pass "editions" or "variations".
        reporters are the reporter strings handled by regex; each one also
        gets edition appended to its EDITIONS_LOOKUP list.
        """
        for reporter in reporters:
            EDITIONS_LOOKUP[reporter].append(edition)
        editions_by_regex[regex][kind].append(edition)

        # add strings
        # Only record the reporter strings as required substrings when the
        # escaped form of the first one appears verbatim in the regex.
        have_strings = re.escape(reporters[0]) in regex
        if have_strings:
            editions_by_regex[regex]["strings"].update(reporters)

        # add short cite
        # When short_cite_re returns something different from the input,
        # register the edition under that regex too and flag it as short.
        short_cite_regex = short_cite_re(regex)
        if short_cite_regex != regex:
            editions_by_regex[short_cite_regex][kind].append(edition)
            editions_by_regex[short_cite_regex]["short"] = True
            if have_strings:
                editions_by_regex[short_cite_regex]["strings"].update(
                    reporters
                )

    def _add_regexes(
        regex_templates: List[str],
        edition_name: str,
        edition: Edition,
        variations: List[str],
    ):
        """Expand regex_templates and add to editions_by_regex.

        Each template is expanded with regex_variables, then registered once
        for the exact edition_name and, if variations is non-empty, once
        more as a single regex covering all of the variations.
        """
        for regex_template in regex_templates:
            regex_template = recursive_substitute(
                regex_template, regex_variables
            )
            regex = _substitute_edition(regex_template, edition_name)
            _add_regex("editions", [edition_name], edition, regex)
            if variations:
                regex = _substitute_edition(regex_template, *variations)
                _add_regex(
                    "variations",
                    variations,
                    edition,
                    regex,
                )

    # add reporters.json:
    # Each REPORTERS value is iterated as a cluster of sources, and each
    # source has its own "editions" mapping; one Edition is built per entry.
    for source_key, source_cluster in REPORTERS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="reporters",
            )
            variations = source["variations"]
            for edition_name, edition_data in source["editions"].items():
                edition = Edition(
                    short_name=edition_name,
                    reporter=reporter_obj,
                    start=edition_data["start"],
                    end=edition_data["end"],
                )
                # Fall back to the generic full-cite template when the
                # edition has no "regexes" entry or an empty one.
                regex_templates = edition_data.get("regexes") or ["$full_cite"]
                # Keep only the variation strings whose value is this
                # edition's name.
                edition_variations = [
                    k for k, v in variations.items() if v == edition_name
                ]
                _add_regexes(
                    regex_templates, edition_name, edition, edition_variations
                )

    # add laws.json
    # Unlike reporters, each source here yields a single Edition named after
    # source_key, with start/end read from the source itself.
    for source_key, source_cluster in LAWS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="laws",
            )
            edition = Edition(
                short_name=source_key,
                reporter=reporter_obj,
                start=source["start"],
                end=source["end"],
            )
            regex_templates = source.get("regexes") or ["$full_cite"]
            # handle citation to multiple sections, like
            # "Mass. Gen. Laws ch. 1, §§ 2-3":
            regex_templates = [
                r.replace(r"§ ", r"§§? ?") for r in regex_templates
            ]
            _add_regexes(
                regex_templates,
                source_key,
                edition,
                source.get("variations", []),
            )

    # add journals.json
    # Same shape as the laws loop above, minus the section-symbol rewrite.
    for source_key, source_cluster in JOURNALS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="journals",
            )
            edition = Edition(
                short_name=source_key,
                reporter=reporter_obj,
                start=source["start"],
                end=source["end"],
            )
            regex_templates = source.get("regexes") or ["$full_cite"]
            _add_regexes(
                regex_templates,
                source_key,
                edition,
                source.get("variations", []),
            )

    # Add each regex to EXTRACTORS:
    # One extractor per distinct regex; the grouped editions and the short
    # flag are passed along via `extra`, the required substrings via
    # `strings`.
    for regex, cluster in editions_by_regex.items():
        EXTRACTORS.append(
            TokenExtractor(
                nonalphanum_boundaries_re(regex),
                CitationToken.from_match,
                extra={
                    "exact_editions": cluster["editions"],
                    "variation_editions": cluster["variations"],
                    "short": cluster["short"],
                },
                strings=list(cluster["strings"]),
            )
        )

    # Extractors step two:
    # Add a few one-off extractors to handle special token types
    # other than citations:
    EXTRACTORS.extend(
        [
            # Id.
            TokenExtractor(
                ID_REGEX,
                IdToken.from_match,
                flags=re.I,
                strings=["id.", "ibid."],
            ),
            # supra
            TokenExtractor(
                SUPRA_REGEX,
                SupraToken.from_match,
                flags=re.I,
                strings=["supra"],
            ),
            # paragraph
            TokenExtractor(
                PARAGRAPH_REGEX,
                ParagraphToken.from_match,
            ),
            # case name stopwords
            TokenExtractor(
                STOP_WORD_REGEX,
                StopWordToken.from_match,
                flags=re.I,
                strings=STOP_WORDS,
            ),
            # tokens containing section symbols
            TokenExtractor(
                SECTION_REGEX, SectionToken.from_match, strings=["§"]
            ),
        ]
    )
def iter_reporters():
    """Yield one tuple per reporter entry in REPORTERS.

    Each tuple is (abbreviation, full list of entries filed under that
    abbreviation, the individual entry).
    """
    triples = (
        (abbv, entries, entry)
        for abbv, entries in REPORTERS.items()
        for entry in entries
    )
    for triple in triples:
        yield triple