def test_extract_citations(case_factory, tmpdir, settings, elasticsearch):
    """Check which cites in a case body end up as ExtractedCitation rows.

    One case dated 2000-01-01 is created with a body that mixes cites expected
    to be extracted and cites expected to be ignored. After
    ``fabfile.extract_all_citations()`` runs, the stored raw cites and
    normalized cites must equal the expected sets, and every stored cite must
    point back to that case via ``cited_by_id``.
    """
    from scripts.extract_cites import EDITIONS as processed_editions

    settings.MISSED_CITATIONS_DIR = str(tmpdir)

    # Reporter keys for which every processed edition starts after 2000; cites
    # to these are expected to be ignored for a case decided in 2000.
    reporter_keys = list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
    blocked_by_date = {
        key
        for key in reporter_keys
        if all(edition['start_year'] > 2000 for edition in processed_editions[key])
    }

    cite_template = "1 %s 1"

    # Each spec is either a cite string that is stored as-is, or a
    # [text in case body, expected normalized source] pair.
    cite_specs = [
        "225 F. Supp. 552",  # correct
        ["125 f supp 152", "125 F. Supp. 152"],  # normalized
        ["125 Burnett (Wis.) 152", "125 Bur. 152"],  # normalized
        ["1 F. 2d 2", "1 F.2d 2"],  # not matched as "1 F. 2"
        "2 1/2 Mass. 1",  # special volume numbers
        "3 Suppl. Mass. 2",  # special volume numbers
        "1 La.App. 5 Cir. 2",  # not matched as "1 La.App. 5"
        "2000 WL 12345",  # vendor cite
    ]
    # one plain cite per known edition that is not blocked by date
    for edition_key in EDITIONS.keys():
        if edition_key not in blocked_by_date:
            cite_specs.append(cite_template % edition_key)
    # one [variation, canonical] pair per variation target not blocked by date
    for variation, targets in VARIATIONS_ONLY.items():
        if variation in blocked_by_date:
            continue
        for target in targets:
            cite_specs.append([cite_template % variation, cite_template % target])

    # Split the specs into the text placed in the case body and the set of
    # normalized cites expected afterwards.
    legitimate_cites = []
    legitimate_cites_normalized = set()
    for spec in cite_specs:
        body_text, canonical = (spec, spec) if isinstance(spec, str) else spec
        legitimate_cites.append(body_text)
        legitimate_cites_normalized.add(normalize_cite(canonical))

    illegitimate_cites = [
        "2 Dogs 3",  # unrecognized reporter
        "3 Dogs 4",  # duplicate unrecognized reporter
        "1 or 2",  # not matched as 1 Or. 2
        # NOTE(review): the reviewed copy of this file showed a line break
        # inside the next literal (after "Mass. "); confirm the exact characters.
        "word1 Mass. 2word",  # not matched if part of larger word
        "1 Mass.\n 2",  # no match across newlines
        "1 A.3d 1",  # no match to reporter that started publishing in 2010
    ]
    illegitimate_cites.extend(cite_template % key for key in blocked_by_date)

    body_text = ", some text, ".join(legitimate_cites + illegitimate_cites)
    case = case_factory(
        body_cache__text=body_text,
        decision_date=datetime(2000, 1, 1),
    )
    fabfile.extract_all_citations()
    update_elasticsearch_from_queue()

    # check extracted cites
    extracted = list(ExtractedCitation.objects.all())
    found_cites = {row.cite for row in extracted}
    found_normalized = {row.normalized_cite for row in extracted}
    assert found_cites == set(legitimate_cites)
    assert found_normalized == legitimate_cites_normalized
    assert all(row.cited_by_id == case.pk for row in extracted)
def test_case_cited_by(client, case_factory, tmpdir, settings, elasticsearch):
    """Check that the ``case_cited_by`` page lists citing cases only.

    Two cases are created whose body text is the first citation of a
    destination case, plus one case created with factory defaults. After
    citation extraction, the response for the destination case must include
    the ``name_abbreviation`` of both citing cases and exclude that of the
    third case.
    """
    settings.MISSED_CITATIONS_DIR = str(tmpdir)

    # case being cited, and the cite text used to reference it
    dest_case = case_factory()
    dest_cite = dest_case.citations.first()

    # two cases whose body is just that cite, then one unrelated case
    source_cases = []
    for _ in range(2):
        source_cases.append(case_factory(body_cache__text=dest_cite.cite))
    non_citing_case = case_factory()

    fabfile.extract_all_citations()
    update_elasticsearch_from_queue()

    cited_by_url = reverse('case_cited_by', args=[dest_case.pk], host='cite')
    response = client.get(cited_by_url)

    expected_names = [source.name_abbreviation for source in source_cases]
    unexpected_names = [non_citing_case.name_abbreviation]
    check_response(
        response,
        content_includes=expected_names,
        content_excludes=unexpected_names,
    )
def test_filter_case_cite_by(client, extracted_citation_factory, case_factory, elasticsearch):
    """Check the ``cites_to`` filter of the ``cases-list`` API endpoint.

    Four cases are created and extracted citations to '1 Mass. 1' are attached
    to all but the last of them. Filtering by the cite string, and by the id
    of the case that carries that citation, must each return exactly the
    cases with an attached extracted citation.
    """
    cite_text = '1 Mass. 1'
    search_url = api_reverse("cases-list")

    all_cases = [case_factory() for _ in range(4)]
    case_cited = case_factory(citations__cite=cite_text)

    # every case except the last one cites the target
    citing_cases = all_cases[:-1]
    for citing_case in citing_cases:
        extracted_citation_factory(cited_by=citing_case, cite=cite_text)
    update_elasticsearch_from_queue()

    expected_ids = {citing_case.id for citing_case in citing_cases}

    # get cases by cites_to=citation, then by cites_to=id
    for cites_to in (cite_text, case_cited.id):
        content = client.get(search_url, {"cites_to": cites_to}).json()
        returned_ids = {result['id'] for result in content['results']}
        assert returned_ids == expected_ids
def test_extract_citations(case_factory, elasticsearch):
    """Check which cites in a case's text end up as ExtractedCitation rows.

    A case dated 2000-01-01 is given text that mixes cites expected to be
    extracted and cites expected to be ignored, and its body cache is synced.
    The stored raw cites and ``rdb_normalized_cite`` values must then equal
    the expected sets, and every stored cite must point back to that case via
    ``cited_by_id``.
    """
    # Each spec is either a cite string that is stored as-is, or a
    # [text in case body, expected normalized source] pair.
    cite_specs = [
        "225 F. Supp. 552",  # correct
        ["1 F Supp 1", "1 F. Supp. 1"],  # normalized
        ["2 F.-'Supp.- 2", "2 F. Supp. 2"],  # extra cruft matched by get_cite_extractor
        ["125 Yt. 152", "125 Vt. 152"],  # custom reporters added by patch_reporters_db
        ["125 Burnett (Wis.) 152", "125 Bur. 152"],  # normalized
        ["1 F. 2d 2", "1 F.2d 2"],  # not matched as "1 F. 2"
        "2000 WL 12345",  # vendor cite
        ["3 F Supp, at 3", "3 F. Supp. 3"],  # short cite
    ]

    # Split the specs into the text placed in the case and the set of
    # normalized cites expected afterwards.
    legitimate_cites = []
    legitimate_cites_normalized = set()
    for spec in cite_specs:
        body_text, canonical = (spec, spec) if isinstance(spec, str) else spec
        legitimate_cites.append(body_text)
        legitimate_cites_normalized.add(normalize_cite(canonical))

    illegitimate_cites = [
        "2 Dogs 3",  # unrecognized reporter
        "3 Dogs 4",  # duplicate unrecognized reporter
        "1 or 2",  # not matched as 1 Or. 2
        "125 f. supp.-' 152",  # limit on how much cruft matched by get_cite_extractor -- only 2 at end
    ]

    case = case_factory(decision_date=datetime(2000, 1, 1))
    case_text = ", some text, ".join(legitimate_cites + illegitimate_cites)
    set_case_text(case, case_text)
    case.sync_case_body_cache()
    update_elasticsearch_from_queue()

    # check extracted cites
    extracted = list(ExtractedCitation.objects.all())
    found_cites = {row.cite for row in extracted}
    found_normalized = {row.rdb_normalized_cite for row in extracted}
    assert found_cites == set(legitimate_cites)
    assert found_normalized == legitimate_cites_normalized
    assert all(row.cited_by_id == case.pk for row in extracted)
check_timestamps_unchanged(case, timestamp) # deleting obj.delete() timestamp = check_timestamps_changed(case, timestamp) # updating outbound references for obj, change_field, no_change_field in ( (case.reporter, 'full_name', 'notes'), (case.court, 'name', 'none'), (case.volume, 'volume_number', 'publisher'), (case.jurisdiction, 'name', 'none'), ): # updating tracked field setattr(obj, change_field, 'foo') obj.save() timestamp = check_timestamps_changed(case, timestamp) # updated untracked field setattr(obj, no_change_field, 'foo') obj.save() check_timestamps_unchanged(case, timestamp) # case gets removed when in_scope changes update_elasticsearch_from_queue() CaseDocument.get(case.pk) case.duplicative = True case.save() update_elasticsearch_from_queue() CaseDocument.get(case.pk)