def get_judge(html, case_path=None):
    """Work out which judge a case document names.

    The text nodes selected by an XPath query over ``html`` are visited in
    order. Each one is passed through ``clean_string`` and then
    ``get_judge_from_str``. The scan ends at the first node that yields a
    judge, or as soon as a node is rejected with the reason ``'TOO_LONG'``.

    When the scan finds nothing, ``fixes[case_path]['judge']`` is used if
    that entry exists. If it does not, the flags in ``DEBUG`` decide what
    happens next: ``'input_judge'`` opens the case in Firefox, prompts the
    operator for a value and stores the answer with ``add_fix``;
    ``'log_bad_judges'`` appends ``case_path`` to ``missing_judges.txt``.

    :param html: object exposing an ``xpath()`` method (presumably a parsed
        lxml tree -- TODO confirm against callers).
    :param case_path: path of the case file; used as the key into ``fixes``,
        in the ``file://`` URL handed to Firefox, and in the log file.
    :return: the judge that was identified, or a falsy value (``None`` by
        default) when none could be determined.
    """
    query = ('//p[position() <= 60]//text()'
             '[not(parent::span)][not(ancestor::center)][not(ancestor::i)]')

    judge = None
    for node in html.xpath(query):
        judge, reason = get_judge_from_str(clean_string(node))
        # The first hit wins. A 'TOO_LONG' rejection also ends the scan; the
        # original author's note here reads "We've begun doing paragraphs...".
        if judge or reason == 'TOO_LONG':
            break

    if not judge:
        try:
            judge = fixes[case_path]['judge']
        except KeyError:
            # NOTE(review): the source this was recovered from had lost its
            # indentation; both DEBUG branches are assumed to belong to this
            # handler -- confirm against version control.
            if 'input_judge' in DEBUG:
                # communicate() blocks until the browser process exits, so
                # the prompt only appears after the operator closes it.
                viewer = subprocess.Popen(
                    ['firefox', 'file://%s' % case_path], shell=False)
                viewer.communicate()
                judge = raw_input("No judge identified! What should be here? ")
                add_fix(case_path, {'judge': judge})
            if 'log_bad_judges' in DEBUG:
                with open('missing_judges.txt', 'a') as missing_log:
                    missing_log.write('%s\n' % case_path)

    if 'judge' in DEBUG:
        log_print(' Judge: %s' % judge)

    return judge
def get_judge(html, case_path=None):
    """Work out which judge a case document names.

    The text nodes selected by an XPath query over ``html`` are visited in
    order. Each one is passed through ``clean_string`` and then
    ``get_judge_from_str``. The scan ends at the first node that yields a
    judge, or as soon as a node is rejected with the reason ``'TOO_LONG'``.

    When the scan finds nothing, ``fixes[case_path]['judge']`` is used if
    that entry exists. If it does not, the flags in ``DEBUG`` decide what
    happens next: ``'input_judge'`` opens the case in Firefox, prompts the
    operator for a value and stores the answer with ``add_fix``;
    ``'log_bad_judges'`` appends ``case_path`` to ``missing_judges.txt``.

    :param html: object exposing an ``xpath()`` method (presumably a parsed
        lxml tree -- TODO confirm against callers).
    :param case_path: path of the case file; used as the key into ``fixes``,
        in the ``file://`` URL handed to Firefox, and in the log file.
    :return: the judge that was identified, or a falsy value (``None`` by
        default) when none could be determined.
    """
    query = ('//p[position() <= 60]//text()'
             '[not(parent::span)][not(ancestor::center)][not(ancestor::i)]')

    judge = None
    for node in html.xpath(query):
        judge, reason = get_judge_from_str(clean_string(node))
        # The first hit wins. A 'TOO_LONG' rejection also ends the scan; the
        # original author's note here reads "We've begun doing paragraphs...".
        if judge or reason == 'TOO_LONG':
            break

    if not judge:
        try:
            judge = fixes[case_path]['judge']
        except KeyError:
            # NOTE(review): the source this was recovered from had lost its
            # indentation; both DEBUG branches are assumed to belong to this
            # handler -- confirm against version control.
            if 'input_judge' in DEBUG:
                # communicate() blocks until the browser process exits, so
                # the prompt only appears after the operator closes it.
                viewer = subprocess.Popen(
                    ['firefox', 'file://%s' % case_path], shell=False)
                viewer.communicate()
                judge = raw_input("No judge identified! What should be here? ")
                add_fix(case_path, {'judge': judge})
            if 'log_bad_judges' in DEBUG:
                with open('missing_judges.txt', 'a') as missing_log:
                    missing_log.write('%s\n' % case_path)

    if 'judge' in DEBUG:
        log_print(' Judge: %s' % judge)

    return judge
def test_extracting_judges_from_strings_lawbox(self):
    # Table-driven check of get_judge_from_str. Every entry pairs an input
    # string with the (judge, reason) tuple the function is expected to
    # return for it; the reason values are looked up in REASONS.
    cases = (
        ('The following is the order of Judge Brailsford',
         (u'Brailsford', REASONS[12])),
        ('Before INGRAHAM, Circuit Judge, and SEALS and COWAN, District '
         'Judges.',
         (u'Ingraham, Circuit Judge, and Seals and Cowan, District Judges',
          REASONS[14])),
        ('J. H. Reddy, Chattanooga, Tenn., James F. Neal, John J. Hooker, '
         'Sr., Special Atty., Nashville, Tenn., Charles W. Shaffer, Jr., '
         'Dept. of Justice, Washington, D. C., for the United States.',
         (None, REASONS[10])),
        ('MR. JUSTICE CLARK delivered the opinion of the Court.',
         (u'Clark', REASONS[5])),
        ('Justice THEIS delivered the judgment of the court, with opinion.',
         (u'Theis', REASONS[5])),
        ('Kennedy, J., announced the judgment of the Court and delivered '
         'the opinion of the Court, except...',
         (u'Kennedy', REASONS[5])),
        ('Kendy, J., announced the judgment of the Court ',
         (u'Kendy', REASONS[5])),
        # Has a judiciary word, but not at the end.
        ('U.S.C. 22, JUSTICE Eats Apples',
         (None, REASONS[3])),
        ('PER CURIAM',
         (u'Per Curiam', REASONS[6])),
        ('Per Curiam',
         (u'Per Curiam', REASONS[6])),
        ('L. CHANDLER WATSON, Jr., Bankruptcy Judge.',
         (u'L. Chandler Watson, Jr.', REASONS[7])),
        ('VOLINN, Bankruptcy Judge:',
         (u'Volinn', REASONS[7])),
        ('McGOVERN, District Judge.',
         (u'McGovern', REASONS[7])),
        ('JOHN TeSELLE, Bankruptcy Judge.',
         (u'John Teselle', REASONS[7])),
        ('LEAPHART, Justice',
         (u'Leaphart', REASONS[7])),
        ('SIMPSON, C.J.',
         (u'Simpson', REASONS[7])),
        ('LANSING, Judge.',
         (u'Lansing', REASONS[7])),
        ('BRAUN, PLAINTIFF, Kendrick, Finkbeiner, Schafer & Murphy (by '
         'Michael J. W. Horn), for defendants.',
         (None, REASONS[4])),
        ('OPINION BY MR. JUSTICE JONES, May 25953',
         (u'Mr. Justice Jones', REASONS[8])),
        ('Opinion by Justice ROSS',
         (u'Ross', REASONS[8])),
        ('SPENCE, J.',
         (u'Spence', REASONS[7])),
        ('Spencer, J.,',
         (u'Spencer', REASONS[7])),
        ('SPENCE',
         (None, REASONS[9])),
        ('Nourse, P. J.',
         (u'Nourse', REASONS[7])),
        ('A. SPENCE, J.',
         (u'A. Spence', REASONS[7])),
        ('Van SICKLE, District Judge.',
         (u'Van Sickle', REASONS[7])),
        ('VanSICKLE, District Judge.',
         (u'Vansickle', REASONS[7])),
        ('LeGRAND, Justice.',
         (u'Legrand', REASONS[7])),
        ('DAVID R. STRAWBRIDGE; United States Magistrate Judge.',
         (u'David R. Strawbridge', REASONS[7])),
        ('CARRICO, J., delivered the opinion of the court.',
         (u'Carrico', REASONS[5])),
        ('Justice HARTMAN delivered the opinion of the court',
         (u'Hartman', REASONS[5])),
        ('Justice APPLETON delivered the opinion of the court',
         (u'Appleton', REASONS[5])),
        ('The opinion of the Court was delivered by HANDLER, J.',
         (u'Handler', REASONS[11])),
        ('The following is the order of Judge Brailsford',
         (u'Brailsford', REASONS[12])),
        ('Before: NEFF, P.J., and MICHAEL J. KELLY and HOOD, JJ.',
         (u'Neff, P.J., and Michael J. Kelly and Hood', REASONS[14])),
        ('Chief Judge FULD',
         (u'Fuld', REASONS[15])),
        ('FOTH, C.',
         (u'Foth', REASONS[7])),
        ('Robert L. KRECHEVSKY, Bankruptcy Judge.',
         (u'Robert L. Krechevsky', REASONS[7])),
        ('Ernstrom & Dreste, Rochester, NY (J. William Ernstrom, of '
         'counsel), for Northland Associates, Inc.',
         (None, REASONS[10])),
        # memorandum looks like a bad_word, but it's not
        ('BREITEL and Judge JASEN, GABRIELLI, JONES, WACHTLER and COOKE '
         'Concur in Memorandum',
         (u'Breitel and Judge Jasen, Gabrielli, Jones, Wachtler and Cooke '
          u'Concur in Memorandum', REASONS[16])),
        # but if it starts with Memorandum, it's no good.
        ('Memorandum of Decision on R.C. Allen Instruments',
         (None, REASONS[10])),
        ('CONCLUDING That the Aggravating Circumstances Outweighed the '
         'Mitigating Circumstances.',
         (None, REASONS[4])),
        ('Considering Factor (A), "The Ultimate and Decisive Test," We '
         'Examine Factors (E), (F) and (H)',
         (None, REASONS[10])),
        ('Decision Denying Application to Retain Rebecca J. Habbert',
         (None, REASONS[10])),
        ("Accepting Appellant's Pleas of Guilty, the Record Reflects the "
         "Following Occurred:",
         (None, REASONS[10])),
        ('ADDRESSING Ourselves to the Substance of These Questions We '
         'Think It Appropriate',
         (None, REASONS[4])),
        ('ADMITTING a Statement as a Dying Declaration, the Trial Court '
         'Must Make a Preliminary',
         (None, REASONS[4])),
        ('AMENDED Findings of Fact',
         (None, REASONS[4])),
        ('AMICUS Curiae Brief Was Filed by Bruce A. Olsen',
         (None, REASONS[4])),
        ('LAWRENCE S. Robbins Argued the Cause for Appellants. With Him',
         (None, REASONS[4])),
        ('DISCUSSING These Cases We Must Separate Them According to The',
         (None, REASONS[4])),
        ('EXAMINING These and the Other Defenses Which Comdisco Has '
         'Raised, However',
         (None, REASONS[4])),
        ('GOING Into the Question of the Public',
         (None, REASONS[4])),
        # Going is bad, but foregoing is good
        ('JUDGE BLATCHFORD After Stating the Facts in the Foregoing '
         'Language',
         (u'Judge Blatchford', REASONS[7])),
        ('DECISION Granting Judgment to the Trustee in Bankruptcy for '
         'Comprehensive Business Systems',
         (None, REASONS[4])),
        ('THESE Arguments That Both Sides Would Be Allowed Wide Latitude '
         'in Arguing',
         (None, REASONS[4])),
        ('DECISION Denying Application to Retain Rebecca J. Habbert',
         (None, REASONS[4])),
        ('TRIAL, Appellant Argued That It Was a Third-Party Benefic',
         (None, REASONS[4])),
        ('FINDINGS of Fact and Conclusion of Law on Eastgroup',
         (None, REASONS[4])),
        ('PROCEEDING Further a General Description of the Area Will Be '
         'Helpful',
         (None, REASONS[4])),
        ('TURNING Them Over to His Counsel on the Morning of July 24',
         (None, REASONS[4])),
        # Starting with a number.
        ('1975, SECTION 594 Did Not Describe What Kind judge ',
         (None, REASONS[3])),
        # Starting with a regex special char
        ('("DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s Fees',
         (None, REASONS[3])),
        (':"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('>"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('["DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('{"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('}"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        # Starts with "The", but is a valid form
        ('The Cause Was Argued Before Anderson',
         (u'Anderson', REASONS[17])),
        # Lowercase 'the' is no good, however
        ('the Water Heater Was Installed, the Slates, j.',
         (None, REASONS[18])),
        # Starting with "There " is no good, but "Theresa" is
        ('THERE is No Merit in the Claim of Improper Comment of the '
         'Commonwealth, J.',
         (None, REASONS[3])),
        ('THERESA CRAFT, J.',
         (u'Theresa Craft', REASONS[7])),
        # Nothing with utf-8 as first char is good.
        (u'\xe2\xa7\xe2\xa7 19-1-102(1), JUDGE',
         (None, REASONS[2])),
        # Argued Before is ok, but Argued is not.
        ('Argued before Lissner',
         (u'Lissner', REASONS[17])),
        ('ARGUED: amy louise howe, before so-and-so, Justice',
         (None, REASONS[3])),
    )
    for text, expected in cases:
        # Normalise the result to a tuple so it compares equal to the
        # expected (judge, reason) pair regardless of the sequence type
        # get_judge_from_str returns.
        actual = tuple(get_judge_from_str(text))
        self.assertEqual(actual, expected)
def test_extracting_judges_from_strings_lawbox(self):
    # Table-driven check of get_judge_from_str. Every entry pairs an input
    # string with the (judge, reason) tuple the function is expected to
    # return for it; the reason values are looked up in REASONS.
    cases = (
        ('The following is the order of Judge Brailsford',
         (u'Brailsford', REASONS[12])),
        ('Before INGRAHAM, Circuit Judge, and SEALS and COWAN, District '
         'Judges.',
         (u'Ingraham, Circuit Judge, and Seals and Cowan, District Judges',
          REASONS[14])),
        ('J. H. Reddy, Chattanooga, Tenn., James F. Neal, John J. Hooker, '
         'Sr., Special Atty., Nashville, Tenn., Charles W. Shaffer, Jr., '
         'Dept. of Justice, Washington, D. C., for the United States.',
         (None, REASONS[10])),
        ('MR. JUSTICE CLARK delivered the opinion of the Court.',
         (u'Clark', REASONS[5])),
        ('Justice THEIS delivered the judgment of the court, with opinion.',
         (u'Theis', REASONS[5])),
        ('Kennedy, J., announced the judgment of the Court and delivered '
         'the opinion of the Court, except...',
         (u'Kennedy', REASONS[5])),
        ('Kendy, J., announced the judgment of the Court ',
         (u'Kendy', REASONS[5])),
        # Has a judiciary word, but not at the end.
        ('U.S.C. 22, JUSTICE Eats Apples',
         (None, REASONS[3])),
        ('PER CURIAM',
         (u'Per Curiam', REASONS[6])),
        ('Per Curiam',
         (u'Per Curiam', REASONS[6])),
        ('L. CHANDLER WATSON, Jr., Bankruptcy Judge.',
         (u'L. Chandler Watson, Jr.', REASONS[7])),
        ('VOLINN, Bankruptcy Judge:',
         (u'Volinn', REASONS[7])),
        ('McGOVERN, District Judge.',
         (u'McGovern', REASONS[7])),
        ('JOHN TeSELLE, Bankruptcy Judge.',
         (u'John Teselle', REASONS[7])),
        ('LEAPHART, Justice',
         (u'Leaphart', REASONS[7])),
        ('SIMPSON, C.J.',
         (u'Simpson', REASONS[7])),
        ('LANSING, Judge.',
         (u'Lansing', REASONS[7])),
        ('BRAUN, PLAINTIFF, Kendrick, Finkbeiner, Schafer & Murphy (by '
         'Michael J. W. Horn), for defendants.',
         (None, REASONS[4])),
        ('OPINION BY MR. JUSTICE JONES, May 25953',
         (u'Mr. Justice Jones', REASONS[8])),
        ('Opinion by Justice ROSS',
         (u'Ross', REASONS[8])),
        ('SPENCE, J.',
         (u'Spence', REASONS[7])),
        ('Spencer, J.,',
         (u'Spencer', REASONS[7])),
        ('SPENCE',
         (None, REASONS[9])),
        ('Nourse, P. J.',
         (u'Nourse', REASONS[7])),
        ('A. SPENCE, J.',
         (u'A. Spence', REASONS[7])),
        ('Van SICKLE, District Judge.',
         (u'Van Sickle', REASONS[7])),
        ('VanSICKLE, District Judge.',
         (u'Vansickle', REASONS[7])),
        ('LeGRAND, Justice.',
         (u'Legrand', REASONS[7])),
        ('DAVID R. STRAWBRIDGE; United States Magistrate Judge.',
         (u'David R. Strawbridge', REASONS[7])),
        ('CARRICO, J., delivered the opinion of the court.',
         (u'Carrico', REASONS[5])),
        ('Justice HARTMAN delivered the opinion of the court',
         (u'Hartman', REASONS[5])),
        ('Justice APPLETON delivered the opinion of the court',
         (u'Appleton', REASONS[5])),
        ('The opinion of the Court was delivered by HANDLER, J.',
         (u'Handler', REASONS[11])),
        ('The following is the order of Judge Brailsford',
         (u'Brailsford', REASONS[12])),
        ('Before: NEFF, P.J., and MICHAEL J. KELLY and HOOD, JJ.',
         (u'Neff, P.J., and Michael J. Kelly and Hood', REASONS[14])),
        ('Chief Judge FULD',
         (u'Fuld', REASONS[15])),
        ('FOTH, C.',
         (u'Foth', REASONS[7])),
        ('Robert L. KRECHEVSKY, Bankruptcy Judge.',
         (u'Robert L. Krechevsky', REASONS[7])),
        ('Ernstrom & Dreste, Rochester, NY (J. William Ernstrom, of '
         'counsel), for Northland Associates, Inc.',
         (None, REASONS[10])),
        # memorandum looks like a bad_word, but it's not
        ('BREITEL and Judge JASEN, GABRIELLI, JONES, WACHTLER and COOKE '
         'Concur in Memorandum',
         (u'Breitel and Judge Jasen, Gabrielli, Jones, Wachtler and Cooke '
          u'Concur in Memorandum', REASONS[16])),
        # but if it starts with Memorandum, it's no good.
        ('Memorandum of Decision on R.C. Allen Instruments',
         (None, REASONS[10])),
        ('CONCLUDING That the Aggravating Circumstances Outweighed the '
         'Mitigating Circumstances.',
         (None, REASONS[4])),
        ('Considering Factor (A), "The Ultimate and Decisive Test," We '
         'Examine Factors (E), (F) and (H)',
         (None, REASONS[10])),
        ('Decision Denying Application to Retain Rebecca J. Habbert',
         (None, REASONS[10])),
        ("Accepting Appellant's Pleas of Guilty, the Record Reflects the "
         "Following Occurred:",
         (None, REASONS[10])),
        ('ADDRESSING Ourselves to the Substance of These Questions We '
         'Think It Appropriate',
         (None, REASONS[4])),
        ('ADMITTING a Statement as a Dying Declaration, the Trial Court '
         'Must Make a Preliminary',
         (None, REASONS[4])),
        ('AMENDED Findings of Fact',
         (None, REASONS[4])),
        ('AMICUS Curiae Brief Was Filed by Bruce A. Olsen',
         (None, REASONS[4])),
        ('LAWRENCE S. Robbins Argued the Cause for Appellants. With Him',
         (None, REASONS[4])),
        ('DISCUSSING These Cases We Must Separate Them According to The',
         (None, REASONS[4])),
        ('EXAMINING These and the Other Defenses Which Comdisco Has '
         'Raised, However',
         (None, REASONS[4])),
        ('GOING Into the Question of the Public',
         (None, REASONS[4])),
        # Going is bad, but foregoing is good
        ('JUDGE BLATCHFORD After Stating the Facts in the Foregoing '
         'Language',
         (u'Judge Blatchford', REASONS[7])),
        ('DECISION Granting Judgment to the Trustee in Bankruptcy for '
         'Comprehensive Business Systems',
         (None, REASONS[4])),
        ('THESE Arguments That Both Sides Would Be Allowed Wide Latitude '
         'in Arguing',
         (None, REASONS[4])),
        ('DECISION Denying Application to Retain Rebecca J. Habbert',
         (None, REASONS[4])),
        ('TRIAL, Appellant Argued That It Was a Third-Party Benefic',
         (None, REASONS[4])),
        ('FINDINGS of Fact and Conclusion of Law on Eastgroup',
         (None, REASONS[4])),
        ('PROCEEDING Further a General Description of the Area Will Be '
         'Helpful',
         (None, REASONS[4])),
        ('TURNING Them Over to His Counsel on the Morning of July 24',
         (None, REASONS[4])),
        # Starting with a number.
        ('1975, SECTION 594 Did Not Describe What Kind judge ',
         (None, REASONS[3])),
        # Starting with a regex special char
        ('("DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s Fees',
         (None, REASONS[3])),
        (':"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('>"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('["DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('{"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        ('}"DGCL") SEEKING judge Advancement of Reasonable Attorney\'s '
         'Fees',
         (None, REASONS[3])),
        # Starts with "The", but is a valid form
        ('The Cause Was Argued Before Anderson',
         (u'Anderson', REASONS[17])),
        # Lowercase 'the' is no good, however
        ('the Water Heater Was Installed, the Slates, j.',
         (None, REASONS[18])),
        # Starting with "There " is no good, but "Theresa" is
        ('THERE is No Merit in the Claim of Improper Comment of the '
         'Commonwealth, J.',
         (None, REASONS[3])),
        ('THERESA CRAFT, J.',
         (u'Theresa Craft', REASONS[7])),
        # Nothing with utf-8 as first char is good.
        (u'\xe2\xa7\xe2\xa7 19-1-102(1), JUDGE',
         (None, REASONS[2])),
        # Argued Before is ok, but Argued is not.
        ('Argued before Lissner',
         (u'Lissner', REASONS[17])),
        ('ARGUED: amy louise howe, before so-and-so, Justice',
         (None, REASONS[3])),
    )
    for text, expected in cases:
        # Normalise the result to a tuple so it compares equal to the
        # expected (judge, reason) pair regardless of the sequence type
        # get_judge_from_str returns.
        actual = tuple(get_judge_from_str(text))
        self.assertEqual(actual, expected)