def test_parse_plea_type_nolo_contendere(self):
    plea1 = 'PLEA OF NOLO CONTENDERE'
    plea2 = "DEFENDANT ENTERED PLEA OF : NOLO-CONTENDERE SEQ 2"
    plea3 = "DEFENDANT ENTERED PLEA OF NOLO CONTENDERE SEQ: 1,2,3,4,5"
    assert ScraperUtils.parse_plea_type(plea1) == 'Nolo Contendere'
    assert ScraperUtils.parse_plea_type(plea2) == 'Nolo Contendere'
    assert ScraperUtils.parse_plea_type(plea3) == 'Nolo Contendere'
def test_parse_plea_type_not_guilty(self):
    plea1 = 'PLEA OF NOT GUILTY'
    plea2 = "PLEA OF NOT GUILTY/DENIAL, WAIVER OF ARRAIGNMENT, DEMAND FOR NOTICE OF EXPERT TESTIMONY, DEMAND FOR DISCOVERY, DEMAND FOR STATEMENT OF PARTICULARS, DEMAND FOR JURY TRIAL, DESIGNATION OF E-MAIL ADDRESSES PURSUANT TO RULE 2.516 1/28/2020"
    plea3 = 'EP - NOTICE OF APPEARANCE, AND ENTRY OF CONDITIONAL PLEA OF NOT GUILTY AND DEMAND FOR JURY TRIAL'
    assert ScraperUtils.parse_plea_type(plea1) == 'Not Guilty'
    assert ScraperUtils.parse_plea_type(plea2) == 'Not Guilty'
    assert ScraperUtils.parse_plea_type(plea3) == 'Not Guilty'
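# The tests above (and test_parse_plea_type_guilty / test_parse_plea_type_blank
# below) pin down parse_plea_type()'s behaviour. A minimal sketch of an
# implementation that would satisfy them, checking the most specific keyword
# first so 'NOT GUILTY' is never misread as 'GUILTY'. Illustrative only, not
# the actual ScraperUtils implementation.
def parse_plea_type_sketch(plea_text):
    if not plea_text:
        return None
    text = plea_text.upper()
    # Dockets write this plea both hyphenated and unhyphenated.
    if 'NOLO CONTENDERE' in text or 'NOLO-CONTENDERE' in text:
        return 'Nolo Contendere'
    if 'NOT GUILTY' in text:
        return 'Not Guilty'
    if 'GUILTY' in text:
        return 'Guilty'
    return None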
def test_parse_defense_attorneys_invalid(self):
    invalid_test = ['', '', '']
    invalid_test2 = []
    invalid_test3 = None
    assert ScraperUtils.parse_attorneys(invalid_test) is None
    assert ScraperUtils.parse_attorneys(invalid_test2) is None
    assert ScraperUtils.parse_attorneys(invalid_test3) is None
def test_parse_charge_statute_incomplete(self):
    charge1 = '(32234 2a)'
    charge2 = 'DRIVING WHILE LICENSE SUSPENDED OR REVOKED'
    assert ScraperUtils.parse_charge_statute(charge1) == (None, '32234 2a')
    assert ScraperUtils.parse_charge_statute(charge2) == (charge2, None)
    assert ScraperUtils.parse_charge_statute(' ') == (None, None)
    assert ScraperUtils.parse_charge_statute(None) == (None, None)
def test_parse_charge_statute(self):
    charge1 = ' FLEEING OR ATTEMPTING TO ELUDE (HIGH SPEED RECKLESS) (3161935 3) '
    charge2 = 'DRIVING WHILE LICENSE SUSPENDED OR REVOKED (32234 2a)'
    charge3 = ' FELON IN POSSESSION OF AMMUNITION (ACTUAL POSSESSION) (79023) '
    charge4 = 'FAIL TO DISPLAY REGISTRATION - POSSESSION REQUIRED (320.0605(1)) '
    assert ScraperUtils.parse_charge_statute(charge1) == (
        'FLEEING OR ATTEMPTING TO ELUDE (HIGH SPEED RECKLESS)', '3161935 3')
    assert ScraperUtils.parse_charge_statute(charge2) == (
        'DRIVING WHILE LICENSE SUSPENDED OR REVOKED', '32234 2a')
    assert ScraperUtils.parse_charge_statute(charge3) == (
        'FELON IN POSSESSION OF AMMUNITION (ACTUAL POSSESSION)', '79023')
    assert ScraperUtils.parse_charge_statute(charge4) == (
        'FAIL TO DISPLAY REGISTRATION - POSSESSION REQUIRED', '320.0605(1)')
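# The assertions above define parse_charge_statute()'s contract: the statute is
# the final balanced parenthesised group, everything before it is the
# description, and either part may be missing. A minimal sketch of one way to
# meet that contract (note charge4, where the statute itself contains nested
# parentheses); illustrative only, not the actual ScraperUtils implementation.
def parse_charge_statute_sketch(charge):
    if not charge or not charge.strip():
        return (None, None)
    text = charge.strip()
    description, statute = text, None
    if text.endswith(')'):
        # Walk backwards to find the '(' that balances the final ')', so
        # nested statutes like '320.0605(1)' are kept intact.
        depth = 0
        for i in range(len(text) - 1, -1, -1):
            if text[i] == ')':
                depth += 1
            elif text[i] == '(':
                depth -= 1
                if depth == 0:
                    statute = text[i + 1:-1].strip()
                    description = text[:i].strip()
                    break
    return (description or None, statute or None)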
def test_parse_defense_attorneys(self):
    attorneys1 = [
        'DEFENSE ATTORNEY: DOE, JANE EMILY ASSIGNED',
        'DEFENSE ATTORNEY: DOE, JOHN MICHAEL ASSIGNED',
        'DEFENSE ATTORNEY: SELF, SELF ASSIGNED'
    ]
    public_defenders1 = [
        'COURT APPOINTED ATTORNEY: DOE, JOHN MICHAEL ASSIGNED'
    ]
    assert ScraperUtils.parse_attorneys(attorneys1) == [
        'DOE, JANE EMILY', 'DOE, JOHN MICHAEL', 'SELF, SELF'
    ]
    assert ScraperUtils.parse_attorneys(public_defenders1) == [
        'DOE, JOHN MICHAEL'
    ]
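# A minimal sketch of a parse_attorneys() that satisfies both the valid cases
# above and test_parse_defense_attorneys_invalid: pull the name sitting between
# 'ATTORNEY:' and the trailing 'ASSIGNED', and collapse empty results to None.
# Illustrative only, not the actual ScraperUtils implementation.
def parse_attorneys_sketch(docket_lines):
    import re
    if not docket_lines:
        return None
    names = []
    for line in docket_lines:
        match = re.search(r'ATTORNEY:\s*(.+?)\s+ASSIGNED', line or '')
        if match:
            names.append(match.group(1))
    return names or None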
def test_parse_out_path_valid(self):
    # Function should not affect valid length filenames and paths.
    normal_filename = 'document'
    parsed_path = ScraperUtils.parse_out_path('C:\\Example\\Path', normal_filename, 'pdf')
    assert parsed_path == os.path.join(
        'C:\\Example\\Path', '{}.{}'.format(normal_filename, 'pdf'))
def begin_scrape():
    """
    Starts the scraping process. Continues from the last scraped record if
    the scraper was stopped before.
    :return:
    """
    global driver

    # Find the progress of any past scraping runs, to continue from there.
    try:
        last_case_number = ScraperUtils.get_last_csv_row(output_file).split(',')[3]
        print("Continuing from last scrape (Case number: {})".format(last_case_number))
        last_year = 2000 + int(str(last_case_number)[:2])
        # I know there's faster ways of doing this. It only runs once ;)
        if not last_case_number.isnumeric():
            last_case_number = last_case_number[:-4]
        last_case = int(str(last_case_number)[-6:])
        settings['end-year'] = last_year
        continuing = True
    except FileNotFoundError:
        # No existing scraping CSV
        continuing = False

    # Scrape from the most recent year to the oldest.
    for year in range(settings['end-year'], settings['start-year'], -1):
        if continuing:
            N = last_case + 1
        else:
            N = 1
        print("Scraping year {} from case {}".format(year, N))
        YY = year % 100
        record_missing_count = 0
        # Increment case numbers until the missing-case threshold is met, then advance to the next year.
        while record_missing_count < settings['missing-thresh']:
            # Generate the case number to scrape
            case_number = f'{YY:02}' + f'{N:06}'
            search_result = search_portal(case_number)
            if search_result:
                record_missing_count = 0
                # If multiple associated cases are found, scrape all of them.
                if len(search_result) > 1:
                    for case in search_result:
                        search_portal(case)
                        scrape_record(case)
                # Only a single case, no multiple associated cases found.
                else:
                    scrape_record(case_number)
            else:
                record_missing_count += 1
            N += 1
        continuing = False
        print("Scraping for year {} is complete".format(year))
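# begin_scrape() leans on ScraperUtils.get_last_csv_row() to resume an
# interrupted run. A minimal sketch of that helper, under the assumption that
# it returns the last non-empty line of the output CSV and lets
# FileNotFoundError propagate when no CSV exists yet; illustrative only, not
# the actual ScraperUtils implementation.
def get_last_csv_row_sketch(csv_path):
    last_row = ''
    with open(csv_path, 'r') as f:
        for line in f:
            if line.strip():
                last_row = line.strip()
    return last_row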
def test_parse_out_path_shortening(self):
    # 260 characters long before the extension. This is an invalid filename in Windows.
    filename_too_long = '01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789'
    parsed_path = ScraperUtils.parse_out_path(os.getcwd(), filename_too_long, 'txt')
    try:
        # Close the handle before removing, otherwise os.remove() fails on Windows.
        with open(parsed_path, 'w'):
            pass
        os.remove(parsed_path)
    except OSError:
        pytest.fail('parse_out_path() generates an invalid file path.')
def test_parse_plea_case_numbers_no_charge_numbers(self):
    plea = "DEFENDANT ENTERED PLEA OF NOLO CONTENDERE SEQ: 1,2,3,4,5"
    assert ScraperUtils.parse_plea_case_numbers(plea, []) == []
def test_parse_out_path_illegal_characters(self):
    filename_invalid_chars = 't<>:e"/s\\t|?n*ame'
    assert ScraperUtils.parse_out_path('', filename_invalid_chars, 'pdf') == os.path.join(
        '', 'testname.pdf')
def test_parse_plea_case_numbers_blank(self):
    assert ScraperUtils.parse_plea_case_numbers("", [1, 2, 3]) == []
def test_filename_blank(self):
    filename = None
    parsed_path = ScraperUtils.parse_out_path('C:\\Example\\Path', filename, 'pdf')
    assert parsed_path == os.path.join('C:\\Example\\Path', '.pdf')
def test_parse_name(self):
    name1 = "DOE, JANE EMILY"
    name2 = "DOE, JOHN"
    assert ScraperUtils.parse_name(name1) == ('JANE', 'EMILY', 'DOE')
    assert ScraperUtils.parse_name(name2) == ('JOHN', None, 'DOE')
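# A minimal sketch of a parse_name() consistent with this test and
# test_parse_name_error below: split 'LAST, FIRST MIDDLE' apart and return
# None for any missing part. Illustrative only, not the actual ScraperUtils
# implementation.
def parse_name_sketch(full_name):
    if not full_name or ',' not in full_name:
        return (None, None, None)
    last, _, rest = full_name.partition(',')
    parts = rest.split()
    first = parts[0] if parts else None
    middle = ' '.join(parts[1:]) or None
    return (first, middle, last.strip())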
def test_parse_plea_type_guilty(self):
    plea1 = 'PLEA OF GUILTY'
    plea2 = 'DEFENDANT ENTERED PLEA OF : GUILTY SEQ 2'
    assert ScraperUtils.parse_plea_type(plea1) == 'Guilty'
    assert ScraperUtils.parse_plea_type(plea2) == 'Guilty'
def test_parse_name_error(self):
    name1 = None
    name2 = ''
    assert ScraperUtils.parse_name(name1) == (None, None, None)
    assert ScraperUtils.parse_name(name2) == (None, None, None)
def search_portal(case_number):
    """
    Performs a search of the portal from its home page, including selecting
    the case number input, solving the captcha and pressing Search.
    Also handles the captcha being solved incorrectly.
    :param case_number: Case to search
    :return: A set of case number(s).
    """
    # Load portal search page
    load_page(f"{settings['portal-base']}/Home.aspx/Search", 'Search',
              settings['verbose'])
    # Give some time for the captcha to load, as it does not load instantly.
    time.sleep(0.8)
    # Select Case Number textbox and enter case number
    select_case_input()
    case_input = driver.find_element_by_id('caseNumber')
    case_input.click()
    case_input.send_keys(case_number)

    # Solve captcha if it is required
    solved_captcha = None
    try:
        # Get Captcha. This is kinda nasty, but if there's no Captcha, then
        # this will throw (which is a good thing in this case) and we can
        # move on with processing.
        captcha_image_elem = driver.find_element_by_xpath(
            '//*/img[@alt="Captcha"]')
        captcha_buffer = captcha_image_elem.screenshot_as_png
        if settings['solve-captchas']:
            solved_captcha = captcha_solver.solve_captcha(captcha_buffer)
            captcha_textbox = driver.find_element_by_xpath(
                '//*/input[@name="captcha"]')
            captcha_textbox.click()
            captcha_textbox.send_keys(solved_captcha.answer)
            # Do search
            search_button = driver.find_element_by_id('searchButton')
            search_button.click()
        else:
            print(f"Captcha encountered trying to view case ID {case_number}.")
            print("Please solve the captcha and click the search button to proceed.")
            while True:
                try:
                    WebDriverWait(driver, 6 * 60 * 60).until(
                        lambda x: case_number in driver.title)
                    print("continuing...")
                    break
                except TimeoutException:
                    print("still waiting for user to solve the captcha...")
    except NoSuchElementException:
        # No captcha on the page, continue.
        # Do search
        search_button = driver.find_element_by_id('searchButton')
        search_button.click()

    # If the title stays as 'Search': captcha solving failed.
    # If the title contains the case number or 'Search Results': captcha solving succeeded.
    # If a timeout occurs, retry 'connect-thresh' times.
    for i in range(settings['connect-thresh']):
        try:
            # Wait for page to load
            WebDriverWait(driver, 5).until(
                lambda x: 'Search' in driver.title or case_number in driver.title
                or 'Search Results:' in driver.title)
            # Page loaded
            if driver.title == 'Search':
                # Clicking search did not change the page. This could be
                # because of a failed captcha attempt.
                try:
                    # Check if 'Invalid Captcha' dialog is showing
                    driver.find_element_by_xpath(
                        '//div[@class="alert alert-error"]')
                    print("Captcha was solved incorrectly")
                    if settings['solve-captchas'] and solved_captcha:
                        solved_captcha.notify_incorrect()
                except NoSuchElementException:
                    pass
                # Clear cookies so a new captcha is presented upon refresh
                driver.delete_all_cookies()
                # Try solving the captcha again.
                return search_portal(case_number)
            elif 'Search Results: CaseNumber:' in driver.title:
                # Captcha solved correctly
                if settings['solve-captchas'] and solved_captcha:
                    solved_captcha.notify_correct()
                case_count = ScraperUtils.get_search_case_count(
                    driver, settings['county'])
                # Case number search found multiple cases.
                if case_count > 1:
                    return ScraperUtils.get_associated_cases(driver)
                # Case number search found no cases.
                else:
                    return set()
            elif case_number in driver.title:
                # Captcha solved correctly
                if settings['solve-captchas'] and solved_captcha:
                    solved_captcha.notify_correct()
                # Case number search found a single court case.
                return {case_number}
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError(
                    'Case page could not be loaded after {} attempts, or '
                    'unexpected page title: {}'.format(
                        settings['connect-thresh'], driver.title))
            else:
                return search_portal(case_number)
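# This version of search_portal() delegates counting results to
# ScraperUtils.get_search_case_count(). A minimal sketch of that helper, based
# on the inline table parsing in the variant below: it reads the number that
# follows 'CASES FOUND' in the results table. Illustrative only; the county
# argument is accepted but unused in this sketch.
def get_search_case_count_sketch(driver, county=None):
    case_detail_tbl = driver.find_element_by_tag_name('table').text.split('\n')
    case_count_idx = case_detail_tbl.index('CASES FOUND') + 1
    return int(case_detail_tbl[case_count_idx])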
def search_portal(case_number):
    """
    Performs a search of the portal from its home page, including selecting
    the case number input, solving the captcha and pressing Search.
    Also handles the captcha being solved incorrectly.
    :param case_number: Case to search
    :return: A set of case number(s).
    """
    # Load portal search page
    load_page(f"{settings['portal-base']}/Home.aspx/Search", 'Search',
              settings['verbose'])
    # Give some time for the captcha to load, as it does not load instantly.
    time.sleep(0.8)
    # Select Case Number textbox and enter case number
    select_case_input()
    case_input = driver.find_element_by_id('caseNumber')
    case_input.click()
    case_input.send_keys(case_number)

    if settings['solve-captchas']:
        # Solve captcha if it is required
        try:
            # Get Captcha
            captcha_image_elem = driver.find_element_by_xpath(
                '//*/img[@alt="Captcha"]')
            captcha_buffer = captcha_image_elem.screenshot_as_png
            captcha_answer = captcha_solver.solve_captcha(captcha_buffer)
            captcha_textbox = driver.find_element_by_xpath(
                '//*/input[@name="captcha"]')
            captcha_textbox.click()
            captcha_textbox.send_keys(captcha_answer)
        except NoSuchElementException:
            # No captcha on the page, continue.
            pass
        # Do search
        search_button = driver.find_element_by_id('searchButton')
        search_button.click()
    else:
        raise Exception(
            "Automated captcha solving is disabled by default. "
            "Please seek advice before using this feature.")

    # If the title stays as 'Search': captcha solving failed.
    # If the title contains the case number or 'Search Results': captcha solving succeeded.
    # If a timeout occurs, retry 'connect-thresh' times.
    for i in range(settings['connect-thresh']):
        try:
            # Wait for page to load
            WebDriverWait(driver, 5).until(
                lambda x: 'Search' in driver.title or case_number in driver.title
                or 'Search Results:' in driver.title)
            # Page loaded
            if driver.title == 'Search':
                # Clicking search did not change the page. This could be
                # because of a failed captcha attempt.
                try:
                    # Check if 'Invalid Captcha' dialog is showing
                    driver.find_element_by_xpath(
                        '//div[@class="alert alert-error"]')
                    print("Captcha was solved incorrectly")
                    captcha_solver.notify_last_captcha_fail()
                except NoSuchElementException:
                    pass
                # Clear cookies so a new captcha is presented upon refresh
                driver.delete_all_cookies()
                # Try solving the captcha again.
                return search_portal(case_number)
            elif 'Search Results: CaseNumber:' in driver.title:
                # Captcha solved correctly
                captcha_solver.notify_last_captcha_success()
                # Figure out the number of cases returned
                case_detail_tbl = driver.find_element_by_tag_name('table').text.split('\n')
                case_count_idx = case_detail_tbl.index('CASES FOUND') + 1
                case_count = int(case_detail_tbl[case_count_idx])
                # Case number search found multiple cases.
                if case_count > 1:
                    return ScraperUtils.get_associated_cases(driver)
                # Case number search found no cases.
                else:
                    return set()
            elif case_number in driver.title:
                # Captcha solved correctly
                captcha_solver.notify_last_captcha_success()
                # Case number search found a single court case.
                return {case_number}
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError(
                    'Case page could not be loaded after {} attempts, or '
                    'unexpected page title: {}'.format(
                        settings['connect-thresh'], driver.title))
            else:
                return search_portal(case_number)
def test_parse_plea_case_numbers_no_charge_mentioned(self):
    plea = "PLEA OF NOT GUILTY"
    assert ScraperUtils.parse_plea_case_numbers(plea, [1]) == []
def test_parse_plea_case_numbers_one_charge_mentioned(self):
    plea = "DEFENDANT ENTERED PLEA OF : NOLO-CONTENDERE SEQ 2"
    assert ScraperUtils.parse_plea_case_numbers(plea, [1, 2]) == [2]
def test_parse_out_path_correct_length(self):
    # 260 characters long before the extension. This is an invalid filename in Windows.
    filename_too_long = '01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789'
    parsed_path = ScraperUtils.parse_out_path(os.getcwd(), filename_too_long, 'txt')
    assert len(parsed_path) <= 256
def test_parse_out_path_filename_extension_shortening(self):
    # 252 characters long, but with the .pdf extension it becomes 256 characters long - one too many.
    filename = '012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901'
    parsed_filename = ScraperUtils.parse_out_path('', filename, 'pdf')
    assert parsed_filename == '{}.{}'.format(filename[:-1], 'pdf')
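# Taken together, the parse_out_path tests imply three behaviours: valid paths
# pass through untouched, Windows-illegal characters are stripped, and the
# filename is trimmed until the whole path fits a 255-character limit. A
# minimal sketch satisfying all of them; illustrative only, not the actual
# ScraperUtils implementation.
def parse_out_path_sketch(directory, filename, extension):
    import os
    import re
    # Strip characters Windows forbids in filenames.
    safe_name = re.sub(r'[<>:"/\\|?*]', '', filename or '')
    full_path = os.path.join(directory, '{}.{}'.format(safe_name, extension))
    # Trim the filename until the whole path fits the limit.
    while len(full_path) > 255 and safe_name:
        safe_name = safe_name[:-1]
        full_path = os.path.join(directory, '{}.{}'.format(safe_name, extension))
    return full_path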
def test_parse_plea_case_numbers_multiple_case_numbers(self):
    plea = "DEFENDANT ENTERED PLEA OF NOLO CONTENDERE SEQ: 1,2,3,4,5"
    assert ScraperUtils.parse_plea_case_numbers(
        plea, [1, 2, 3, 4, 5, 6]) == [1, 2, 3, 4, 5]
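# The plea-number tests only ever extract digits that follow a 'SEQ' marker;
# stray numbers like 'RULE 2.516 1/28/2020' in the messy docket tested below
# must be ignored, and extracted numbers are filtered against the known charge
# counts. A minimal sketch of a parse_plea_case_numbers() meeting those
# constraints; illustrative only, not the actual ScraperUtils implementation.
def parse_plea_case_numbers_sketch(plea_text, charge_counts):
    import re
    match = re.search(r'SEQ\s*:?\s*([\d,\s]+)', plea_text or '')
    if not match:
        return []
    counts = [int(n) for n in re.findall(r'\d+', match.group(1))]
    return [c for c in counts if c in charge_counts]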
def scrape_record(case_number):
    """
    Scrapes a record once the case has been opened.
    :param case_number: The current case's case number.
    """
    # Wait for court summary to load
    for i in range(settings['connect-thresh']):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, 'summaryAccordion')))
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError('Summary details did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    # Get relevant page content
    summary_table_col1 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[1]/dl/dd')
    summary_table_col2 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[2]/dl/dd')
    summary_table_col3 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[3]/dl/dd')

    # Wait for court dockets to load
    for i in range(settings['connect-thresh']):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, 'gridDocketsView')))
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError('Dockets did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    charges_table = driver.find_elements_by_xpath('//*[@id="gridCharges"]/tbody/tr')
    docket_public_defender = driver.find_elements_by_xpath(
        "//*[contains(text(), 'COURT APPOINTED ATTORNEY') and contains(text(), 'ASSIGNED')]")
    docket_attorney = driver.find_elements_by_xpath(
        "//*[contains(text(), 'DEFENSE') and contains(text(), 'ASSIGNED')]")
    docket_pleas = driver.find_elements_by_xpath("//*[contains(text(), 'PLEA OF')]")
    docket_attachments = driver.find_elements_by_class_name('casedocketimage')

    _id = str(uuid.uuid4())
    _state = settings['state-code']
    _county = settings['county']
    CaseNum = summary_table_col2[1].text.strip()
    AgencyReportNum = summary_table_col1[4].text.strip()
    ArrestDate = None  # Can't be found on this portal
    FilingDate = summary_table_col1[2].text.strip()
    OffenseDate = None  # Can't be found on this portal
    DivisionName = summary_table_col3[3].text.strip()
    CaseStatus = summary_table_col3[1].text.strip()

    if settings['collect-pii']:
        # Create list of assigned defense attorney(s)
        defense_attorney_text = list(map(lambda x: x.text, docket_attorney))
        DefenseAttorney = ScraperUtils.parse_attorneys(defense_attorney_text)
        # Create list of assigned public defenders / appointed attorneys
        public_defender_text = list(map(lambda x: x.text, docket_public_defender))
        PublicDefender = ScraperUtils.parse_attorneys(public_defender_text)
        # Get Judge
        Judge = summary_table_col1[0].text.strip()

        # Download docket attachments.
        # Todo(OscarVanL): This could be parallelized to speed up scraping if
        # save-attachments is set to 'all'.
        if settings['save-attachments']:
            for attachment_link in docket_attachments:
                attachment_text = attachment_link.find_element_by_xpath(
                    './../../td[3]').text.strip()
                if settings['save-attachments'] == 'filing':
                    if not ('CITATION FILED' in attachment_text
                            or 'CASE FILED' in attachment_text):
                        # Attachment is not a filing, don't download it.
                        continue
                ScraperUtils.save_attached_pdf(
                    driver, output_attachments,
                    '{}-{}'.format(case_number, attachment_text),
                    settings['portal-base'], attachment_link, 20,
                    settings['verbose'])
    else:
        DefenseAttorney = []
        PublicDefender = []
        Judge = None

    Charges = {}
    for charge in charges_table:
        charge_details = charge.find_elements_by_tag_name('td')
        count = int(charge_details[0].text.strip())
        long_desc = charge_details[1].text.strip()
        # Statute is contained within brackets
        if '(' in long_desc and ')' in long_desc:
            statute = long_desc[long_desc.find('(') + 1:long_desc.find(')')]
        else:
            statute = None
        description = long_desc.split('(')[0]
        level = charge_details[2].text.strip()
        degree = charge_details[3].text.strip()
        # plea = charge_details[4].text.strip()  # Plea is not filled out on this portal.
        disposition = charge_details[5].text.strip()
        disposition_date = charge_details[6].text.strip()
        offense_date = None  # Not shown on this portal
        citation_number = None  # Not shown on this portal
        Charges[count] = Charge(count, statute, description, level, degree,
                                disposition, disposition_date, offense_date,
                                citation_number, None, None)

    # Pleas are not in the 'plea' field, but instead in the dockets.
    for plea_element in docket_pleas:
        plea_text = plea_element.text.strip()
        plea = ScraperUtils.parse_plea_type(plea_text)
        plea_date = plea_element.find_element_by_xpath('./../td[2]').text.strip()
        plea_number = ScraperUtils.parse_plea_case_numbers(plea_text, list(Charges.keys()))

        # If no case number is specified in the plea, we assume it applies to
        # all charges in the trial.
        if len(plea_number) == 0:
            for charge in Charges.values():
                charge.plea = plea
                charge.plea_date = plea_date
        else:
            # Apply plea to relevant charge count(s).
            for count in plea_number:
                Charges[count].plea = plea
                Charges[count].plea_date = plea_date

    ArrestingOfficer = None  # Can't be found on this portal
    ArrestingOfficerBadgeNumber = None  # Can't be found on this portal

    profile_link = driver.find_element_by_xpath(
        "//table[@id='gridParties']/tbody/tr/*[contains(text(), 'DEFENDANT')]/../td[2]/div/a"
    ).get_attribute('href')
    # profile_link = driver.find_element_by_xpath('//*[@id="gridParties"]/tbody/tr[1]/td[2]/div[1]/a').get_attribute('href')
    load_page(profile_link, 'Party Details:', settings['verbose'])

    Suffix = None
    DOB = None  # This portal has DOB as N/A for every defendant
    Race = driver.find_element_by_xpath(
        '//*[@id="fd-table-2"]/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[7]/td[2]'
    ).text.strip()
    Sex = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[6]/td[2]'
    ).text.strip()

    FirstName = None
    MiddleName = None
    LastName = None
    PartyID = None

    # Only collect PII if configured
    if settings['collect-pii']:
        # Navigate to party profile
        full_name = driver.find_element_by_xpath(
            '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[1]/td[2]'
        ).text.strip()
        if ',' in full_name:
            name_split = full_name.split(',')[1].lstrip().split()
            FirstName = name_split[0]
            MiddleName = " ".join(name_split[1:])
            LastName = full_name.split(',')[0]
        else:
            # If there's no comma, it's a corporation name.
            FirstName = full_name
        # PartyID is a field within the portal system to uniquely identify defendants.
        PartyID = driver.find_element_by_xpath(
            '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[8]/td[2]'
        ).text.strip()

    record = Record(_id, _state, _county, case_number, CaseNum, AgencyReportNum,
                    PartyID, FirstName, MiddleName, LastName, Suffix, DOB, Race,
                    Sex, ArrestDate, FilingDate, OffenseDate, DivisionName,
                    CaseStatus, DefenseAttorney, PublicDefender, Judge,
                    list(Charges.values()), ArrestingOfficer,
                    ArrestingOfficerBadgeNumber)

    ScraperUtils.write_csv(output_file, record, settings['verbose'])
def test_parse_plea_case_numbers_messy(self):
    # Test a really ugly plea docket I found in one case
    plea = "PLEA OF NOT GUILTY/DENIAL, WAIVER OF ARRAIGNMENT, DEMAND FOR NOTICE OF EXPERT TESTIMONY, DEMAND FOR DISCOVERY, DEMAND FOR STATEMENT OF PARTICULARS, DEMAND FOR JURY TRIAL, DESIGNATION OF E-MAIL ADDRESSES PURSUANT TO RULE 2.516 1/28/2020"
    assert ScraperUtils.parse_plea_case_numbers(plea, [1, 2, 3]) == []
def test_parse_plea_type_blank(self):
    assert ScraperUtils.parse_plea_type('') is None
def scrape_record(case_number):
    """
    Scrapes a record once the case has been opened.
    :param case_number: The current case's case number.
    """
    # Wait for court summary to load
    for i in range(FLAGS.connect_thresh):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, 'summaryAccordion')))
        except TimeoutException:
            if i == FLAGS.connect_thresh - 1:
                raise RuntimeError(
                    'Summary details did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    # Get relevant page content
    summary_table_col1 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[1]/dl/dd')
    summary_table_col2 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[2]/dl/dd')
    summary_table_col3 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[3]/dl/dd')

    # Wait for court dockets to load
    for i in range(FLAGS.connect_thresh):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, 'gridDocketsView')))
        except TimeoutException:
            if i == FLAGS.connect_thresh - 1:
                raise RuntimeError(
                    'Dockets did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    charges_table = driver.find_elements_by_xpath(
        '//*[@id="gridCharges"]/tbody/tr')
    docket_public_defender = driver.find_elements_by_xpath(
        "//*[contains(text(), 'COURT APPOINTED ATTORNEY') and contains(text(), 'ASSIGNED')]")
    docket_attorney = driver.find_elements_by_xpath(
        "//*[contains(text(), 'DEFENSE') and contains(text(), 'ASSIGNED')]")
    docket_pleas = driver.find_elements_by_xpath(
        "//*[contains(text(), 'PLEA OF')]")
    docket_attachments = driver.find_elements_by_class_name('casedocketimage')

    r = BenchmarkRecordBuilder()
    r.id = str(uuid.uuid4())
    r.state = FLAGS.state
    r.county = FLAGS.county
    r.portal_id = case_number
    r.case_num = Pii.String(summary_table_col2[1].text.strip())
    r.agency_report_num = summary_table_col1[4].text.strip()
    r.arrest_date = None  # Can't be found on this portal
    r.filing_date = summary_table_col1[2].text.strip()
    r.offense_date = None  # Can't be found on this portal
    r.division_name = summary_table_col3[3].text.strip()
    r.case_status = summary_table_col3[1].text.strip()

    # Create list of assigned defense attorney(s)
    defense_attorney_text = list(map(lambda x: x.text, docket_attorney))
    r.defense_attorney = ScraperUtils.parse_attorneys(defense_attorney_text)
    # Create list of assigned public defenders / appointed attorneys
    public_defender_text = list(map(lambda x: x.text, docket_public_defender))
    r.public_defender = ScraperUtils.parse_attorneys(public_defender_text)
    # Get Judge
    r.judge = Pii.String(summary_table_col1[0].text.strip())

    # Download docket attachments.
    # Todo(OscarVanL): This could be parallelized to speed up scraping if
    # save-attachments is set to 'all'.
    if FLAGS.save_attachments:
        for attachment_link in docket_attachments:
            attachment_text = attachment_link.find_element_by_xpath(
                './../../td[3]').text.strip()
            if FLAGS.save_attachments == 'filing':
                if not ('CITATION FILED' in attachment_text
                        or 'CASE FILED' in attachment_text):
                    # Attachment is not a filing, don't download it.
                    continue
            ScraperUtils.save_attached_pdf(
                driver, output_attachments,
                '{}-{}'.format(case_number, attachment_text),
                FLAGS.portal_base, attachment_link, 20, FLAGS.verbose)

    Charges = {}
    for charge in charges_table:
        charge_builder = ChargeBuilder()
        charge_cols = charge.find_elements_by_tag_name('td')
        count = int(charge_cols[0].text.strip())
        charge_builder.count = count

        charge_desc = charge_cols[1].text
        charge_builder.description, charge_builder.statute = (
            ScraperUtils.parse_charge_statute(charge_desc))
        charge_builder.level = charge_cols[2].text.strip()
        charge_builder.degree = charge_cols[3].text.strip()
        # plea = charge_cols[4].text.strip()  # Plea is not filled out on this portal.
        charge_builder.disposition = charge_cols[5].text.strip()
        charge_builder.disposition_date = charge_cols[6].text.strip()

        Charges[count] = charge_builder.build()

    r.charges = list(Charges.values())

    # Pleas are not in the 'plea' field, but instead in the dockets.
    for plea_element in docket_pleas:
        plea_text = plea_element.text.strip()
        plea = ScraperUtils.parse_plea_type(plea_text)
        plea_date = plea_element.find_element_by_xpath('./../td[2]').text.strip()
        plea_number = ScraperUtils.parse_plea_case_numbers(
            plea_text, list(Charges.keys()))

        # If no case number is specified in the plea, we assume it applies to
        # all charges in the trial.
        if len(plea_number) == 0:
            for charge in Charges.values():
                charge.plea = plea
                charge.plea_date = plea_date
        else:
            # Apply plea to relevant charge count(s).
            for count in plea_number:
                Charges[count].plea = plea
                Charges[count].plea_date = plea_date

    r.arresting_officer = None  # Can't be found on this portal
    r.arresting_officer_badge_number = None  # Can't be found on this portal

    profile_link = driver.find_element_by_xpath(
        "//table[@id='gridParties']/tbody/tr/*[contains(text(), 'DEFENDANT')]/../td[2]/div/a"
    ).get_attribute('href')
    # profile_link = driver.find_element_by_xpath('//*[@id="gridParties"]/tbody/tr[1]/td[2]/div[1]/a').get_attribute('href')
    load_page(profile_link, 'Party Details:', FLAGS.verbose)

    r.suffix = None
    r.dob = None  # This portal has DOB as N/A for every defendant
    r.race = driver.find_element_by_xpath(
        '//*[@id="fd-table-2"]/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[7]/td[2]'
    ).text.strip()
    r.sex = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[6]/td[2]'
    ).text.strip()

    # Navigate to party profile
    full_name = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[1]/td[2]'
    ).text.strip()
    r.middle_name = None
    r.last_name = None
    if ',' in full_name:
        r.first_name, r.middle_name, r.last_name = ScraperUtils.parse_name(full_name)
    else:
        # If there's no comma, it's a corporation name.
        r.first_name = Pii.String(full_name)
    # PartyID is a field within the portal system to uniquely identify defendants.
    r.party_id = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[8]/td[2]'
    ).text.strip()

    record = r.build()
    ScraperUtils.write_csv(FLAGS.output, record, FLAGS.verbose)