def test_parse_charge_statute_incomplete(self):
    """Check parse_charge_statute on inputs missing a description, a statute, or both.

    Whichever half of the (description, statute) pair is absent from the
    input is expected to come back as None.
    """
    statute_only = '(32234 2a)'
    description_only = 'DRIVING WHILE LICENSE SUSPENDED OR REVOKED'

    # (raw charge text, expected (description, statute) pair)
    expectations = (
        (statute_only, (None, '32234 2a')),
        (description_only, (description_only, None)),
        (' ', (None, None)),
        (None, (None, None)),
    )
    for raw_charge, expected in expectations:
        assert ScraperUtils.parse_charge_statute(raw_charge) == expected
def test_parse_charge_statute(self):
    """Check parse_charge_statute on charges carrying both a description and a statute.

    The cases cover surrounding whitespace, a parenthesised qualifier that
    must stay in the description, and a statute that itself contains
    parentheses.
    """
    # (raw charge text, expected (description, statute) pair)
    expectations = (
        (' FLEEING OR ATTEMPTING TO ELUDE (HIGH SPEED RECKLESS) (3161935 3) ',
         ('FLEEING OR ATTEMPTING TO ELUDE (HIGH SPEED RECKLESS)',
          '3161935 3')),
        ('DRIVING WHILE LICENSE SUSPENDED OR REVOKED (32234 2a)',
         ('DRIVING WHILE LICENSE SUSPENDED OR REVOKED', '32234 2a')),
        (' FELON IN POSSESSION OF AMMUNITION (ACTUAL POSSESSION) (79023) ',
         ('FELON IN POSSESSION OF AMMUNITION (ACTUAL POSSESSION)', '79023')),
        ('FAIL TO DISPLAY REGISTRATION - POSSESSION REQUIRED (320.0605(1)) ',
         ('FAIL TO DISPLAY REGISTRATION - POSSESSION REQUIRED',
          '320.0605(1)')),
    )
    for raw_charge, expected in expectations:
        assert ScraperUtils.parse_charge_statute(raw_charge) == expected
def _wait_for_element_id(element_id, attempts, failure_message):
    """
    Waits for an element to be present on the current page, refreshing the
    page and retrying when the wait times out.
    :param element_id: The HTML id of the element to wait for.
    :param attempts: Maximum number of waits (5 seconds each) before giving up.
    :param failure_message: Message of the RuntimeError raised on final failure.
    :raises RuntimeError: If the element is still missing on the last attempt.
    """
    for attempt in range(attempts):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, element_id)))
        except TimeoutException:
            if attempt == attempts - 1:
                raise RuntimeError(failure_message)
            driver.refresh()
        else:
            # The element is present: stop retrying instead of polling again.
            return


def scrape_record(case_number):
    """
    Scrapes a record once the case has been opened.

    Reads the summary, charges and docket entries from the case page, then
    loads the defendant's party page for the remaining fields, and finally
    writes the assembled record with ScraperUtils.write_csv.
    Uses driver, settings, output_attachments and output_file from the
    enclosing scope.
    :param case_number: The current case's case number.
    :raises RuntimeError: If the summary or the dockets do not load within
        settings['connect-thresh'] attempts.
    """
    # Wait for court summary to load
    _wait_for_element_id(
        'summaryAccordion', settings['connect-thresh'],
        'Summary details did not load for case {}.'.format(case_number))

    # Get relevant page content
    summary_table_col1 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[1]/dl/dd')
    summary_table_col2 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[2]/dl/dd')
    summary_table_col3 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[3]/dl/dd')

    # Wait for court dockets to load
    _wait_for_element_id(
        'gridDocketsView', settings['connect-thresh'],
        'Dockets did not load for case {}.'.format(case_number))

    charges_table = driver.find_elements_by_xpath(
        '//*[@id="gridCharges"]/tbody/tr')
    docket_public_defender = driver.find_elements_by_xpath(
        "//*[contains(text(), 'COURT APPOINTED ATTORNEY') and contains(text(), 'ASSIGNED')]"
    )
    docket_attorney = driver.find_elements_by_xpath(
        "//*[contains(text(), 'DEFENSE') and contains(text(), 'ASSIGNED')]")
    docket_pleas = driver.find_elements_by_xpath(
        "//*[contains(text(), 'PLEA OF')]")
    docket_attachments = driver.find_elements_by_class_name('casedocketimage')

    _id = str(uuid.uuid4())
    _state = settings['state-code']
    _county = settings['county']
    CaseNum = summary_table_col2[1].text.strip()
    AgencyReportNum = summary_table_col1[4].text.strip()
    ArrestDate = None  # Can't be found on this portal
    FilingDate = summary_table_col1[2].text.strip()
    OffenseDate = None  # Can't be found on this portal
    DivisionName = summary_table_col3[3].text.strip()
    CaseStatus = summary_table_col3[1].text.strip()

    if settings['collect-pii']:
        # Create list of assigned defense attorney(s)
        defense_attorney_text = [entry.text for entry in docket_attorney]
        DefenseAttorney = ScraperUtils.parse_attorneys(defense_attorney_text)
        # Create list of assigned public defenders / appointed attorneys
        public_defender_text = [
            entry.text for entry in docket_public_defender]
        PublicDefender = ScraperUtils.parse_attorneys(public_defender_text)
        # Get Judge
        Judge = summary_table_col1[0].text.strip()

        # Download docket attachments.
        # Todo(OscarVanL): This could be parallelized to speed up scraping if save-attachments is set to 'all'.
        if settings['save-attachments']:
            for attachment_link in docket_attachments:
                attachment_text = attachment_link.find_element_by_xpath(
                    './../../td[3]').text.strip()
                is_filing = ('CITATION FILED' in attachment_text
                             or 'CASE FILED' in attachment_text)
                if settings['save-attachments'] == 'filing' and not is_filing:
                    # Attachment is not a filing, don't download it.
                    continue
                ScraperUtils.save_attached_pdf(
                    driver, output_attachments,
                    '{}-{}'.format(case_number, attachment_text),
                    settings['portal-base'], attachment_link, 20,
                    settings['verbose'])
    else:
        DefenseAttorney = []
        PublicDefender = []
        Judge = None

    # Charges keyed by their count number so that pleas can be matched below.
    Charges = {}
    for charge in charges_table:
        charge_cols = charge.find_elements_by_tag_name('td')
        count = int(charge_cols[0].text.strip())
        charge_desc = charge_cols[1].text
        description, statute = ScraperUtils.parse_charge_statute(charge_desc)
        level = charge_cols[2].text.strip()
        degree = charge_cols[3].text.strip()
        # Column 4 (plea) is not filled out on this portal.
        disposition = charge_cols[5].text.strip()
        disposition_date = charge_cols[6].text.strip()
        offense_date = None  # Not shown on this portal
        citation_number = None  # Not shown on this portal
        # NOTE(review): the two trailing None arguments are presumably the
        # plea and plea date, which are filled in from the dockets below.
        Charges[count] = Charge(count, statute, description, level, degree,
                                disposition, disposition_date, offense_date,
                                citation_number, None, None)

    # Pleas are not in the 'plea' field, but instead in the dockets.
    for plea_element in docket_pleas:
        plea_text = plea_element.text.strip()
        plea = ScraperUtils.parse_plea_type(plea_text)
        plea_date = plea_element.find_element_by_xpath(
            './../td[2]').text.strip()
        plea_number = ScraperUtils.parse_plea_case_numbers(
            plea_text, list(Charges))

        if not plea_number:
            # If no case number is specified in the plea, then we assume it
            # applies to all charges in the trial.
            for charge in Charges.values():
                charge.plea = plea
                charge.plea_date = plea_date
        else:
            # Apply plea to relevant charge count(s).
            for count in plea_number:
                Charges[count].plea = plea
                Charges[count].plea_date = plea_date

    ArrestingOfficer = None  # Can't be found on this portal
    ArrestingOfficerBadgeNumber = None  # Can't be found on this portal

    # Navigate to party profile
    profile_link = driver.find_element_by_xpath(
        "//table[@id='gridParties']/tbody/tr/*[contains(text(), 'DEFENDANT')]/../td[2]/div/a"
    ).get_attribute('href')
    # Alternative positional locator, kept for reference:
    # profile_link = driver.find_element_by_xpath('//*[@id="gridParties"]/tbody/tr[1]/td[2]/div[1]/a').get_attribute(
    #     'href')
    load_page(profile_link, 'Party Details:', settings['verbose'])

    Suffix = None
    DOB = None  # This portal has DOB as N/A for every defendant
    Race = driver.find_element_by_xpath(
        '//*[@id="fd-table-2"]/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[7]/td[2]'
    ).text.strip()
    Sex = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[6]/td[2]'
    ).text.strip()

    FirstName = None
    MiddleName = None
    LastName = None
    PartyID = None
    # Only collect PII if configured
    if settings['collect-pii']:
        full_name = driver.find_element_by_xpath(
            '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[1]/td[2]'
        ).text.strip()
        if ',' in full_name:
            FirstName, MiddleName, LastName = ScraperUtils.parse_name(
                full_name)
        else:
            # If there's no comma, it's a corporation name.
            FirstName = full_name
        # PartyID is a field within the portal system to uniquely identify defendants
        PartyID = driver.find_element_by_xpath(
            '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[8]/td[2]'
        ).text.strip()

    record = Record(_id, _state, _county, case_number, CaseNum,
                    AgencyReportNum, PartyID, FirstName, MiddleName, LastName,
                    Suffix, DOB, Race, Sex, ArrestDate, FilingDate,
                    OffenseDate, DivisionName, CaseStatus, DefenseAttorney,
                    PublicDefender, Judge, list(Charges.values()),
                    ArrestingOfficer, ArrestingOfficerBadgeNumber)
    ScraperUtils.write_csv(output_file, record, settings['verbose'])
def _wait_for_element_id(element_id, attempts, failure_message):
    """
    Waits for an element to be present on the current page, refreshing the
    page and retrying when the wait times out.
    :param element_id: The HTML id of the element to wait for.
    :param attempts: Maximum number of waits (5 seconds each) before giving up.
    :param failure_message: Message of the RuntimeError raised on final failure.
    :raises RuntimeError: If the element is still missing on the last attempt.
    """
    for attempt in range(attempts):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, element_id)))
        except TimeoutException:
            if attempt == attempts - 1:
                raise RuntimeError(failure_message)
            driver.refresh()
        else:
            # The element is present: stop retrying instead of polling again.
            return


def scrape_record(case_number):
    """
    Scrapes a record once the case has been opened.

    Reads the summary, charges and docket entries from the case page, then
    loads the defendant's party page for the remaining fields, and finally
    builds the record and writes it with ScraperUtils.write_csv.
    Uses driver, FLAGS and output_attachments from the enclosing scope.
    :param case_number: The current case's case number.
    :raises RuntimeError: If the summary or the dockets do not load within
        FLAGS.connect_thresh attempts.
    """
    # Wait for court summary to load
    _wait_for_element_id(
        'summaryAccordion', FLAGS.connect_thresh,
        'Summary details did not load for case {}.'.format(case_number))

    # Get relevant page content
    summary_table_col1 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[1]/dl/dd')
    summary_table_col2 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[2]/dl/dd')
    summary_table_col3 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[3]/dl/dd')

    # Wait for court dockets to load
    _wait_for_element_id(
        'gridDocketsView', FLAGS.connect_thresh,
        'Dockets did not load for case {}.'.format(case_number))

    charges_table = driver.find_elements_by_xpath(
        '//*[@id="gridCharges"]/tbody/tr')
    docket_public_defender = driver.find_elements_by_xpath(
        "//*[contains(text(), 'COURT APPOINTED ATTORNEY') and contains(text(), 'ASSIGNED')]"
    )
    docket_attorney = driver.find_elements_by_xpath(
        "//*[contains(text(), 'DEFENSE') and contains(text(), 'ASSIGNED')]")
    docket_pleas = driver.find_elements_by_xpath(
        "//*[contains(text(), 'PLEA OF')]")
    docket_attachments = driver.find_elements_by_class_name('casedocketimage')

    r = BenchmarkRecordBuilder()
    r.id = str(uuid.uuid4())
    r.state = FLAGS.state
    r.county = FLAGS.county
    r.portal_id = case_number
    r.case_num = Pii.String(summary_table_col2[1].text.strip())
    r.agency_report_num = summary_table_col1[4].text.strip()
    r.arrest_date = None  # Can't be found on this portal
    r.filing_date = summary_table_col1[2].text.strip()
    r.offense_date = None  # Can't be found on this portal
    r.division_name = summary_table_col3[3].text.strip()
    r.case_status = summary_table_col3[1].text.strip()

    # Create list of assigned defense attorney(s)
    defense_attorney_text = [entry.text for entry in docket_attorney]
    r.defense_attorney = ScraperUtils.parse_attorneys(defense_attorney_text)
    # Create list of assigned public defenders / appointed attorneys
    public_defender_text = [entry.text for entry in docket_public_defender]
    r.public_defender = ScraperUtils.parse_attorneys(public_defender_text)
    # Get Judge
    r.judge = Pii.String(summary_table_col1[0].text.strip())

    # Download docket attachments.
    # Todo(OscarVanL): This could be parallelized to speed up scraping if save-attachments is set to 'all'.
    if FLAGS.save_attachments:
        for attachment_link in docket_attachments:
            attachment_text = attachment_link.find_element_by_xpath(
                './../../td[3]').text.strip()
            is_filing = ('CITATION FILED' in attachment_text
                         or 'CASE FILED' in attachment_text)
            if FLAGS.save_attachments == 'filing' and not is_filing:
                # Attachment is not a filing, don't download it.
                continue
            ScraperUtils.save_attached_pdf(
                driver, output_attachments,
                '{}-{}'.format(case_number, attachment_text),
                FLAGS.portal_base, attachment_link, 20, FLAGS.verbose)

    # Charges keyed by their count number so that pleas can be matched below.
    Charges = {}
    for charge in charges_table:
        charge_builder = ChargeBuilder()
        charge_cols = charge.find_elements_by_tag_name('td')
        count = int(charge_cols[0].text.strip())
        charge_builder.count = count
        charge_desc = charge_cols[1].text
        charge_builder.description, charge_builder.statute = (
            ScraperUtils.parse_charge_statute(charge_desc))
        charge_builder.level = charge_cols[2].text.strip()
        charge_builder.degree = charge_cols[3].text.strip()
        # Column 4 (plea) is not filled out on this portal.
        charge_builder.disposition = charge_cols[5].text.strip()
        charge_builder.disposition_date = charge_cols[6].text.strip()
        Charges[count] = charge_builder.build()
    # NOTE(review): the charges are handed to the record builder before the
    # pleas are applied; the plea loop below mutates the same built charge
    # objects. Confirm the builder keeps references rather than deep copies.
    r.charges = list(Charges.values())

    # Pleas are not in the 'plea' field, but instead in the dockets.
    for plea_element in docket_pleas:
        plea_text = plea_element.text.strip()
        plea = ScraperUtils.parse_plea_type(plea_text)
        plea_date = plea_element.find_element_by_xpath(
            './../td[2]').text.strip()
        plea_number = ScraperUtils.parse_plea_case_numbers(
            plea_text, list(Charges))

        if not plea_number:
            # If no case number is specified in the plea, then we assume it
            # applies to all charges in the trial.
            for charge in Charges.values():
                charge.plea = plea
                charge.plea_date = plea_date
        else:
            # Apply plea to relevant charge count(s).
            for count in plea_number:
                Charges[count].plea = plea
                Charges[count].plea_date = plea_date

    r.arresting_officer = None  # Can't be found on this portal
    r.arresting_officer_badge_number = None  # Can't be found on this portal

    # Navigate to party profile
    profile_link = driver.find_element_by_xpath(
        "//table[@id='gridParties']/tbody/tr/*[contains(text(), 'DEFENDANT')]/../td[2]/div/a"
    ).get_attribute('href')
    # Alternative positional locator, kept for reference:
    # profile_link = driver.find_element_by_xpath('//*[@id="gridParties"]/tbody/tr[1]/td[2]/div[1]/a').get_attribute(
    #     'href')
    load_page(profile_link, 'Party Details:', FLAGS.verbose)

    r.suffix = None
    r.dob = None  # This portal has DOB as N/A for every defendant
    r.race = driver.find_element_by_xpath(
        '//*[@id="fd-table-2"]/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[7]/td[2]'
    ).text.strip()
    r.sex = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[6]/td[2]'
    ).text.strip()

    full_name = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[1]/td[2]'
    ).text.strip()
    # Defaults for the corporation case below, where only first_name is set.
    r.middle_name = None
    r.last_name = None
    if ',' in full_name:
        r.first_name, r.middle_name, r.last_name = ScraperUtils.parse_name(
            full_name)
    else:
        # If there's no comma, it's a corporation name.
        r.first_name = Pii.String(full_name)
    # PartyID is a field within the portal system to uniquely identify defendants
    r.party_id = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[8]/td[2]'
    ).text.strip()

    record = r.build()
    ScraperUtils.write_csv(FLAGS.output, record, FLAGS.verbose)