def test_parse_defense_attorneys_invalid(self):
    invalid_test = ['', '', '']
    invalid_test2 = []
    invalid_test3 = None
    assert ScraperUtils.parse_attorneys(invalid_test) is None
    assert ScraperUtils.parse_attorneys(invalid_test2) is None
    assert ScraperUtils.parse_attorneys(invalid_test3) is None
def test_parse_defense_attorneys(self):
    attorneys1 = [
        'DEFENSE ATTORNEY: DOE, JANE EMILY ASSIGNED',
        'DEFENSE ATTORNEY: DOE, JOHN MICHAEL ASSIGNED',
        'DEFENSE ATTORNEY: SELF, SELF ASSIGNED'
    ]
    public_defenders1 = [
        'COURT APPOINTED ATTORNEY: DOE, JOHN MICHAEL ASSIGNED'
    ]
    assert ScraperUtils.parse_attorneys(attorneys1) == [
        'DOE, JANE EMILY', 'DOE, JOHN MICHAEL', 'SELF, SELF'
    ]
    assert ScraperUtils.parse_attorneys(public_defenders1) == [
        'DOE, JOHN MICHAEL'
    ]
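# A minimal sketch of what ScraperUtils.parse_attorneys could look like, inferred only
# from the behaviour asserted by the two tests above (the name sits between the first
# ':' and the trailing 'ASSIGNED'; None, empty, or blank-only input yields None).
# This is an illustrative assumption, not the project's actual implementation.
def parse_attorneys(docket_lines):
    if not docket_lines:
        return None
    names = []
    for line in docket_lines:
        if ':' not in line or 'ASSIGNED' not in line:
            # Skip docket text that does not match the expected attorney format.
            continue
        name = line.split(':', 1)[1].replace('ASSIGNED', '').strip()
        if name:
            names.append(name)
    return names or None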
def scrape_record(case_number):
    """
    Scrapes a record once the case has been opened.
    :param case_number: The current case's case number.
    """
    # Wait for court summary to load
    for i in range(settings['connect-thresh']):
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'summaryAccordion')))
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError('Summary details did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    # Get relevant page content
    summary_table_col1 = driver.find_elements_by_xpath('//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[1]/dl/dd')
    summary_table_col2 = driver.find_elements_by_xpath('//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[2]/dl/dd')
    summary_table_col3 = driver.find_elements_by_xpath('//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[3]/dl/dd')

    # Wait for court dockets to load
    for i in range(settings['connect-thresh']):
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'gridDocketsView')))
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError('Dockets did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    charges_table = driver.find_elements_by_xpath('//*[@id="gridCharges"]/tbody/tr')
    docket_public_defender = driver.find_elements_by_xpath(
        "//*[contains(text(), 'COURT APPOINTED ATTORNEY') and contains(text(), 'ASSIGNED')]")
    docket_attorney = driver.find_elements_by_xpath(
        "//*[contains(text(), 'DEFENSE') and contains(text(), 'ASSIGNED')]")
    docket_pleas = driver.find_elements_by_xpath("//*[contains(text(), 'PLEA OF')]")
    docket_attachments = driver.find_elements_by_class_name('casedocketimage')

    _id = str(uuid.uuid4())
    _state = settings['state-code']
    _county = settings['county']
    CaseNum = summary_table_col2[1].text.strip()
    AgencyReportNum = summary_table_col1[4].text.strip()
    ArrestDate = None  # Can't be found on this portal
    FilingDate = summary_table_col1[2].text.strip()
    OffenseDate = None  # Can't be found on this portal
    DivisionName = summary_table_col3[3].text.strip()
    CaseStatus = summary_table_col3[1].text.strip()

    if settings['collect-pii']:
        # Create list of assigned defense attorney(s)
        defense_attorney_text = list(map(lambda x: x.text, docket_attorney))
        DefenseAttorney = ScraperUtils.parse_attorneys(defense_attorney_text)
        # Create list of assigned public defenders / appointed attorneys
        public_defender_text = list(map(lambda x: x.text, docket_public_defender))
        PublicDefender = ScraperUtils.parse_attorneys(public_defender_text)
        # Get Judge
        Judge = summary_table_col1[0].text.strip()

        # Download docket attachments.
        # Todo(OscarVanL): This could be parallelized to speed up scraping if save-attachments is set to 'all'.
        if settings['save-attachments']:
            for attachment_link in docket_attachments:
                attachment_text = attachment_link.find_element_by_xpath('./../../td[3]').text.strip()
                if settings['save-attachments'] == 'filing':
                    if not ('CITATION FILED' in attachment_text or 'CASE FILED' in attachment_text):
                        # Attachment is not a filing, don't download it.
                        continue
                ScraperUtils.save_attached_pdf(driver, output_attachments,
                                               '{}-{}'.format(case_number, attachment_text),
                                               settings['portal-base'], attachment_link, 20, settings['verbose'])
    else:
        DefenseAttorney = []
        PublicDefender = []
        Judge = None

    Charges = {}
    for charge in charges_table:
        charge_details = charge.find_elements_by_tag_name('td')
        count = int(charge_details[0].text.strip())
        long_desc = charge_details[1].text.strip()
        # Statute is contained within brackets
        if '(' in long_desc and ')' in long_desc:
            statute = long_desc[long_desc.find('(') + 1:long_desc.find(')')]
        else:
            statute = None
        description = long_desc.split('(')[0]
        level = charge_details[2].text.strip()
        degree = charge_details[3].text.strip()
        # plea = charge_details[4].text.strip()  # Plea is not filled out on this portal.
        disposition = charge_details[5].text.strip()
        disposition_date = charge_details[6].text.strip()
        offense_date = None  # Not shown on this portal
        citation_number = None  # Not shown on this portal
        Charges[count] = Charge(count, statute, description, level, degree, disposition, disposition_date,
                                offense_date, citation_number, None, None)

    # Pleas are not in the 'plea' field, but instead in the dockets.
    for plea_element in docket_pleas:
        plea_text = plea_element.text.strip()
        plea = ScraperUtils.parse_plea_type(plea_text)
        plea_date = plea_element.find_element_by_xpath('./../td[2]').text.strip()
        plea_number = ScraperUtils.parse_plea_case_numbers(plea_text, list(Charges.keys()))

        # If no case number is specified in the plea, then we assume it applies to all charges in the trial.
        if len(plea_number) == 0:
            for charge in Charges.values():
                charge.plea = plea
                charge.plea_date = plea_date
        else:
            # Apply plea to relevant charge count(s).
            for count in plea_number:
                Charges[count].plea = plea
                Charges[count].plea_date = plea_date

    ArrestingOfficer = None  # Can't be found on this portal
    ArrestingOfficerBadgeNumber = None  # Can't be found on this portal

    profile_link = driver.find_element_by_xpath(
        "//table[@id='gridParties']/tbody/tr/*[contains(text(), 'DEFENDANT')]/../td[2]/div/a").get_attribute('href')
    # profile_link = driver.find_element_by_xpath('//*[@id="gridParties"]/tbody/tr[1]/td[2]/div[1]/a').get_attribute(
    #     'href')
    load_page(profile_link, 'Party Details:', settings['verbose'])

    Suffix = None
    DOB = None  # This portal has DOB as N/A for every defendant
    Race = driver.find_element_by_xpath(
        '//*[@id="fd-table-2"]/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[7]/td[2]').text.strip()
    Sex = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[6]/td[2]').text.strip()

    FirstName = None
    MiddleName = None
    LastName = None
    PartyID = None

    # Only collect PII if configured
    if settings['collect-pii']:
        # Navigate to party profile
        full_name = driver.find_element_by_xpath(
            '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[1]/td[2]').text.strip()
        MiddleName = None
        LastName = None
        if ',' in full_name:
            name_split = full_name.split(',')[1].lstrip().split()
            FirstName = name_split[0]
            MiddleName = " ".join(name_split[1:])
            LastName = full_name.split(',')[0]
        else:
            # If there's no comma, it's a corporation name.
            FirstName = full_name
        PartyID = driver.find_element_by_xpath(
            '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[8]/td[2]').text.strip()
        # PartyID is a field within the portal system to uniquely identify defendants

    record = Record(_id, _state, _county, case_number, CaseNum, AgencyReportNum, PartyID, FirstName, MiddleName,
                    LastName, Suffix, DOB, Race, Sex, ArrestDate, FilingDate, OffenseDate, DivisionName, CaseStatus,
                    DefenseAttorney, PublicDefender, Judge, list(Charges.values()), ArrestingOfficer,
                    ArrestingOfficerBadgeNumber)

    ScraperUtils.write_csv(output_file, record, settings['verbose'])
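# A minimal sketch of how ScraperUtils.parse_plea_case_numbers might behave, inferred
# only from how it is called above: it receives the plea docket text plus the list of
# valid charge counts, and returns the counts the plea names (an empty list when none
# are named, which the caller treats as "applies to all charges"). The regex-based
# extraction below is an assumption, not the project's actual implementation.
import re


def parse_plea_case_numbers(plea_text, valid_counts):
    # Collect every integer mentioned in the plea text, then keep only those
    # that correspond to real charge counts in this case.
    mentioned = {int(n) for n in re.findall(r'\d+', plea_text)}
    return [count for count in valid_counts if count in mentioned]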
def scrape_record(case_number):
    """ Scrapes a record once the case has been opened.

    :param case_number: The current case's case number.
    """
    # Wait for court summary to load
    for i in range(FLAGS.connect_thresh):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, 'summaryAccordion')))
        except TimeoutException:
            if i == FLAGS.connect_thresh - 1:
                raise RuntimeError(
                    'Summary details did not load for case {}.'.format(
                        case_number))
            else:
                driver.refresh()

    # Get relevant page content
    summary_table_col1 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[1]/dl/dd')
    summary_table_col2 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[2]/dl/dd')
    summary_table_col3 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[3]/dl/dd')

    # Wait for court dockets to load
    for i in range(FLAGS.connect_thresh):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, 'gridDocketsView')))
        except TimeoutException:
            if i == FLAGS.connect_thresh - 1:
                raise RuntimeError(
                    'Dockets did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    charges_table = driver.find_elements_by_xpath(
        '//*[@id="gridCharges"]/tbody/tr')
    docket_public_defender = driver.find_elements_by_xpath(
        "//*[contains(text(), 'COURT APPOINTED ATTORNEY') and contains(text(), 'ASSIGNED')]"
    )
    docket_attorney = driver.find_elements_by_xpath(
        "//*[contains(text(), 'DEFENSE') and contains(text(), 'ASSIGNED')]")
    docket_pleas = driver.find_elements_by_xpath(
        "//*[contains(text(), 'PLEA OF')]")
    docket_attachments = driver.find_elements_by_class_name('casedocketimage')

    r = BenchmarkRecordBuilder()
    r.id = str(uuid.uuid4())
    r.state = FLAGS.state
    r.county = FLAGS.county
    r.portal_id = case_number
    r.case_num = Pii.String(summary_table_col2[1].text.strip())
    r.agency_report_num = summary_table_col1[4].text.strip()
    r.arrest_date = None  # Can't be found on this portal
    r.filing_date = summary_table_col1[2].text.strip()
    r.offense_date = None  # Can't be found on this portal
    r.division_name = summary_table_col3[3].text.strip()
    r.case_status = summary_table_col3[1].text.strip()

    # Create list of assigned defense attorney(s)
    defense_attorney_text = list(map(lambda x: x.text, docket_attorney))
    r.defense_attorney = ScraperUtils.parse_attorneys(defense_attorney_text)
    # Create list of assigned public defenders / appointed attorneys
    public_defender_text = list(map(lambda x: x.text, docket_public_defender))
    r.public_defender = ScraperUtils.parse_attorneys(public_defender_text)
    # Get Judge
    r.judge = Pii.String(summary_table_col1[0].text.strip())

    # Download docket attachments.
    # Todo(OscarVanL): This could be parallelized to speed up scraping if save-attachments is set to 'all'.
    if FLAGS.save_attachments:
        for attachment_link in docket_attachments:
            attachment_text = attachment_link.find_element_by_xpath(
                './../../td[3]').text.strip()
            if FLAGS.save_attachments == 'filing':
                if not ('CITATION FILED' in attachment_text
                        or 'CASE FILED' in attachment_text):
                    # Attachment is not a filing, don't download it.
                    continue
            ScraperUtils.save_attached_pdf(
                driver, output_attachments,
                '{}-{}'.format(case_number, attachment_text),
                FLAGS.portal_base, attachment_link, 20, FLAGS.verbose)

    Charges = {}
    for charge in charges_table:
        charge_builder = ChargeBuilder()
        charge_cols = charge.find_elements_by_tag_name('td')
        count = int(charge_cols[0].text.strip())
        charge_builder.count = count

        charge_desc = charge_cols[1].text
        charge_builder.description, charge_builder.statute = (
            ScraperUtils.parse_charge_statute(charge_desc))
        charge_builder.level = charge_cols[2].text.strip()
        charge_builder.degree = charge_cols[3].text.strip()
        # plea = charge_cols[4].text.strip()  # Plea is not filled out on this portal.
        charge_builder.disposition = charge_cols[5].text.strip()
        charge_builder.disposition_date = charge_cols[6].text.strip()

        Charges[count] = charge_builder.build()

    r.charges = list(Charges.values())

    # Pleas are not in the 'plea' field, but instead in the dockets.
    for plea_element in docket_pleas:
        plea_text = plea_element.text.strip()
        plea = ScraperUtils.parse_plea_type(plea_text)
        plea_date = plea_element.find_element_by_xpath(
            './../td[2]').text.strip()
        plea_number = ScraperUtils.parse_plea_case_numbers(
            plea_text, list(Charges.keys()))

        # If no case number is specified in the plea, then we assume it applies to all charges in the trial.
        if len(plea_number) == 0:
            for charge in Charges.values():
                charge.plea = plea
                charge.plea_date = plea_date
        else:
            # Apply plea to relevant charge count(s).
            for count in plea_number:
                Charges[count].plea = plea
                Charges[count].plea_date = plea_date

    r.arresting_officer = None  # Can't be found on this portal
    r.arresting_officer_badge_number = None  # Can't be found on this portal

    profile_link = driver.find_element_by_xpath(
        "//table[@id='gridParties']/tbody/tr/*[contains(text(), 'DEFENDANT')]/../td[2]/div/a"
    ).get_attribute('href')
    # profile_link = driver.find_element_by_xpath('//*[@id="gridParties"]/tbody/tr[1]/td[2]/div[1]/a').get_attribute(
    #     'href')
    load_page(profile_link, 'Party Details:', FLAGS.verbose)

    r.suffix = None
    r.dob = None  # This portal has DOB as N/A for every defendant
    r.race = driver.find_element_by_xpath(
        '//*[@id="fd-table-2"]/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[7]/td[2]'
    ).text.strip()
    r.sex = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[6]/td[2]'
    ).text.strip()

    # Navigate to party profile
    full_name = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[1]/td[2]'
    ).text.strip()
    r.middle_name = None
    r.last_name = None
    if ',' in full_name:
        r.first_name, r.middle_name, r.last_name = ScraperUtils.parse_name(
            full_name)
    else:
        # If there's no comma, it's a corporation name.
        r.first_name = Pii.String(full_name)

    r.party_id = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[8]/td[2]'
    ).text.strip()  # PartyID is a field within the portal system to uniquely identify defendants

    record = r.build()
    ScraperUtils.write_csv(FLAGS.output, record, FLAGS.verbose)
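# Minimal sketches of the ScraperUtils.parse_name and ScraperUtils.parse_charge_statute
# helpers the refactored function relies on, inferred from the inline logic in the
# earlier settings-based version above (name split on the first comma; statute taken
# from the parentheses in the charge description). These are illustrative assumptions,
# not the project's actual implementations.
def parse_name(full_name):
    """Split 'LAST, FIRST MIDDLE ...' into (first, middle, last)."""
    last, _, rest = full_name.partition(',')
    parts = rest.strip().split()
    first = parts[0] if parts else None
    middle = " ".join(parts[1:])
    return first, middle, last.strip()


def parse_charge_statute(charge_desc):
    """Split a description like 'PETIT THEFT (812.014)' into (description, statute)."""
    charge_desc = charge_desc.strip()
    # Statute is contained within brackets; absent brackets means no statute listed.
    if '(' in charge_desc and ')' in charge_desc:
        statute = charge_desc[charge_desc.find('(') + 1:charge_desc.find(')')]
    else:
        statute = None
    description = charge_desc.split('(')[0].strip()
    return description, statute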