def test_parse_plea_type_nolo_contendere(self):
     """Dockets phrased several different ways all normalize to 'Nolo Contendere'."""
     samples = (
         'PLEA OF NOLO CONTENDERE',
         "DEFENDANT ENTERED PLEA OF : NOLO-CONTENDERE SEQ 2",
         "DEFENDANT ENTERED PLEA OF NOLO CONTENDERE SEQ: 1,2,3,4,5",
     )
     for docket_text in samples:
         assert ScraperUtils.parse_plea_type(docket_text) == 'Nolo Contendere'
 def test_parse_plea_type_not_guilty(self):
     """Dockets containing 'NOT GUILTY' map to 'Not Guilty' regardless of surrounding text."""
     samples = (
         'PLEA OF NOT GUILTY',
         "PLEA OF NOT GUILTY/DENIAL, WAIVER OF ARRAIGNMENT, DEMAND FOR NOTICE OF EXPERT TESTIMONY, DEMAND FOR DISCOVERY, DEMAND FOR STATEMENT OF PARTICULARS, DEMAND FOR JURY TRIAL, DESIGNATION OF E-MAIL ADDRESSES PURSUANT TO RULE 2.516 1/28/2020",
         'EP - NOTICE OF APPEARANCE, AND ENTRY OF CONDITIONAL PLEA OF NOT GUILTY AND DEMAND FOR JURY TRIAL',
     )
     for docket_text in samples:
         assert ScraperUtils.parse_plea_type(docket_text) == 'Not Guilty'
# Example n. 3
# 0
 def test_parse_defense_attorneys_invalid(self):
     """parse_attorneys() returns None for blank, empty, or missing input."""
     for bad_input in (['', '', ''], [], None):
         assert ScraperUtils.parse_attorneys(bad_input) is None
# Example n. 4
# 0
 def test_parse_charge_statute_incomplete(self):
     """Charges missing a description or a statute yield None for the missing half."""
     cases = [
         ('(32234 2a)', (None, '32234 2a')),
         ('DRIVING WHILE LICENSE SUSPENDED OR REVOKED',
          ('DRIVING WHILE LICENSE SUSPENDED OR REVOKED', None)),
         (' ', (None, None)),
         (None, (None, None)),
     ]
     for raw_charge, expected in cases:
         assert ScraperUtils.parse_charge_statute(raw_charge) == expected
# Example n. 5
# 0
 def test_parse_charge_statute(self):
     """Description and statute split correctly despite stray whitespace and nested parens."""
     cases = [
         ('\tFLEEING OR ATTEMPTING TO ELUDE (HIGH SPEED RECKLESS) (3161935 3)  ',
          ('FLEEING OR ATTEMPTING TO ELUDE (HIGH SPEED RECKLESS)', '3161935 3')),
         ('DRIVING WHILE LICENSE SUSPENDED OR REVOKED (32234 2a)',
          ('DRIVING WHILE LICENSE SUSPENDED OR REVOKED', '32234 2a')),
         ('\tFELON IN POSSESSION OF AMMUNITION (ACTUAL POSSESSION) (79023)  ',
          ('FELON IN POSSESSION OF AMMUNITION (ACTUAL POSSESSION)', '79023')),
         ('FAIL TO DISPLAY REGISTRATION - POSSESSION REQUIRED (320.0605(1))  ',
          ('FAIL TO DISPLAY REGISTRATION - POSSESSION REQUIRED', '320.0605(1)')),
     ]
     for raw_charge, expected in cases:
         assert ScraperUtils.parse_charge_statute(raw_charge) == expected
 def test_parse_defense_attorneys(self):
     """Names are extracted from both defense-attorney and court-appointed docket lines."""
     defense_lines = [
         'DEFENSE ATTORNEY: DOE, JANE EMILY ASSIGNED',
         'DEFENSE ATTORNEY: DOE, JOHN MICHAEL ASSIGNED',
         'DEFENSE ATTORNEY: SELF, SELF ASSIGNED'
     ]
     appointed_lines = [
         'COURT APPOINTED ATTORNEY: DOE, JOHN MICHAEL ASSIGNED'
     ]
     expected_defense = ['DOE, JANE EMILY', 'DOE, JOHN MICHAEL', 'SELF, SELF']
     assert ScraperUtils.parse_attorneys(defense_lines) == expected_defense
     assert ScraperUtils.parse_attorneys(appointed_lines) == ['DOE, JOHN MICHAEL']
 def test_parse_out_path_valid(self):
     """parse_out_path() must leave valid-length filenames and paths untouched."""
     # Use a raw string: the original 'C:\\Example\Path' relied on the invalid
     # escape sequence '\P' surviving literally, which is a DeprecationWarning
     # and, since Python 3.12, a SyntaxWarning. The runtime value is identical.
     base_path = r'C:\Example\Path'
     normal_filename = 'document'
     parsed_path = ScraperUtils.parse_out_path(base_path, normal_filename,
                                               'pdf')
     assert parsed_path == os.path.join(
         base_path, '{}.{}'.format(normal_filename, 'pdf'))
def begin_scrape():
    """
    Starts the scraping process. Continues from the last scraped record if the scraper was stopped before.

    Relies on module globals: ``settings`` (scrape configuration dict),
    ``output_file`` (progress/output CSV path) and ``driver`` (Selenium
    WebDriver, used by the helpers called from here).
    :return: None
    """
    global driver

    # Find the progress of any past scraping runs to continue from then
    try:
        # The case number is the 4th comma-separated column of the last CSV row.
        last_case_number = ScraperUtils.get_last_csv_row(output_file).split(',')[3]
        print("Continuing from last scrape (Case number: {})".format(last_case_number))
        last_year = 2000 + int(str(last_case_number)[:2])  # I know there's faster ways of doing this. It only runs once ;)
        # Some case numbers carry a 4-character non-numeric suffix; strip it
        # before extracting the sequential portion below.
        if not last_case_number.isnumeric():
            last_case_number = last_case_number[:-4]
        # Last 6 digits are the per-year sequential case counter.
        last_case = int(str(last_case_number)[-6:])
        settings['end-year'] = last_year
        continuing = True
    except FileNotFoundError:
        # No existing scraping CSV
        continuing = False
        pass

    # Scrape from the most recent year to the oldest.
    # NOTE(review): range() stops *before* settings['start-year'], so that year
    # itself is never scraped — confirm this exclusivity is intended.
    for year in range(settings['end-year'], settings['start-year'], -1):
        if continuing:
            N = last_case + 1
        else:
            N = 1

        print("Scraping year {} from case {}".format(year, N))
        YY = year % 100

        record_missing_count = 0
        # Increment case numbers until the threshold missing cases is met, then advance to the next year.
        while record_missing_count < settings['missing-thresh']:
            # Generate the case number to scrape
            case_number = f'{YY:02}' + f'{N:06}'

            search_result = search_portal(case_number)
            if search_result:
                record_missing_count = 0
                # if multiple associated cases are found,
                # scrape all of them
                if len(search_result) > 1:
                    for case in search_result:
                        search_portal(case)
                        scrape_record(case)
                # only a single case, no multiple associated cases found
                else:
                    scrape_record(case_number)
            else:
                record_missing_count += 1

            N += 1

        # Only resume mid-year once; every subsequent year starts from case 1.
        continuing = False

        print("Scraping for year {} is complete".format(year))
 def test_parse_out_path_shortening(self):
     """parse_out_path() must shorten over-long filenames into a creatable path."""
     # 260 characters long before the extension. This is an invalid filename in Windows.
     filename_too_long = '0123456789' * 26
     parsed_path = ScraperUtils.parse_out_path(os.getcwd(),
                                               filename_too_long, 'txt')
     try:
         # Use a context manager so the handle is closed before os.remove();
         # the original leaked the handle, which makes the remove fail on
         # Windows and leaves a stray file behind.
         with open(parsed_path, 'w'):
             pass
         os.remove(parsed_path)
     except OSError:
         pytest.fail('parse_out_path() generates an invalid file path.')
 def test_parse_plea_case_numbers_no_charge_numbers(self):
     """With no known charge counts there is nothing for the plea to match."""
     docket_text = "DEFENDANT ENTERED PLEA OF NOLO CONTENDERE SEQ: 1,2,3,4,5"
     assert ScraperUtils.parse_plea_case_numbers(docket_text, []) == []
 def test_parse_out_path_illegal_characters(self):
     """Characters that are illegal in filenames get stripped from the result."""
     filename_invalid_chars = 't<>:e"/s\\t|?n*ame'
     expected = os.path.join('', 'testname.pdf')
     assert ScraperUtils.parse_out_path('', filename_invalid_chars,
                                        'pdf') == expected
 def test_parse_plea_case_numbers_blank(self):
     """An empty docket string matches none of the known charge counts."""
     known_charges = [1, 2, 3]
     assert ScraperUtils.parse_plea_case_numbers("", known_charges) == []
# Example n. 13
# 0
 def test_filename_blank(self):
     """A None filename still yields a path ending in just the extension."""
     base_dir = r'C:\\Example\\Path'
     parsed_path = ScraperUtils.parse_out_path(base_dir, None, 'pdf')
     assert parsed_path == os.path.join(base_dir, '.pdf')
# Example n. 14
# 0
 def test_parse_name(self):
     """'LAST, FIRST [MIDDLE]' splits into (first, middle, last); middle may be None."""
     assert ScraperUtils.parse_name("DOE, JANE EMILY") == ('JANE', 'EMILY', 'DOE')
     assert ScraperUtils.parse_name("DOE, JOHN") == ('JOHN', None, 'DOE')
 def test_parse_plea_type_guilty(self):
     """Both plain and docket-formatted guilty pleas normalize to 'Guilty'."""
     for docket_text in ('PLEA OF GUILTY',
                         'DEFENDANT ENTERED PLEA OF : GUILTY SEQ 2'):
         assert ScraperUtils.parse_plea_type(docket_text) == 'Guilty'
# Example n. 16
# 0
 def test_parse_name_error(self):
     """None or empty input gives an all-None name tuple instead of raising."""
     for bad_name in (None, ''):
         assert ScraperUtils.parse_name(bad_name) == (None, None, None)
# Example n. 17
# 0
def search_portal(case_number):
    """
    Performs a search of the portal from its home page, including selecting the case number input, solving the captcha
    and pressing Search. Also handles the captcha being solved incorrectly.
    :param case_number: Case to search
    :return: A set of case number(s) found; an empty set when the search matched no cases.
    :raises RuntimeError: if the results page never loads after 'connect-thresh' attempts.
    """
    # Load portal search page
    load_page(f"{settings['portal-base']}/Home.aspx/Search", 'Search',
              settings['verbose'])
    # Give some time for the captcha to load, as it does not load instantly.
    time.sleep(0.8)

    # Select Case Number textbox and enter case number
    select_case_input()
    case_input = driver.find_element_by_id('caseNumber')
    case_input.click()
    case_input.send_keys(case_number)

    # Initialize up-front: previously this was only assigned in two of the
    # three paths below, so the 'Invalid Captcha' check later could raise a
    # NameError when a captcha was shown but automated solving was disabled.
    solved_captcha = None

    # Solve captcha if it is required
    try:
        # Get Captcha. This is kinda nasty, but if there's no Captcha, then
        # this will throw (which is a good thing in this case) and we can
        # move on with processing.
        captcha_image_elem = driver.find_element_by_xpath(
            '//*/img[@alt="Captcha"]')
        captcha_buffer = captcha_image_elem.screenshot_as_png
        if settings['solve-captchas']:
            solved_captcha = captcha_solver.solve_captcha(captcha_buffer)
            captcha_textbox = driver.find_element_by_xpath(
                '//*/input[@name="captcha"]')
            captcha_textbox.click()
            captcha_textbox.send_keys(solved_captcha.answer)

            # Do search
            search_button = driver.find_element_by_id('searchButton')
            search_button.click()
        else:
            print(f"Captcha encountered trying to view case ID {case_number}.")
            print(
                "Please solve the captcha and click the search button to proceed."
            )
            # Poll (up to 6 hours per wait) until the human operator has
            # solved the captcha and the results page title appears.
            while True:
                try:
                    WebDriverWait(
                        driver, 6 * 60 *
                        60).until(lambda x: case_number in driver.title)
                    print("continuing...")
                    break
                except TimeoutException:
                    print("still waiting for user to solve the captcha...")

    except NoSuchElementException:
        # No captcha on the page, continue.
        # Do search
        search_button = driver.find_element_by_id('searchButton')
        search_button.click()

    # If the title stays as 'Search': Captcha solving failed
    # If the title contains the case number or 'Search Results': Captcha solving succeeded
    # If a timeout occurs, retry 'connect-thresh' times.
    for i in range(settings['connect-thresh']):
        try:
            # Wait for page to load
            WebDriverWait(
                driver,
                5).until(lambda x: 'Search' in driver.title or case_number in
                         driver.title or 'Search Results:' in driver.title)
            # Page loaded
            if driver.title == 'Search':
                # Clicking search did not change the page. This could be because of a failed captcha attempt.
                try:
                    # Check if 'Invalid Captcha' dialog is showing
                    driver.find_element_by_xpath(
                        '//div[@class="alert alert-error"]')
                    print("Captcha was solved incorrectly")
                    if settings['solve-captchas'] and solved_captcha:
                        solved_captcha.notify_incorrect()
                except NoSuchElementException:
                    pass
                # Clear cookies so a new captcha is presented upon refresh
                driver.delete_all_cookies()
                # Try solving the captcha again. Return the retry's result:
                # the original discarded it, so even a successful retry left
                # this call returning None and the caller skipped the case.
                return search_portal(case_number)
            elif 'Search Results: CaseNumber:' in driver.title:
                # Captcha solved correctly
                if settings['solve-captchas'] and solved_captcha:
                    solved_captcha.notify_correct()
                case_count = ScraperUtils.get_search_case_count(
                    driver, settings['county'])
                # Case number search found multiple cases.
                if case_count > 1:
                    return ScraperUtils.get_associated_cases(driver)
                # Case number search found no cases
                else:
                    return set()
            elif case_number in driver.title:
                # Captcha solved correctly
                if settings['solve-captchas'] and solved_captcha:
                    solved_captcha.notify_correct()
                # Case number search did find a single court case.
                return {case_number}
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError(
                    'Case page could not be loaded after {} attempts, or unexpected page title: {}'
                    .format(settings['connect-thresh'], driver.title))
            else:
                # Retry, propagating the result (previously discarded).
                return search_portal(case_number)
def search_portal(case_number):
    """
    Performs a search of the portal from its home page, including selecting the case number input, solving the captcha
    and pressing Search. Also handles the captcha being solved incorrectly.
    :param case_number: Case to search
    :return: A set of case number(s) found; an empty set when the search matched no cases.
    :raises Exception: if automated captcha solving is disabled in settings.
    :raises RuntimeError: if the results page never loads after 'connect-thresh' attempts.
    """
    # Load portal search page
    load_page(f"{settings['portal-base']}/Home.aspx/Search", 'Search', settings['verbose'])
    # Give some time for the captcha to load, as it does not load instantly.
    time.sleep(0.8)

    # Select Case Number textbox and enter case number
    select_case_input()
    case_input = driver.find_element_by_id('caseNumber')
    case_input.click()
    case_input.send_keys(case_number)

    if settings['solve-captchas']:
        # Solve captcha if it is required
        try:
            # Get Captcha
            captcha_image_elem = driver.find_element_by_xpath(
                '//*/img[@alt="Captcha"]')
            captcha_buffer = captcha_image_elem.screenshot_as_png
            captcha_answer = captcha_solver.solve_captcha(captcha_buffer)
            captcha_textbox = driver.find_element_by_xpath(
                '//*/input[@name="captcha"]')
            captcha_textbox.click()
            captcha_textbox.send_keys(captcha_answer)
        except NoSuchElementException:
            # No captcha on the page, continue.
            pass

        # Do search
        search_button = driver.find_element_by_id('searchButton')
        search_button.click()
    else:
        raise Exception("Automated captcha solving is disabled by default. Please seek advice before using this feature.")

    # If the title stays as 'Search': Captcha solving failed
    # If the title contains the case number or 'Search Results': Captcha solving succeeded
    # If a timeout occurs, retry 'connect-thresh' times.
    for i in range(settings['connect-thresh']):
        try:
            # Wait for page to load
            WebDriverWait(driver, 5).until(
                lambda x: 'Search' in driver.title or case_number in driver.title or 'Search Results:' in driver.title)
            # Page loaded
            if driver.title == 'Search':
                # Clicking search did not change the page. This could be because of a failed captcha attempt.
                try:
                    # Check if 'Invalid Captcha' dialog is showing
                    driver.find_element_by_xpath(
                        '//div[@class="alert alert-error"]')
                    print("Captcha was solved incorrectly")
                    captcha_solver.notify_last_captcha_fail()
                except NoSuchElementException:
                    pass
                # Clear cookies so a new captcha is presented upon refresh
                driver.delete_all_cookies()
                # Try solving the captcha again. Return the retry's result:
                # the original discarded it, so a successful retry still left
                # this call returning None and the caller skipped the case.
                return search_portal(case_number)
            elif 'Search Results: CaseNumber:' in driver.title:
                # Captcha solved correctly
                captcha_solver.notify_last_captcha_success()
                # Figure out the number of cases returned
                case_detail_tbl = driver.find_element_by_tag_name('table').text.split('\n')
                case_count_idx = case_detail_tbl.index('CASES FOUND') + 1
                case_count = int(case_detail_tbl[case_count_idx])
                # Case number search found multiple cases.
                if case_count > 1:
                    return ScraperUtils.get_associated_cases(driver)
                # Case number search found no cases
                else:
                    return set()
            elif case_number in driver.title:
                # Captcha solved correctly
                captcha_solver.notify_last_captcha_success()
                # Case number search did find a single court case.
                return {case_number}
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError('Case page could not be loaded after {} attempts, or unexpected page title: {}'.format(settings['connect-thresh'], driver.title))
            else:
                # Retry, propagating the result (previously discarded).
                return search_portal(case_number)
 def test_parse_plea_case_numbers__no_charge_mentioned(self):
     """A plea docket without SEQ numbers matches none of the known charges."""
     docket_text = "PLEA OF NOT GUILTY"
     assert ScraperUtils.parse_plea_case_numbers(docket_text, [1]) == []
 def test_parse_plea_case_numbers_one_charge_mentioned(self):
     """'SEQ 2' in the docket selects only charge count 2."""
     docket_text = "DEFENDANT ENTERED PLEA OF : NOLO-CONTENDERE SEQ 2"
     assert ScraperUtils.parse_plea_case_numbers(docket_text, [1, 2]) == [2]
 def test_parse_out_path_correct_length(self):
     """Paths produced from over-long filenames stay within the 256-char limit."""
     # 260 characters long before the extension. This is an invalid filename in Windows.
     filename_too_long = '0123456789' * 26
     parsed_path = ScraperUtils.parse_out_path(os.getcwd(), filename_too_long,
                                               'txt')
     assert len(parsed_path) <= 256
 def test_parse_out_path_filename_extension_shortening(self):
     """When the extension pushes the path one char over the limit, one char is trimmed."""
     # 252 characters long, but with the .pdf extension it becomes 256 characters long - one too many.
     long_name = '0123456789' * 25 + '01'
     parsed_filename = ScraperUtils.parse_out_path('', long_name, 'pdf')
     assert parsed_filename == '{}.{}'.format(long_name[:-1], 'pdf')
 def test_parse_plea_case_numbers_multiple_case_numbers(self):
     """'SEQ: 1,2,3,4,5' selects exactly those charge counts, not all known ones."""
     docket_text = "DEFENDANT ENTERED PLEA OF NOLO CONTENDERE SEQ: 1,2,3,4,5"
     known_charges = [1, 2, 3, 4, 5, 6]
     expected = [1, 2, 3, 4, 5]
     assert ScraperUtils.parse_plea_case_numbers(docket_text,
                                                 known_charges) == expected
def scrape_record(case_number):
    """
    Scrapes a record once the case has been opened.

    Reads module globals ``driver`` (Selenium WebDriver showing the case page),
    ``settings``, ``output_file`` and ``output_attachments``, and appends one
    Record row to the output CSV via ScraperUtils.write_csv().
    :param case_number: The current case's case number.
    :raises RuntimeError: if the summary or docket sections never load after
        ``settings['connect-thresh']`` attempts.
    """
    # Wait for court summary to load
    for i in range(settings['connect-thresh']):
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'summaryAccordion')))
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError('Summary details did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    # Get relevant page content
    summary_table_col1 = driver.find_elements_by_xpath('//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[1]/dl/dd')
    summary_table_col2 = driver.find_elements_by_xpath('//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[2]/dl/dd')
    summary_table_col3 = driver.find_elements_by_xpath('//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[3]/dl/dd')

    # Wait for court dockets to load
    for i in range(settings['connect-thresh']):
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'gridDocketsView')))
        except TimeoutException:
            if i == settings['connect-thresh'] - 1:
                raise RuntimeError('Dockets did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    charges_table = driver.find_elements_by_xpath('//*[@id="gridCharges"]/tbody/tr')
    docket_public_defender = driver.find_elements_by_xpath(
        "//*[contains(text(), 'COURT APPOINTED ATTORNEY') and contains(text(), 'ASSIGNED')]")
    docket_attorney = driver.find_elements_by_xpath("//*[contains(text(), 'DEFENSE') and contains(text(), 'ASSIGNED')]")
    docket_pleas = driver.find_elements_by_xpath("//*[contains(text(), 'PLEA OF')]")
    docket_attachments = driver.find_elements_by_class_name('casedocketimage')

    _id = str(uuid.uuid4())
    _state = settings['state-code']
    _county = settings['county']
    # NOTE(review): summary values are read by fixed position in the portal's
    # summary table — this breaks silently if the portal layout changes.
    CaseNum = summary_table_col2[1].text.strip()
    AgencyReportNum = summary_table_col1[4].text.strip()
    ArrestDate = None  # Can't be found on this portal
    FilingDate = summary_table_col1[2].text.strip()
    OffenseDate = None  # Can't be found on this portal
    DivisionName = summary_table_col3[3].text.strip()
    CaseStatus = summary_table_col3[1].text.strip()

    # Personally-identifying fields (attorneys, judge, attachments) are only
    # collected when the operator opted in via settings['collect-pii'].
    if settings['collect-pii']:
        # Create list of assigned defense attorney(s)
        defense_attorney_text = list(map(lambda x: x.text, docket_attorney))
        DefenseAttorney = ScraperUtils.parse_attorneys(defense_attorney_text)
        # Create list of assigned public defenders / appointed attorneys
        public_defender_text = list(map(lambda x: x.text, docket_public_defender))
        PublicDefender = ScraperUtils.parse_attorneys(public_defender_text)
        # Get Judge
        Judge = summary_table_col1[0].text.strip()

        # Download docket attachments.
        # Todo(OscarVanL): This could be parallelized to speed up scraping if save-attachments is set to 'all'.
        if settings['save-attachments']:
            for attachment_link in docket_attachments:
                attachment_text = attachment_link.find_element_by_xpath('./../../td[3]').text.strip()
                if settings['save-attachments'] == 'filing':
                    if not ('CITATION FILED' in attachment_text or 'CASE FILED' in attachment_text):
                        # Attachment is not a filing, don't download it.
                        continue
                ScraperUtils.save_attached_pdf(driver, output_attachments, '{}-{}'.format(case_number, attachment_text),
                                               settings['portal-base'], attachment_link, 20, settings['verbose'])
    else:
        DefenseAttorney = []
        PublicDefender = []
        Judge = None

    Charges = {}
    for charge in charges_table:
        charge_details = charge.find_elements_by_tag_name('td')
        count = int(charge_details[0].text.strip())
        long_desc = charge_details[1].text.strip()
        # Statute is contained within brackets
        if '(' in long_desc and ')' in long_desc:
            statute = long_desc[long_desc.find('(') + 1:long_desc.find(')')]
        else:
            statute = None
        # Description is everything before the first '(' (any trailing space
        # is kept as-is here).
        description = long_desc.split('(')[0]
        level = charge_details[2].text.strip()
        degree = charge_details[3].text.strip()
        # plea = charge_details[4].text.strip() # Plea is not filled out on this portal.
        disposition = charge_details[5].text.strip()
        disposition_date = charge_details[6].text.strip()
        offense_date = None  # Not shown on this portal
        citation_number = None  # Not shown on this portal
        Charges[count] = Charge(count, statute, description, level, degree, disposition, disposition_date, offense_date,
                                citation_number, None, None)

    # Pleas are not in the 'plea' field, but instead in the dockets.
    for plea_element in docket_pleas:
        plea_text = plea_element.text.strip()
        plea = ScraperUtils.parse_plea_type(plea_text)
        plea_date = plea_element.find_element_by_xpath('./../td[2]').text.strip()
        plea_number = ScraperUtils.parse_plea_case_numbers(plea_text, list(Charges.keys()))

        # If no case number is specified in the plea, then we assume it applies to all charges in the trial.
        if len(plea_number) == 0:
            for charge in Charges.values():
                charge.plea = plea
                charge.plea_date = plea_date
        else:
            # Apply plea to relevant charge count(s).
            for count in plea_number:
                Charges[count].plea = plea
                Charges[count].plea_date = plea_date

    ArrestingOfficer = None  # Can't be found on this portal
    ArrestingOfficerBadgeNumber = None  # Can't be found on this portal

    # Locate the defendant's party-details link inside the parties grid.
    profile_link = driver.find_element_by_xpath("//table[@id='gridParties']/tbody/tr/*[contains(text(), 'DEFENDANT')]/../td[2]/div/a").get_attribute(
       'href')
    # profile_link = driver.find_element_by_xpath('//*[@id="gridParties"]/tbody/tr[1]/td[2]/div[1]/a').get_attribute(
    #     'href')
    load_page(profile_link, 'Party Details:', settings['verbose'])

    Suffix = None
    DOB = None  # This portal has DOB as N/A for every defendent
    # Race and sex are read from fixed cells of the party-details tables.
    Race = driver.find_element_by_xpath(
        '//*[@id="fd-table-2"]/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[7]/td[2]').text.strip()
    Sex = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[6]/td[2]').text.strip()
    FirstName = None
    MiddleName = None
    LastName = None
    PartyID = None

    # Only collect PII if configured
    if settings['collect-pii']:
        # Navigate to party profile
        full_name = driver.find_element_by_xpath(
            '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[1]/td[2]').text.strip()
        MiddleName = None
        LastName = None
        # Names are formatted 'LAST, FIRST [MIDDLE ...]'.
        if ',' in full_name:
            name_split = full_name.split(',')[1].lstrip().split()
            FirstName = name_split[0]
            MiddleName = " ".join(name_split[1:])
            LastName = full_name.split(',')[0]
        else:
            # If there's no comma, it's a corporation name.
            FirstName = full_name
        PartyID = driver.find_element_by_xpath(
            '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[8]/td[2]').text.strip()  # PartyID is a field within the portal system to uniquely identify defendants

    record = Record(_id, _state, _county, case_number, CaseNum, AgencyReportNum, PartyID, FirstName, MiddleName,
                    LastName, Suffix, DOB, Race, Sex, ArrestDate, FilingDate, OffenseDate, DivisionName, CaseStatus,
                    DefenseAttorney, PublicDefender, Judge, list(Charges.values()), ArrestingOfficer,
                    ArrestingOfficerBadgeNumber)
    ScraperUtils.write_csv(output_file, record, settings['verbose'])
 def test_parse_plea_case_numbers_messy(self):
     """A long, messy real-world docket with no SEQ numbers matches no charges."""
     # Test a really ugly plea docket I found in one case
     docket_text = "PLEA OF NOT GUILTY/DENIAL, WAIVER OF ARRAIGNMENT, DEMAND FOR NOTICE OF EXPERT TESTIMONY, DEMAND FOR DISCOVERY, DEMAND FOR STATEMENT OF PARTICULARS, DEMAND FOR JURY TRIAL, DESIGNATION OF E-MAIL ADDRESSES PURSUANT TO RULE 2.516 1/28/2020"
     assert ScraperUtils.parse_plea_case_numbers(docket_text, [1, 2, 3]) == []
 def test_parse_plea_type_blank(self):
     """An empty docket string has no recognizable plea type."""
     blank_docket = ''
     assert ScraperUtils.parse_plea_type(blank_docket) is None
# Example n. 27
# 0
def scrape_record(case_number):
    """
    Scrapes a record once the case has been opened.

    Reads module globals ``driver`` (Selenium WebDriver showing the case page),
    ``FLAGS`` (absl-style configuration) and ``output_attachments``; builds a
    record via BenchmarkRecordBuilder and appends it to the output CSV.
    :param case_number: The current case's case number.
    :raises RuntimeError: if the summary or docket sections never load after
        ``FLAGS.connect_thresh`` attempts.
    """
    # Wait for court summary to load
    for i in range(FLAGS.connect_thresh):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, 'summaryAccordion')))
        except TimeoutException:
            if i == FLAGS.connect_thresh - 1:
                raise RuntimeError(
                    'Summary details did not load for case {}.'.format(
                        case_number))
            else:
                driver.refresh()

    # Get relevant page content
    summary_table_col1 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[1]/dl/dd')
    summary_table_col2 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[2]/dl/dd')
    summary_table_col3 = driver.find_elements_by_xpath(
        '//*[@id="summaryAccordionCollapse"]/table/tbody/tr/td[3]/dl/dd')

    # Wait for court dockets to load
    for i in range(FLAGS.connect_thresh):
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, 'gridDocketsView')))
        except TimeoutException:
            if i == FLAGS.connect_thresh - 1:
                raise RuntimeError(
                    'Dockets did not load for case {}.'.format(case_number))
            else:
                driver.refresh()

    charges_table = driver.find_elements_by_xpath(
        '//*[@id="gridCharges"]/tbody/tr')
    docket_public_defender = driver.find_elements_by_xpath(
        "//*[contains(text(), 'COURT APPOINTED ATTORNEY') and contains(text(), 'ASSIGNED')]"
    )
    docket_attorney = driver.find_elements_by_xpath(
        "//*[contains(text(), 'DEFENSE') and contains(text(), 'ASSIGNED')]")
    docket_pleas = driver.find_elements_by_xpath(
        "//*[contains(text(), 'PLEA OF')]")
    docket_attachments = driver.find_elements_by_class_name('casedocketimage')

    # NOTE(review): summary values are read by fixed position in the portal's
    # summary table — this breaks silently if the portal layout changes.
    r = BenchmarkRecordBuilder()
    r.id = str(uuid.uuid4())
    r.state = FLAGS.state
    r.county = FLAGS.county
    r.portal_id = case_number
    r.case_num = Pii.String(summary_table_col2[1].text.strip())
    r.agency_report_num = summary_table_col1[4].text.strip()
    r.arrest_date = None  # Can't be found on this portal
    r.filing_date = summary_table_col1[2].text.strip()
    r.offense_date = None  # Can't be found on this portal
    r.division_name = summary_table_col3[3].text.strip()
    r.case_status = summary_table_col3[1].text.strip()

    # Create list of assigned defense attorney(s)
    defense_attorney_text = list(map(lambda x: x.text, docket_attorney))
    r.defense_attorney = ScraperUtils.parse_attorneys(defense_attorney_text)
    # Create list of assigned public defenders / appointed attorneys
    public_defender_text = list(map(lambda x: x.text, docket_public_defender))
    r.public_defender = ScraperUtils.parse_attorneys(public_defender_text)
    # Get Judge
    r.judge = Pii.String(summary_table_col1[0].text.strip())

    # Download docket attachments.
    # Todo(OscarVanL): This could be parallelized to speed up scraping if save-attachments is set to 'all'.
    if FLAGS.save_attachments:
        for attachment_link in docket_attachments:
            attachment_text = attachment_link.find_element_by_xpath(
                './../../td[3]').text.strip()
            if FLAGS.save_attachments == 'filing':
                if not ('CITATION FILED' in attachment_text
                        or 'CASE FILED' in attachment_text):
                    # Attachment is not a filing, don't download it.
                    continue
            ScraperUtils.save_attached_pdf(
                driver, output_attachments,
                '{}-{}'.format(case_number, attachment_text),
                FLAGS.portal_base, attachment_link, 20, FLAGS.verbose)

    # Build one Charge per row of the charges grid, keyed by its count number.
    Charges = {}
    for charge in charges_table:
        charge_builder = ChargeBuilder()
        charge_cols = charge.find_elements_by_tag_name('td')
        count = int(charge_cols[0].text.strip())
        charge_builder.count = count

        charge_desc = charge_cols[1].text
        charge_builder.description, charge_builder.statute = (
            ScraperUtils.parse_charge_statute(charge_desc))
        charge_builder.level = charge_cols[2].text.strip()
        charge_builder.degree = charge_cols[3].text.strip()
        # plea = charge_cols[4].text.strip() # Plea is not filled out on this portal.
        charge_builder.disposition = charge_cols[5].text.strip()
        charge_builder.disposition_date = charge_cols[6].text.strip()
        Charges[count] = charge_builder.build()
    r.charges = list(Charges.values())

    # Pleas are not in the 'plea' field, but instead in the dockets.
    for plea_element in docket_pleas:
        plea_text = plea_element.text.strip()
        plea = ScraperUtils.parse_plea_type(plea_text)
        plea_date = plea_element.find_element_by_xpath(
            './../td[2]').text.strip()
        plea_number = ScraperUtils.parse_plea_case_numbers(
            plea_text, list(Charges.keys()))

        # If no case number is specified in the plea, then we assume it applies to all charges in the trial.
        if len(plea_number) == 0:
            for charge in Charges.values():
                charge.plea = plea
                charge.plea_date = plea_date
        else:
            # Apply plea to relevant charge count(s).
            for count in plea_number:
                Charges[count].plea = plea
                Charges[count].plea_date = plea_date

    r.arresting_officer = None  # Can't be found on this portal
    r.arresting_officer_badge_number = None  # Can't be found on this portal

    # Locate the defendant's party-details link inside the parties grid.
    profile_link = driver.find_element_by_xpath(
        "//table[@id='gridParties']/tbody/tr/*[contains(text(), 'DEFENDANT')]/../td[2]/div/a"
    ).get_attribute('href')
    # profile_link = driver.find_element_by_xpath('//*[@id="gridParties"]/tbody/tr[1]/td[2]/div[1]/a').get_attribute(
    #     'href')
    load_page(profile_link, 'Party Details:', FLAGS.verbose)

    r.suffix = None
    r.dob = None  # This portal has DOB as N/A for every defendent
    # Race and sex are read from fixed cells of the party-details tables.
    r.race = driver.find_element_by_xpath(
        '//*[@id="fd-table-2"]/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[7]/td[2]'
    ).text.strip()
    r.sex = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[6]/td[2]'
    ).text.strip()

    # Navigate to party profile
    full_name = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[1]/td[2]'
    ).text.strip()
    r.middle_name = None
    r.last_name = None
    # Names are formatted 'LAST, FIRST [MIDDLE ...]'.
    if ',' in full_name:
        r.first_name, r.middle_name, r.last_name = ScraperUtils.parse_name(
            full_name)
    else:
        # If there's no comma, it's a corporation name.
        r.first_name = Pii.String(full_name)
    r.party_id = driver.find_element_by_xpath(
        '//*[@id="mainTableContent"]/tbody/tr/td/table/tbody/tr[2]/td[2]/table[2]/tbody/tr/td[2]/table/tbody/tr[8]/td[2]'
    ).text.strip(
    )  # PartyID is a field within the portal system to uniquely identify defendants

    record = r.build()
    ScraperUtils.write_csv(FLAGS.output, record, FLAGS.verbose)