Beispiel #1
0
    def _post_parse(self):
        """Unfortunately, some of the items do not have audio files despite
        appearing in the table and having a link to a supplementary audio page.

        For these items, we set the download_url to '' and this method finds
        the related information for those items and then removes it from all
        the other attributes for the Site object.
        """
        # Start by checking sanity. This will make sure we don't mess things
        # up. If this sanity check fails, we'll know things were messed up
        # before we began tinkering with them.
        self._check_sanity()

        # Items are purged in two steps. First, we identify the index of the
        # items that need purging.
        purge_indexes = []
        for i, url in enumerate(self.download_urls):
            if not url:
                purge_indexes.append(i)

        # Quick check: We did find *some* urls, right?
        if len(purge_indexes) == len(self.download_urls):
            raise InsanityException("Didn't get any download URLs. Looks like "
                                    "something is wrong in the _post_parse() "
                                    "method.")

        # Second, we purge them, beginning at the end and moving forwards. This
        # ensures that we don't delete the wrong items.
        for index_to_purge in sorted(purge_indexes, reverse=True):
            for attr in self._all_attrs:
                item = getattr(self, attr)
                if item is not None:
                    # If we've added stuff to it, then delete the key.
                    del item[index_to_purge]
Beispiel #2
0
    def parse_name_from_text(text_list):
        """Return the case name found in *text_list*.

        Tries progressively looser regexes against each candidate
        string; if none matches, falls back on whatever precedes the
        first semi-colon.

        :param text_list: list of candidate strings to parse
        :raises InsanityException: if no candidate yields a name
        """
        # Raw strings so regex escapes (\s, \d, \w) are not treated as
        # (invalid) string escape sequences.
        regexes = [
            # Expected format
            r'(.*?)(,?\sNos?\.)(.*?)',
            # Clerk typo, forgot "No."/"Nos." substring
            r'(.*?)(,?\s\d+-\d+(,|\s))(.*?)',
            # Same as above, and there's an unconventional docket number
            # like 'SU-14-324' instead of '14-324'. See ri_p_example_4.html
            r'(.*?)(,?\s(?:\w+-)?\d+-\d+(,|\s))(.*?)',
        ]

        for regex in regexes:
            for text in text_list:
                name_match = re.match(regex, text)
                if name_match:
                    return name_match.group(1)

        # "No."/"Nos." and docket missing, fall back on whatever's before first
        # semi-colon
        for text in text_list:
            if ';' in text:
                return text.split(';')[0]

        raise InsanityException('Could not parse name from string: "%s"' %
                                text_list)
Beispiel #3
0
 def _extract_name_from_text(cls, text):
     try:
         match = re.match(cls.regex, text).group(12)
     except:
         raise InsanityException('Unable to parse case name from "%s"' %
                                 text)
     return match.strip().rstrip('.')
Beispiel #4
0
 def _extract_docket_from_text(cls, text):
     try:
         match = re.match(cls.regex, text).group(6)
     except:
         raise InsanityException('Unable to parse docket from "%s"' % text)
     dockets_raw = match.rstrip('.').replace('&', ' ').replace(',', ' ')
     dockets = dockets_raw.split()
     return ', '.join(dockets)
Beispiel #5
0
 def return_opinion_path(self):
     """Return the first known xpath that matches an opinion listing."""
     candidates = (
         '//select/option[contains(@value, ".pdf")]',
         '//ul/li/a[contains(@href, ".pdf")]',
     )
     for candidate in candidates:
         if self.html.xpath(candidate):
             return candidate
     raise InsanityException('No recognized path to opinion listings')
 def set_table_headers(self, html):
     """Populate self.headers from the table's <th> cells.

     Does nothing when the table is absent; raises when one of the
     required column headers is missing.
     """
     # Do nothing if table is missing
     if not html.xpath(self.path_table):
         return
     header_cells = html.xpath('%s//th' % self.path_table)
     self.headers = [cell.text_content().strip() for cell in header_cells]
     # Ensure that expected/required headers are present
     if not set(self.required_headers).issubset(self.headers):
         raise InsanityException('Required table column missing')
Beispiel #7
0
 def return_section_path(self):
     """Return the first known xpath that matches an opinion section."""
     candidates = (
         '//div[contains(@class, "panel-default")]',
         '//td[contains(p/@class, "center")]',
         '//td[contains(p/@align, "center")]',
         '//td[contains(h2/@class, "center")]',
         '//div[contains(h3/@class, "center")]',
         '//div[contains(h3/@align, "center")]',
     )
     for candidate in candidates:
         if self.html.xpath(candidate):
             return candidate
     raise InsanityException('No recognized path to opinion sections')
Beispiel #8
0
 def return_year_sub_path(self):
     """Return the first known sub-xpath locating the year string."""
     section = self.html.xpath(self.section_path)[0]
     candidates = (
         './div[contains(@class, "panel-heading")]/label',
         './p[contains(@class, "center")]/strong',
         './p[contains(@align, "center")]/font/b',
         './h2[contains(@class, "center")]',
         './h3[contains(@class, "center")]',
         './h3[contains(@align, "center")]',
     )
     for candidate in candidates:
         if section.xpath(candidate):
             return candidate
     raise InsanityException('No recognized path to year string')
Beispiel #9
0
    def parse_date_from_text(self, text_list):
        """Return the first parenthesized date found in *text_list*.

        Falls back on the previous case's date when no string matches.

        :param text_list: list of candidate strings to parse
        :raises InsanityException: when nothing matches and there is no
            previous date to fall back on
        """
        # Raw string so the regex escapes (\(, \w, \d, \,) are left
        # intact instead of being (invalid) string escape sequences.
        regex = r'(.*?)(\((\w+\s+\d+\,\s+\d+)\))(.*?)'
        for text in text_list:
            date_match = re.match(regex, text)
            if date_match:
                return convert_date_string(date_match.group(3))

        # Fall back on previous case's date
        if self.previous_date:
            return self.previous_date

        raise InsanityException(
            'Could not parse date from string, and no previous date to fall '
            'back on: "%s"' % text_list
        )
Beispiel #10
0
    def _download(self, request_dict=None):
        """This is another of the cursed MS asp.net pages with damned POST
          parameters like __EVENTVALIDATION. These are near impossible to
          scrape without using Selenium.

        :param request_dict: optional request parameters passed through to
            the parent class when running in LOCAL mode
        :return: lxml html tree of the rendered results page
        :raises InsanityException: when the requested month is not yet
            available in the portal's drop-down
        """
        # None instead of a mutable {} default, which would be shared
        # across calls.
        if request_dict is None:
            request_dict = {}
        if self.method == 'LOCAL':
            return super(Site, self)._download(request_dict=request_dict)
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
            # Without these args, when you get self.url, you'll still be at
            # about:config because the SSL on this site is so terrible.
            service_args=[
                '--ignore-ssl-errors=true', '--ssl-protocol=tlsv1'
            ],
        )
        try:
            driver.implicitly_wait(30)
            logger.info("Now downloading case page at: %s" % self.url)
            driver.get(self.url)

            # Select the correct drop downs, then submit.
            path_to_opinion_type = "//select[@id='ddlTypes']/option[@value='{type}']".format(
                type=self.opinion_type)
            driver.find_element_by_xpath(path_to_opinion_type).click()
            path_to_date = "//select[@id='ddlMonths']/option[@value='{d}']".format(
                d=self.release_date)

            try:
                driver.find_element_by_xpath(path_to_date).click()
            except NoSuchElementException:
                # This is not uncommon early in the month (or if there are
                # no opinions published in the current month), so failures
                # resulting from this raise can probably be ignored.
                warning = 'Current month (%s) not yet available in portal--common occurrence early in the month.'
                raise InsanityException(warning % self.release_date)

            path_to_submit = "//input[@id='cmdSearch']"
            driver.find_element_by_xpath(path_to_submit).click()

            # Selenium doesn't give us the actual code, we have to hope.
            self.status = 200

            text = self._clean_text(driver.page_source)
        finally:
            # Always release the PhantomJS process; the original leaked it
            # whenever a lookup or click raised.
            driver.quit()
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(fix_links_in_lxml_tree,
                                base_href=self.request['url'])
        return html_tree
Beispiel #11
0
 def parse_title(txt):
     """Split *txt* into (case_name, neutral_cite, docket_number).

     Expects text shaped like 'Name, Cite (Docket)'. The neutral cite is
     '' when absent or when it doesn't look like a citation.

     :raises InsanityException: wrapping any parse failure's traceback
     """
     try:
         name_and_citation = txt.rsplit('(', 1)[0].strip()
         # Raw strings so the regex \d escapes are not (invalid) string
         # escape sequences.
         docket_number = re.search(r'(.*\d).*?',
                                   txt.rsplit('(', 1)[1]).group(0).strip()
         case_name = name_and_citation.rsplit(",", 1)[0].strip()
         try:
             neutral_cite = name_and_citation.rsplit(",", 1)[1].strip()
             # A plausible citation starts and ends with digit pairs.
             if not re.search(r'^\d\d.*\d\d$', neutral_cite):
                 neutral_cite = ''
         except IndexError:
             # Unable to find comma to split on. No neutral cite.
             neutral_cite = ''
     except Exception:
         # Not a bare except: KeyboardInterrupt/SystemExit still propagate.
         raise InsanityException("Unable to parse: %s\n%s" %
                                 (txt, traceback.format_exc()))
     return case_name, neutral_cite, docket_number
Beispiel #12
0
    def _normalize_dockets(dockets):
        #This page lists these about five different ways, normalizing:
        dockets = re.sub(r'Nos?\.', '', dockets)

        result = []
        for docket in dockets.split(", "):
            docket = docket.strip()
            if re.match(r"^\d+$", docket):  # number
                result.append(docket)
            elif re.match(r"^\d+\-\d+$", docket):  # number-number
                result.append(docket)
            elif re.match(r"^\d+\,\d+$", docket):  # number,number
                # fix the docket number
                docket = docket.replace(",", "-")
                result.append(docket)
            else:
                raise InsanityException("Unknown docket number format '%s'" %
                                        (docket, ))

        # reassemble the docket numbers into one string
        return ", ".join(result)
Beispiel #13
0
 def _get_case_dates(self):
     case_dates = []
     path = "//table[@id = 'onetidDoclibViewTbl0']/tr[position() > 1]/td/a/span/text()"
     previous = None
     error_count = 0
     for s in self.html.xpath(path):
         try:
             date_string = re.search(self.regex, s, re.MULTILINE).group(3)
             d = datetime.strptime(date_string.strip(),
                                   '(%B %d, %Y)').date()
             case_dates.append(d)
             previous = d
             error_count = 0
         except AttributeError:
             # Happens when the regex fails. Use the previous date and set
             # error_count back to zero.
             error_count += 1
             if error_count == 2:
                 raise InsanityException(
                     "Regex appears to be failing in Rhode Island")
             else:
                 case_dates.append(previous)
     return case_dates
Beispiel #14
0
 def return_url_path(self):
     """Return an xpath selecting the url attribute of the opinion links."""
     attribute_by_marker = (
         ('/option', '@value'),
         ('/li/a', '@href'),
     )
     for marker, attribute in attribute_by_marker:
         if marker in self.opinion_path:
             return '%s/%s' % (self.opinion_path, attribute)
     raise InsanityException('No recognized path to url')