Example #1
    def _process_html(self):
        for item in self.html.xpath('//li[contains(.//a/@href, ".pdf")]'):
            text = clean_string(item.text_content())
            date_string = " ".join(text.split()[0:3])
            try:
                convert_date_string(date_string)
            except Exception:
                raise InsanityException('Unexpected text format: "%s"' % text)
            docket_name = text.replace(date_string, "").strip().lstrip("-")

            # sometimes the records include a docket number(s) as the
            # first words in the second half of the hyphenated string,
            # but some don't include a docket at all.  So we test to see
            # if the first word is numeric (minus the slash characters
            # used to conjoin multiple docket numbers).
            docket, name = docket_name.split(None, 1)
            first_word = docket.replace("/", "")
            if not first_word.isnumeric():
                docket = ""
                name = docket_name

            self.cases.append(
                {
                    "date": date_string,
                    "docket": docket,
                    "name": name,
                    "url": item.xpath(".//a/@href")[0],
                }
            )
Example #2
 def is_this_skippable_date_anchor(self, text):
     """Return true is link text is parsible date"""
     try:
         convert_date_string(text)
         return True
     except Exception:
         pass
     return False
Example #4
 def _parse_date_from_cell_text(self, cell_text):
     date = False
     for text in cell_text:
         try:
             date = convert_date_string(text.strip())
             break
         except ValueError:
             pass
     return date
Example #5
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        #
        # PLEASE NOTE: if you're adding a new example file and any of the
        # opinion links on the page do not link directly to a pdf url,
        # manually edit your example file and add '.pdf' to the end of
        # those opinion anchor hrefs. We do this to prevent the tests from
        # hitting the network.  HINT: if your new test takes more than a
        # split second to run, it's probably hitting the network and needs
        # to be fixed as explained above.
        #
        # PLEASE ALSO NOTE: coloctapp_example_3.html is supposed to have 0
        # results.  It is a blank page test case covering is_this_a_blank_page().
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            hrefs = html_l.xpath(self.base_path)
            for ahref in hrefs:
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # DEACTIVATED BY arderyp ON 2018.06.07, SEE NOTE ON get_next_page()
                # # process all sub-pages
                # if self.next_subpage_path and self.method != 'LOCAL':
                #     while True:
                #         next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                #         if next_subpage_html is None:
                #             break
                #
                #         self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                #         html_trees.append((next_subpage_html, date_obj))
                #         html_tree = next_subpage_html

        return html_trees
Example #6
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        #
        # PLEASE NOTE: if you're adding a new example file and any of the
        # opinion links on the page do not link directly to a pdf url,
        # manually edit your example file and add '.pdf' to the end of
        # those opinion anchor hrefs. We do this to prevent the tests from
        # hitting the network.  HINT: if your new test takes more than a
        # split second to run, it's probably hitting the network and needs
        # to be fixed as explained above.
        #
        # PLEASE ALSO NOTE: coloctapp_example_3.html is supposed to have 0
        # results.  It is a blank page test case covering is_this_a_blank_page().
        if self.test_mode_enabled():
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            hrefs = html_l.xpath(self.base_path)
            for ahref in hrefs:
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # DEACTIVATED BY arderyp ON 2018.06.07, SEE NOTE ON get_next_page()
                # # process all sub-pages
                # if self.next_subpage_path and self.test_mode_enabled():
                #     while True:
                #         next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                #         if next_subpage_html is None:
                #             break
                #
                #         self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                #         html_trees.append((next_subpage_html, date_obj))
                #         html_tree = next_subpage_html

        return html_trees
Example #7
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        # PLEASE NOTE: Tests still hit the network in _extract_cases_from_sub_page
        # because the awful court site doesn't link directly to PDF resources
        # on date listing pages
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            for ahref in html_l.xpath(self.base_path):
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # process all sub-pages
                if self.next_subpage_path is not None and self.method != 'LOCAL':
                    while True:
                        next_subpage_html = self.get_next_page(
                            html_tree, self.next_subpage_path, request_dict,
                            url)
                        if next_subpage_html is None:
                            break
                        self._extract_cases_from_sub_page(
                            next_subpage_html, date_obj)
                        html_trees.append((next_subpage_html, date_obj))
                        html_tree = next_subpage_html

            if self.method != 'LOCAL':
                next_page_html = self.get_next_page(html_l,
                                                    self.next_page_path,
                                                    request_dict, self.url)
                if next_page_html is not None:
                    html_list.append(next_page_html)

        return html_trees
Example #8
 def test_fix_future_year_typo(self):
     correct = str(datetime.date.today().year)
     transposed = correct[0] + correct[2] + correct[1] + correct[3]
     expectations = {
         '12/01/%s' % transposed: '12/01/%s' % correct,  # Here's the fix
         '12/01/%s' % correct: '12/01/%s' % correct,     # Should not change
         '12/01/2806': '12/01/2806',                     # Should not change
         '12/01/2886': '12/01/2886',                     # Should not change
     }
     for before, after in expectations.items():
         fixed_date = fix_future_year_typo(convert_date_string(before))
         self.assertEqual(fixed_date, convert_date_string(after))
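The fix_future_year_typo helper under test is not shown in this listing. A minimal sketch consistent with the expectations above, assuming the helper only corrects a parsed date whose year is the current year with its middle digits transposed (e.g. 2106 for 2016) and leaves every other year untouched:

import datetime

def fix_future_year_typo(date_obj):
    # Hypothetical sketch, not the actual implementation: swap the middle
    # digits of a four-digit future year back if that yields the current year.
    current_year = datetime.date.today().year
    year = str(date_obj.year)
    if len(year) == 4 and date_obj.year > current_year:
        swapped = int(year[0] + year[2] + year[1] + year[3])
        if swapped == current_year:
            return date_obj.replace(year=current_year)
    return date_obj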
Example #9
 def test_fix_future_year_typo(self):
     correct = str(datetime.date.today().year)
     transposed = correct[0] + correct[2] + correct[1] + correct[3]
     expectations = {
         "12/01/%s" % transposed: "12/01/%s" % correct,  # Here's the fix
         "12/01/%s" % correct: "12/01/%s" % correct,  # Should not change
         "12/01/2806": "12/01/2806",  # Should not change
         "12/01/2886": "12/01/2886",  # Should not change
     }
     for before, after in expectations.items():
         fixed_date = fix_future_year_typo(convert_date_string(before))
         with self.subTest("Future years", before=before):
             self.assertEqual(fixed_date, convert_date_string(after))
Example #10
 def test_split_date_range_string(self):
     tests = {
         'October - December 2016': convert_date_string('November 16, 2016'),
         'July - September 2016': convert_date_string('August 16, 2016'),
         'April - June 2016': convert_date_string('May 16, 2016'),
         'January March 2016': False,
     }
     for before, after in tests.items():
         if after:
             self.assertEqual(split_date_range_string(before), after)
         else:
             with self.assertRaises(Exception):
                 split_date_range_string(before)
Example #11
 def test_split_date_range_string(self):
     tests = {
         "October - December 2016":
         convert_date_string("November 16, 2016"),
         "July - September 2016": convert_date_string("August 16, 2016"),
         "April - June 2016": convert_date_string("May 16, 2016"),
         "January March 2016": False,
     }
     for before, after in list(tests.items()):
         if after:
             self.assertEqual(split_date_range_string(before), after)
         else:
             with self.assertRaises(Exception):
                 split_date_range_string(before)
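split_date_range_string itself is not included in this listing. Judging from the expectations above, it resolves a "Month - Month Year" quarter label to a date in the middle of the range and raises on anything it cannot split. A hypothetical sketch (the 16th-of-the-middle-month convention is inferred from the test data):

def split_date_range_string(date_range):
    # Hypothetical sketch: 'October - December 2016' -> November 16, 2016.
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']
    parts = date_range.split()
    if len(parts) != 4 or parts[1] != '-' or not set(parts[::2]) <= set(months):
        raise Exception('Unrecognized date range: "%s"' % date_range)
    middle = months[(months.index(parts[0]) + months.index(parts[2])) // 2]
    return convert_date_string('%s 16, %s' % (middle, parts[3]))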
Example #12
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        # PLEASE NOTE: Tests still hit the network in _extract_cases_from_sub_page
        # because the awful court site doesn't link directly to PDF resources
        # on date listing pages
        # PLEASE ALSO NOTE: coloctapp_example_3.html is supposed to have 0
        # results.  It is a blank page test case covering is_this_a_blank_page().
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            hrefs = html_l.xpath(self.base_path)
            for ahref in hrefs:
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # DEACTIVATED BY arderyp ON 2018.06.07, SEE NOTE ON get_next_page()
                # # process all sub-pages
                # if self.next_subpage_path and self.method != 'LOCAL':
                #     while True:
                #         next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                #         if next_subpage_html is None:
                #             break
                #
                #         self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                #         html_trees.append((next_subpage_html, date_obj))
                #         html_tree = next_subpage_html

        return html_trees
Example #14
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        # PLEASE NOTE: Tests still hit the network in _extract_cases_from_sub_page
        # because the awful court site doesn't link directly to PDF resources
        # on date listing pages
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            for ahref in html_l.xpath(self.base_path):
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # process all sub-pages
                if self.next_subpage_path is not None and self.method != 'LOCAL':
                    while True:
                        next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                        if next_subpage_html is None:
                            break
                        self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                        html_trees.append((next_subpage_html, date_obj))
                        html_tree = next_subpage_html

            if self.method != 'LOCAL':
                next_page_html = self.get_next_page(html_l, self.next_page_path, request_dict, self.url)
                if next_page_html is not None:
                    html_list.append(next_page_html)

        return html_trees
Example #15
 def test_ordering_by_date_filed(self):
     """Can we change the ordering?"""
     # First try both orderings in areb (where things have special cases) and
     # ded (Delaware) where things are more normal.
     tests = (
         {'court': 'areb', 'count': 1},
         {'court': 'ded', 'count': 4}
     )
     for test in tests:
         report = self.reports[test['court']]
         some_date = convert_date_string(self.valid_dates[test['court']])
         responses = report.query(some_date, some_date, sort='date_filed')
         results = report.parse(responses)
         self.assertEqual(
             test['count'],
             len(results),
             'Should get %s response for %s' % (test['count'], test['court'])
         )
         responses = report.query(some_date, some_date, sort='case_number')
         results = report.parse(responses)
         self.assertEqual(
             test['count'],
             len(results),
              'Should get %s response for %s' % (test['count'], test['court'])
         )
Example #16
 def _get_cases_from_page(self):
     cases = []
     if not self.html.xpath('//body/a'):
         # Exit early for months with no cases (January 2009)
         return cases
     # Initialize anchor-derived fields so the guard below cannot raise NameError
     case_date = name = url = docket = None
     citation_pattern = r'^.{0,5}(\d{4} ND (?:App )?\d{1,4})'
     for element in self.html.xpath('//body/a|//body/font|//body/text()'):
         if hasattr(element, 'tag'):
             if element.tag == 'font' and element.text:
                 case_date = convert_date_string(element.text)
             elif element.tag == 'a' and case_date:
                 name = element.xpath('text()')[0].strip()
                 url = element.xpath('@href')[0]
                 docket = url.split('/')[-1].split('.')[0]
         else:
             # Clean up text to make sure only single spaces between words
             # to ensure that regex pattern works even if clerk accidentally
             # types a tab or multiple spaces
             text = ' '.join(element.strip().split())
             found_citation = re.search(citation_pattern, text, re.MULTILINE)
             if found_citation and found_citation.group(1):
                 citation = found_citation.group(1)
                 if self._should_scrape_case(citation) and name and case_date and docket:
                     cases.append({
                         'citation': citation,
                         'name': name,
                         'date': case_date,
                         'download': 'http://www.ndcourts.gov/wp/%s.wpd' % docket,
                         'docket': docket
                     })
     return cases
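For reference, the citation_pattern above allows up to five leading characters (bullet numbers, stray punctuation) before a North Dakota neutral citation, with an optional "App" segment for the Court of Appeals:

import re

citation_pattern = r'^.{0,5}(\d{4} ND (?:App )?\d{1,4})'
re.search(citation_pattern, '12. 2016 ND App 45', re.MULTILINE).group(1)
# -> '2016 ND App 45'
re.search(citation_pattern, '2009 ND 3 Smith v. Jones', re.MULTILINE).group(1)
# -> '2009 ND 3'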
Example #17
 def _get_case_dates(self):
     dates = []
     path = "%s//td[3]" % self.path_root
     for cell in self.html.xpath(path):
         date_string = cell.text_content().replace("Aguust", "August")
         dates.append(convert_date_string(date_string))
     return dates
Example #18
 def _get_case_dates(self):
     path = "{base}/td[1]/text()".format(base=self.base)
     return [
         convert_date_string(date.strip())
         for date in self.html.xpath(path)
         if date.strip()
     ]
Example #19
 def _get_case_dates(self):
     dates = []
     path = '%s/td[3]' % self.row_base_path
     for html_tree in self.html:
         for cell in html_tree.xpath(path):
             dates.append(convert_date_string(cell.text_content()))
     return dates
Example #20
 def _return_dates(self, html_tree):
     path = "//*[starts-with(., 'Opinions')]/text()"
     text = html_tree.xpath(path)[0]
     date_string = re.search('.* Week of (.*)', text).group(1).strip()
     case_date = convert_date_string(date_string)
     count = int(html_tree.xpath("count({base})".format(base=self.base_path)))
     return [case_date] * count
Example #21
 def _get_case_dates(self):
     dates = []
     for s in self.html.xpath(self.base_path):
         s = clean_string(s)
         date_string = self.grouping_regex.search(s).group(3)
         dates.append(convert_date_string(date_string))
     return dates
Example #22
 def extract_cases_from_html(self, html):
     paths = '//p/strong | //p/b | //p/font/strong | //p/font/b'
     for date_element in html.xpath(paths):
         string = date_element.xpath('./text()')
         try:
             string = string[0]
             # handle legacy example (ga_example.html)
             string = string.split('SUMMARIES')[0]
             date_string = re.sub(r'\W+', ' ', string)
             # handle legacy example (ga_example.html)
             if len(date_string.split()) != 3:
                 continue
             case_date = convert_date_string(date_string)
         except Exception:
             continue
         parent = date_element.xpath('./..')[0]
         # handle legacy example (ga_example.html)
         while parent.tag != 'p':
             parent = parent.xpath('./..')[0]
         for item in parent.getnext().xpath('./li'):
             text = item.text_content()
             if text:
                 split = text.split('.', 1)
                 self.cases.append({
                     'date': case_date,
                     'url': item.xpath('.//a[1]/@href')[0],  # relative: stay within this item
                     'docket': split[0].rstrip('.'),
                     'name': titlecase(split[1]),
                 })
Example #23
 def _extract_cases_from_html(self, html):
     """Build list of data dictionaries, one dictionary per case (table row)."""
     # Strip inconsistently placed <font> and <br>
     # tags that make stable coverage almost impossible
     etree.strip_tags(html, 'font', 'br')
     for ul in html.xpath('//table[@id="AutoNumber1"]/tr[2]/td/table/tr/td//ul'):
         preceding = ul.xpath('./preceding::*[1]')[0]
         preceding_text = ' '.join(preceding.text_content().split()).strip(':')
         if preceding_text and not preceding_text.lower().endswith('future date'):
             # Below will fail if they change up strings or date formats
             case_date = convert_date_string(preceding_text.split()[-1])
             for element in ul.xpath('./li | ./a'):
                 if element.tag == 'li':
                     text = normalize_dashes(' '.join(element.text_content().split()))
                     if not text:
                         continue
                     anchor = element.xpath('.//a')[0]
                 elif element.tag == 'a':
                     # Malformed html, see connappct_example.html
                     anchor = element
                     glued = '%s %s' % (anchor.text_content(), anchor.tail)
                     text = normalize_dashes(' '.join(glued.split()))
                 self.cases.append({
                     'date': case_date,
                     'url': anchor.xpath('./@href')[0],
                     'docket': text.split('-')[0].replace('Concurrence', '').replace('Dissent', ''),
                     'name': text.split('-', 1)[1],
                 })
Example #24
 def _get_case_dates(self):
     case_dates = []
     for element in self.html.xpath(self.base_path):
         text = element.text_content()
         date_string = text.split('-')[0].strip()
         case_dates.append(convert_date_string(date_string))
     return case_dates
Example #25
    def _extract_case_data_from_html(self, html):
        for item in html.xpath(self.base):
            creator = item.xpath('./creator')[0].text_content()
            pubdate = item.xpath('./pubdate')[0].text_content()
            pubdate_sanitized = self.sanitize_text(pubdate)
            title = item.xpath('./title')[0].text_content()
            title_sanitized = self.sanitize_text(title)
            title_clean = clean_string(title_sanitized)
            search = self.regex.search(title_clean)
            url = item.xpath('.//@href')[0]

            if search:
                name = search.group(1)
                docket = search.group(2)
            else:
                name = title_clean
                docket = self._extract_docket_from_url(url)

            self.cases.append({
                'name': name,
                'date': convert_date_string(pubdate_sanitized),
                'docket': docket,
                'judge': self.sanitize_text(creator),
                'url': url,
            })
Example #26
 def extract_cases_from_html(self, html):
     paths = "//p/strong | //p/b | //p/font/strong | //p/font/b"
     for date_element in html.xpath(paths):
         string = date_element.xpath("./text()")
         try:
             string = string[0]
             # handle examples where time but no date (ga_example_3.html)
             if ":" in string and ("AM" in string or "PM" in string):
                 continue
             # handle legacy example (ga_example.html)
             string = string.split("SUMMARIES")[0]
             date_string = re.sub(r"\W+", " ", string)
             # handle legacy example (ga_example.html)
             if len(date_string.split()) != 3:
                 continue
             case_date = convert_date_string(date_string)
         except Exception:
             continue
         parent = date_element.xpath("./..")[0]
         # handle legacy example (ga_example.html)
         while parent.tag != "p":
             parent = parent.xpath("./..")[0]
         for item in parent.getnext().xpath("./li"):
             text = item.text_content()
             if text:
                 split = text.split(".", 1)
                 self.cases.append({
                     "date": case_date,
                     "url": item.xpath("//a[1]/@href")[0],
                     "docket": split[0].rstrip("."),
                     "name": titlecase(split[1]),
                 })
Example #27
 def _extract_cases_from_html(self, html):
     """Build list of data dictionaries, one dictionary per case (table row)."""
     # Strip inconsistently placed <font> and <br>
     # tags that make stable coverage almost impossible
     etree.strip_tags(html, 'font', 'br')
     path = '//table[@id="AutoNumber1"]//ul'
     for ul in html.xpath(path):
         preceding = ul.xpath('./preceding::*[1]')[0]
         preceding_text = ' '.join(preceding.text_content().split()).strip(':')
         # Skip sections that are marked to be published at future date
         if preceding_text and not preceding_text.lower().endswith(' date'):
             # Below will fail if they change up string format
             date_string = preceding_text.split()[-1]
             case_date = convert_date_string(date_string)
             for element in ul.xpath('./li | ./a'):
                 if element.tag == 'li':
                     text = normalize_dashes(' '.join(element.text_content().split()))
                     if not text:
                         continue
                     anchor = element.xpath('.//a')[0]
                 elif element.tag == 'a':
                     # Malformed html, see connappct_example.html
                     anchor = element
                     glued = '%s %s' % (anchor.text_content(), anchor.tail)
                     text = normalize_dashes(' '.join(glued.split()))
                 self.cases.append({
                     'date': case_date,
                     'url': anchor.xpath('./@href')[0],
                     'docket': text.split('-')[0].replace('Concurrence', '').replace('Dissent', ''),
                     'name': text.split('-', 1)[1],
                 })
Example #28
 def _get_case_dates(self):
     nodes = self._get_anchor_nodes()
     dates = []
     for node in nodes:
         d_str = node.xpath('./preceding::h3')[0].text_content()
         dates.append(convert_date_string(d_str, fuzzy=True))
     return dates
Example #29
 def _return_dates(html_tree):
     path = "//*[starts-with(., 'Kansas')][contains(., 'Released')]/text()[2]"
     text = html_tree.xpath(path)[0]
     text = re.sub('Opinions Released', '', text)
     case_date = convert_date_string(text.strip())
     return [case_date] * int(
         html_tree.xpath("count(//a[contains(./@href, '.pdf')])"))
Example #30
 def extract_cases_from_html(self, html):
     paths = '//p/strong | //p/b | //p/font/strong | //p/font/b'
     for date_element in html.xpath(paths):
         string = date_element.xpath('./text()')
         try:
             string = string[0]
             # handle examples where time but no date (ga_example_3.html)
             if ':' in string and ('AM' in string or 'PM' in string):
                 continue
             # handle legacy example (ga_example.html)
             string = string.split('SUMMARIES')[0]
             date_string = re.sub(r'\W+', ' ', string)
             # handle legacy example (ga_example.html)
             if len(date_string.split()) != 3:
                 continue
             case_date = convert_date_string(date_string)
         except Exception:
             continue
         parent = date_element.xpath('./..')[0]
         # handle legacy example (ga_example.html)
         while parent.tag != 'p':
             parent = parent.xpath('./..')[0]
         for item in parent.getnext().xpath('./li'):
             text = item.text_content()
             if text:
                 split = text.split('.', 1)
                 self.cases.append({
                     'date': case_date,
                     'url': item.xpath('.//a[1]/@href')[0],  # relative: stay within this item
                     'docket': split[0].rstrip('.'),
                     'name': titlecase(split[1]),
                 })
Example #33
    def _extract_case_data_from_html(self, html):
        """Build list of data dictionaries, one dictionary per case."""
        regex = re.compile(
            r'^Citation Nr: (.*) Decision Date: (.*) Archive Date: (.*) DOCKET NO. ([-0-9 ]+)'
        )

        for result in html.xpath('//div[@id="results-area"]/div/a'):
            text = result.text_content().strip()
            try:
                (citation, date, docket) = regex.match(text).group(1, 2, 4)
            except Exception:
                raise Exception(
                    'regex failure in _extract_case_data_from_html method of bva scraper'
                )

            # There is a history to this, but the long story short is that we
            # are using the docket number in the name field intentionally.
            self.cases.append({
                'name': docket,
                'url': result.xpath('.//@href')[0],
                'date': convert_date_string(date),
                'status': 'Unpublished',
                'docket': docket,
                'citation': citation.split()[0],
            })
Example #34
    def extract_date_summary_from_link(self, link):
        # Link should be within a <p> tag directly under <div id='maincontent'>,
        # but occasionally the court forgets to wrap it in a <p>, in which case
        # it sits directly under the <div id='maincontent'>
        container_id = "maincontent"
        parent = link.getparent()
        parents_parent = parent.getparent()
        if "id" in parent.attrib and parent.attrib["id"] == container_id:
            search_root = link
        elif ("id" in parents_parent.attrib
              and parents_parent.attrib["id"] == container_id):
            search_root = parent
        else:
            raise InsanityException(
                'Unrecognized placement of Opinion url on page: "%s"' %
                link.text_content().strip())

        # Find date from bolded header element above link (ex: "5-14-2014 - Opinions" or "5-21-2014 - Orders")
        element_date = search_root.xpath("./preceding-sibling::b")[-1]
        element_date_text = element_date.text_content().strip().lower()
        try:
            date_string = element_date_text.split()[0]
        except IndexError:
            raise InsanityException('Unrecognized bold (date) element: "%s"' %
                                    element_date_text)

        # Find summary from blockquote element below link
        element_blockquote = search_root.xpath(
            "./following-sibling::blockquote")[0]
        summary = element_blockquote.text_content().strip()

        return convert_date_string(date_string), summary
Example #35
 def _extract_cases_from_html(self, html):
     year = self.extract_year_from_h1(html)
     path_dates = "//h2[contains(., '%s')]" % year
     for h2 in html.xpath(path_dates):
         text_date = h2.text_content().strip()
         if not text_date or 'No opinions released' in text_date:
             continue
         date_string = self.regex_date.search(text_date).group(1)
         date = convert_date_string(date_string)
         next_tag = h2.getnext().tag
         if not self.cases and next_tag == 'p':
             # Sometimes court puts most recent date's opinions in
             # its own box at the top of the page, in a format that
             # doesn't conform with the other date sections below
             path = './/a[%s]' % self.path_anchor_qualifier
             anchors = h2.getparent().xpath(path)
         else:
             path = './following::ul[1]//a[%s]' % self.path_anchor_qualifier
             anchors = h2.xpath(path)
         for anchor in anchors:
             text_anchor = anchor.text_content()
             match = self.regex.search(text_anchor)
             if not match:
                 continue
             self.cases.append({
                 'date': date,
                 'docket': match.group(1),
                 'name': match.group(2).strip(),
                 'url': anchor.xpath('./@href')[0],
             })
Example #38
 def _get_case_dates(self):
     dates = []
     path = "%s/td[3]" % self.row_base_path
     for html_tree in self.html:
         for cell in html_tree.xpath(path):
             dates.append(convert_date_string(cell.text_content()))
     return dates
Example #39
    def _extract_case_data_from_html(self, html):
        for item in html.xpath(self.base):
            creator = item.xpath("./creator")[0].text_content()
            pubdate = item.xpath("./pubdate")[0].text_content()
            pubdate_sanitized = self.sanitize_text(pubdate)
            title = item.xpath("./title")[0].text_content()
            title_sanitized = self.sanitize_text(title)
            title_clean = clean_string(title_sanitized)
            search = self.regex.search(title_clean)
            url = item.xpath(".//@href")[0]

            if search:
                name = search.group(1)
                docket = search.group(2)
            else:
                name = title_clean
                docket = self._extract_docket_from_url(url)

            self.cases.append(
                {
                    "name": name,
                    "date": convert_date_string(pubdate_sanitized),
                    "docket": docket,
                    "judge": self.sanitize_text(creator),
                    "url": url,
                }
            )
Example #41
 def _get_case_dates(self):
     path = '//table[contains(.//th[1], "Opinion")]//tr/td[3]'
     dates = []
     for cell in self.html.xpath(path):
         date_string = cell.text_content().replace('Aguust', 'August')
         dates.append(convert_date_string(date_string))
     return dates
Example #42
 def _return_dates(html_tree):
     path = "//h1|//h2"
     dates = []
     text = html_tree.xpath(path)[0].text_content().strip()
     case_date = convert_date_string(text)
     dates.extend([case_date] * int(html_tree.xpath("count(//th//a[contains(., '/')])")))
     return dates
Example #43
    def extract_date_summary_from_link(self, link):
        # Link should be within a <p> tag directly under <div id='maincontent'>,
        # but occasionally the court forgets to wrap it in a <p>, in which case
        # it sits directly under the <div id='maincontent'>
        container_id = 'maincontent'
        parent = link.getparent()
        parents_parent = parent.getparent()
        if 'id' in parent.attrib and parent.attrib['id'] == container_id:
            search_root = link
        elif 'id' in parents_parent.attrib and parents_parent.attrib['id'] == container_id:
            search_root = parent
        else:
            raise InsanityException('Unrecognized placement of Opinion url on page: "%s"' % link.text_content().strip())

        # Find date from bolded header element above link (ex: "5-14-2014 - Opinions" or "5-21-2014 - Orders")
        element_date = search_root.xpath('./preceding-sibling::b')[-1]
        element_date_text = element_date.text_content().strip().lower()
        if not element_date_text.endswith('opinions') and not element_date_text.endswith('orders'):
            raise InsanityException('Unrecognized bold (date) element: "%s"' % element_date_text)
        date_string = element_date_text.split()[0]

        # Find summary from blockquote element below link
        element_blockquote = search_root.xpath('./following-sibling::blockquote')[0]
        summary = element_blockquote.text_content().strip()

        return convert_date_string(date_string), summary
Example #44
    def _extract_case_data_from_html(self, html):
        """Build list of data dictionaries, one dictionary per case.

        Sometimes the XML is malformed, usually because of a missing docket number,
        which throws the traditional data list matching off.  Its easier and cleaner
        to extract all the data at once, and simply skip over records that do not
        present a docket number.
        """
        for document in html.xpath("//document"):
            docket = document.xpath('content[@name="docket"]/text()')
            if docket:
                docket = docket[0]
                title = document.xpath('content[@name="dc.title"]/text()')[0]
                name = self._parse_name_from_title(title)
                url = document.xpath("@url")[0]
                if any(s in url for s in self.court_filters):
                    # Only append cases that are in the right jurisdiction.
                    self.cases.append(
                        {
                            "name": name,
                            "url": self._file_path_to_url(
                                document.xpath("@url")[0]
                            ),
                            "date": convert_date_string(
                                document.xpath('content[@name="date"]/text()')[
                                    0
                                ]
                            ),
                            "status": self._parse_status_from_title(title),
                            "docket": docket,
                        }
                    )
Example #45
 def _return_dates(html_tree):
     path = "//h1|//h2"
     dates = []
     text = html_tree.xpath(path)[0].text_content().strip()
     case_date = convert_date_string(text)
     dates.extend([case_date] *
                  int(html_tree.xpath("count(//th//a[contains(., '/')])")))
     return dates
Example #46
    def test_extract_written_documents_report(self):
        """Do all the written reports work?"""

        for court in self.courts:
            if court['type'] == "U.S. Courts of Appeals":
                continue
            court_id = get_court_id_from_url(court['court_link'])

            if court_id not in self.valid_dates:
                continue

            results = []
            report = self.reports[court_id]
            some_date = convert_date_string(self.valid_dates[court_id])
            retry_count = 1
            max_retries = 5  # We'll try five times total
            while not results and retry_count <= max_retries:
                # This loop is sometimes needed to find a date with documents.
                # In general the valid dates json object should suffice,
                # however.
                if some_date > date.today():
                    raise ValueError("Runaway date query for %s: %s" %
                                     (court_id, some_date))
                try:
                    report.query(some_date, some_date, sort='case_number')
                except ConnectionError as e:
                    if retry_count <= max_retries:
                        print("%s. Trying again (%s of %s)" %
                              (e, retry_count, max_retries))
                        time.sleep(10)  # Give the server a moment of rest.
                        retry_count += 1
                        continue
                    else:
                        print("%s: Repeated errors at this court." % e)
                        raise e
                if not report.responses:
                    break  # Not a supported court.
                results = report.data  # Capture parsed rows; the loop exits once we have some
                some_date += timedelta(days=1)

            else:
                # While loop ended normally (without hitting break)
                for result in results:
                    for k, v in result.items():
                        if k in ['nature_of_suit', 'cause']:
                            continue
                        self.assertIsNotNone(
                            v,
                            msg="Value of key %s is None in court %s" %
                                (k, court_id)
                        )

                # Can we download one item from each court?
                r = report.download_pdf(results[0]['pacer_case_id'],
                                        results[0]['pacer_doc_id'])
                if r is None:
                    # Extremely messed up download.
                    continue
                self.assertEqual(r.headers['Content-Type'], 'application/pdf')
Example #47
 def test_query_can_get_multiple_results(self):
     """
     Can we run a query that gets multiple rows and parse them all?
     """
     court_id = 'paeb'
     report = self.reports[court_id]
     some_date = convert_date_string(self.valid_dates[court_id])
     report.query(some_date, some_date, sort='case_number')
     self.assertEqual(3, len(report.data), 'should get 3 responses for paeb')
Example #48
 def _parse_date_from_cell_text(self, cell_text):
     date = False
     for text in cell_text:
         try:
             date = convert_date_string(text.strip())
             break
         except ValueError:
             pass
     return date
Example #49
 def _get_case_dates(self):
     case_dates = []
     for element in self.html.xpath('//caption | //center'):
         date_string = element.text_content().strip().replace('Cases Decided ', '')
         path_prefix = './parent::' if element.tag == 'caption' else './following-sibling::'
         path = path_prefix + 'table[1]' + self.row_base_path
         cases = element.xpath(path)
         case_dates.extend([convert_date_string(date_string)] * len(cases))
     return case_dates
Example #50
 def _get_case_dates(self):
     """ This is an example of a date field. Note that the format string
         will likely need to be updated to match the date formats
         on the site you are scraping. The datetime formats can be found
         here: http://docs.python.org/2/library/datetime.html
     """
     path = '//path/to/text/text()'
     return [convert_date_string(date_string) for date_string in
             self.html.xpath(path)]
Example #51
 def is_this_skippable_date_anchor(self, text, date_obj):
     """Return true is link text is parsible date"""
     try:
         string_to_date = convert_date_string(text)
         if string_to_date == date_obj:
             return True
     except Exception:
         pass
     return False
Example #52
 def test_query_using_last_good_row(self):
     """
     Can we run a query that triggers no content in first cell?
     """
     court_id = 'ksb'
     report = self.reports[court_id]
     some_date = convert_date_string(self.valid_dates[court_id])
     report.query(some_date, some_date, sort='case_number')
     self.assertEqual(2, len(report.data), 'should get 2 responses for ksb')
Example #53
 def _get_case_dates(self):
     dates = []
     path = 'id("content")/div//strong'
     sub_path = './following-sibling::ul[1]//li|../following-sibling::ul[1]//li'
     for element in self.html.xpath(path):
         date = convert_date_string(element.xpath('text()')[0])
         for case in element.xpath(sub_path):
             dates.append(date)
     return dates
Example #54
 def _get_case_dates(self):
     """All we have are years, so estimate middle most day of year"""
     today = datetime.date.today()
     middle_of_year = convert_date_string('July 2, %d' % self.year)
     if self.year == today.year:
         # Not a backscraper, assume cases were filed on day scraped.
         return [today] * len(self.html.xpath(self.row_path))
     else:
         return [middle_of_year] * len(self.html.xpath(self.row_path))
Example #55
 def _get_case_dates(self):
     today = datetime.date.today()
     count = len(self._get_case_names())
     middle_of_year = convert_date_string('July 2, %d' % self.year)
     if self.year == today.year:
         # Not a backscraper, assume cases were filed on day scraped.
         return [today] * count
     else:
         # All we have is the year, so estimate the middle-most day
         return [middle_of_year] * count
Example #56
 def _get_case_dates(self):
     """All we have are years, so estimate middle most day of year"""
     self.set_dynamic_resource_paths()
     dates = []
     for section in self.html.xpath(self.section_path):
         year = section.xpath(self.year_sub_path)[0].text_content().strip()
         date = convert_date_string('July 2, %s' % year)
         count = len(section.xpath(self.opinion_sub_path))
         dates.extend([date] * count)
     return dates
Example #57
 def _get_case_dates(self):
     dates = []
     for text_string in self.html.xpath('//text()'):
         if not text_string.lower().startswith('filed'):
             continue
         date_string = text_string.split(' ')[1].strip().strip(',')
         dates.append(convert_date_string(date_string))
     return dates
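Every example in this listing leans on convert_date_string, which is not itself shown. A minimal stand-in, assuming it simply wraps dateutil's parser and returns a datetime.date, with the fuzzy flag passed through for loose matching (as used in Example #28):

from dateutil import parser

def convert_date_string(date_string, fuzzy=False):
    # Minimal sketch: parse a human-readable date string into a datetime.date.
    return parser.parse(date_string, fuzzy=fuzzy).date()

For instance, convert_date_string('November 16, 2016') would return datetime.date(2016, 11, 16).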