コード例 #1
0
ファイル: conn.py プロジェクト: freelawproject/juriscraper
 def _extract_cases_from_html(self, html):
     """Build list of data dictionaries, one dictionary per case (table row).

     Every ``<ul>`` below the ``AutoNumber1`` table is visited.  The date
     for a list is parsed from the last whitespace-separated word in the
     text of the element that immediately precedes it.  One dict with the
     keys ``date``, ``url``, ``docket`` and ``name`` is appended to
     ``self.cases`` for each ``<li>`` (or stray ``<a>``) child of the list.

     :param html: parsed element tree of the page (presumably an
         ``lxml.html`` tree); it is modified in place because ``<font>``
         and ``<br>`` tags are stripped from it.
     :raises IndexError: when the markup does not have the expected shape,
         e.g. a non-empty ``<li>`` without an anchor, an anchor without
         ``href``, or row text that contains no dash.
     """
     # Strip inconsistently placed <font> and <br>
     # tags that make stable coverage almost impossible
     etree.strip_tags(html, 'font', 'br')
     path = '//table[@id="AutoNumber1"]//ul'
     for ul in html.xpath(path):
         preceding = ul.xpath('./preceding::*[1]')[0]
         preceding_text = ' '.join(preceding.text_content().split()).strip(':')
         # Skip sections without a heading, and sections that are marked
         # to be published at future date
         if not preceding_text or preceding_text.lower().endswith(' date'):
             continue
         # Below will fail if they change up string format
         date_string = preceding_text.split()[-1]
         case_date = convert_date_string(date_string)
         for element in ul.xpath('./li | ./a'):
             if element.tag == 'li':
                 text = normalize_dashes(' '.join(element.text_content().split()))
                 if not text:
                     continue
                 anchor = element.xpath('.//a')[0]
             elif element.tag == 'a':
                 # Malformed html, see connappct_example.html
                 anchor = element
                 # ``tail`` is None when nothing follows the anchor; fall
                 # back to '' so the literal word "None" is never glued
                 # onto the case name.
                 glued = '%s %s' % (anchor.text_content(), anchor.tail or '')
                 text = normalize_dashes(' '.join(glued.split()))
             self.cases.append({
                 'date': case_date,
                 'url': anchor.xpath('./@href')[0],
                 # Everything before the first dash, minus opinion-type labels
                 'docket': text.split('-')[0].replace('Concurrence', '').replace('Dissent', ''),
                 # Everything after the first dash
                 'name': text.split('-', 1)[1],
             })
コード例 #2
0
ファイル: conn.py プロジェクト: wethepeopleonline/juriscraper
 def _extract_cases_from_html(self, html):
     """Build list of data dictionaries, one dictionary per case (table row).

     Visits each ``<ul>`` found in the nested table inside the second row
     of the ``AutoNumber1`` table.  The case date is parsed from the last
     whitespace-separated word of the text of the element directly
     preceding the list; each ``<li>`` (or stray ``<a>``) child then adds
     a dict with ``date``, ``url``, ``docket`` and ``name`` to
     ``self.cases``.

     :param html: parsed element tree of the page (presumably an
         ``lxml.html`` tree); ``<font>`` and ``<br>`` tags are stripped
         from it in place.
     :raises IndexError: when the markup does not have the expected shape,
         e.g. a non-empty ``<li>`` without an anchor, an anchor without
         ``href``, or row text that contains no dash.
     """
     # Strip inconsistently placed <font> and <br>
     # tags that make stable coverage almost impossible
     etree.strip_tags(html, 'font', 'br')
     for ul in html.xpath('//table[@id="AutoNumber1"]/tr[2]/td/table/tr/td//ul'):
         preceding = ul.xpath('./preceding::*[1]')[0]
         preceding_text = ' '.join(preceding.text_content().split()).strip(':')
         # Nothing to date the list with, or the heading ends in
         # "future date": skip the whole list.
         if not preceding_text or preceding_text.lower().endswith('future date'):
             continue
         # Below will fail if they change up strings or date formats
         case_date = convert_date_string(preceding_text.split()[-1])
         for element in ul.xpath('./li | ./a'):
             if element.tag == 'li':
                 text = normalize_dashes(' '.join(element.text_content().split()))
                 if not text:
                     continue
                 anchor = element.xpath('.//a')[0]
             elif element.tag == 'a':
                 # Malformed html, see connappct_example.html
                 anchor = element
                 # An anchor with no trailing text has ``tail`` set to
                 # None; substitute '' so "None" does not leak into the
                 # extracted case name.
                 glued = '%s %s' % (anchor.text_content(), anchor.tail or '')
                 text = normalize_dashes(' '.join(glued.split()))
             self.cases.append({
                 'date': case_date,
                 'url': anchor.xpath('./@href')[0],
                 # Text before the first dash, with opinion-type labels removed
                 'docket': text.split('-')[0].replace('Concurrence', '').replace('Dissent', ''),
                 # Text after the first dash
                 'name': text.split('-', 1)[1],
             })
コード例 #3
0
ファイル: alaska.py プロジェクト: varun-iyer/juriscraper
 def _get_date_filed_is_approximate(self):
     """Return one boolean per sub-opinion: does its date text hold a dash?

     For every element matched by ``self.date_string_path`` the text is
     dash-normalized and checked for ``-``; the resulting flag is repeated
     once for each node matched by ``self.sub_opinion_path`` beneath that
     element.  (A dash presumably indicates a date range rather than an
     exact date -- confirm against the court's page.)
     """
     flags = []
     for node in self.html.xpath(self.date_string_path):
         opinion_count = len(node.xpath(self.sub_opinion_path))
         has_dash = '-' in normalize_dashes(node.text_content())
         # Same flag for every sub-opinion listed under this date
         flags += [has_dash] * opinion_count
     return flags
コード例 #4
0
 def test_normalize_dashes(self):
     # Each dash-like character below must come back as a plain ASCII
     # hyphen, with the rest of the string (spacing included) untouched.
     expected = " this is    -a test-"
     # copied from http://www.w3schools.com/charsets/ref_utf_punctuation.asp
     samples = (
         u" this is    –a test–",  # en dash
         u" this is    —a test—",  # em dash
         u" this is    ‐a test‐",  # hyphen
         u" this is    ‑a test‑",  # non-breaking hyphen
         u" this is    ‒a test‒",  # figure dash
         u" this is    ―a test―",  # horizontal bar
     )
     for sample in samples:
         self.assertEqual(normalize_dashes(sample), expected)
コード例 #5
0
 def test_normalize_dashes(self):
     # All Unicode dash variants should normalize to the same ASCII form.
     normalized = ' this is    -a test-'
     # copied from http://www.w3schools.com/charsets/ref_utf_punctuation.asp
     dash_variants = (
         u' this is    –a test–',  # en dash
         u' this is    —a test—',  # em dash
         u' this is    ‐a test‐',  # hyphen
         u' this is    ‑a test‑',  # non-breaking hyphen
         u' this is    ‒a test‒',  # figure dash
         u' this is    ―a test―',  # horizontal bar
     )
     for raw in dash_variants:
         self.assertEqual(normalize_dashes(raw), normalized)
コード例 #6
0
ファイル: conn.py プロジェクト: wilsonqin/juriscraper
 def _extract_cases_from_html(self, html):
     """Build list of data dictionaries, one dictionary per case (table row).

     Iterates over all ``<ul>`` elements under the ``AutoNumber1`` table.
     The text of the element immediately before each list supplies the
     date (its last whitespace-separated word).  For each ``<li>`` or bare
     ``<a>`` child a dict with ``date``, ``url``, ``docket`` and ``name``
     is appended to ``self.cases``.

     :param html: parsed element tree of the page (presumably an
         ``lxml.html`` tree); mutated in place, since ``<font>`` and
         ``<br>`` tags are stripped from it.
     :raises IndexError: when the markup does not have the expected shape,
         e.g. a non-empty ``<li>`` without an anchor, an anchor without
         ``href``, or row text that contains no dash.
     """
     # Strip inconsistently placed <font> and <br>
     # tags that make stable coverage almost impossible
     etree.strip_tags(html, "font", "br")
     path = '//table[@id="AutoNumber1"]//ul'
     for ul in html.xpath(path):
         preceding = ul.xpath("./preceding::*[1]")[0]
         preceding_text = " ".join(
             preceding.text_content().split()).strip(":")
         # Skip lists with no heading text and sections that are marked
         # to be published at future date
         if not preceding_text or preceding_text.lower().endswith(" date"):
             continue
         # Below will fail if they change up string format
         date_string = preceding_text.split()[-1]
         case_date = convert_date_string(date_string)
         for element in ul.xpath("./li | ./a"):
             if element.tag == "li":
                 text = normalize_dashes(" ".join(
                     element.text_content().split()))
                 if not text:
                     continue
                 anchor = element.xpath(".//a")[0]
             elif element.tag == "a":
                 # Malformed html, see connappct_example.html
                 anchor = element
                 # ``tail`` is None if no text follows the anchor; use ""
                 # instead so the string "None" is not appended to the
                 # case text.
                 glued = "%s %s" % (anchor.text_content(),
                                    anchor.tail or "")
                 text = normalize_dashes(" ".join(glued.split()))
             self.cases.append({
                 "date":
                 case_date,
                 "url":
                 anchor.xpath("./@href")[0],
                 # Portion before the first dash, opinion-type labels removed
                 "docket":
                 text.split("-")[0].replace("Concurrence",
                                            "").replace("Dissent", ""),
                 # Portion after the first dash
                 "name":
                 text.split("-", 1)[1],
             })
コード例 #7
0
 def sanitize_text(self, text):
     """Clean up *text* so odd characters and typos do not break regex.

     Applies, in order: ``normalize_dashes``, ``clean_string`` and
     ``self.fix_court_year_id_typo``, returning the final result.
     """
     # Resolve the bound method first, mirroring the original call order
     fix_typo = self.fix_court_year_id_typo
     dashed = normalize_dashes(text)
     cleaned = clean_string(dashed)
     return fix_typo(cleaned)
コード例 #8
0
 def sanitize_text(self, text):
     """Return *text* made safe for the regex matching that follows.

     The value passes through three steps: dash normalization, generic
     string cleaning, and finally the court/year/id typo fix-up.
     """
     repair_typo = self.fix_court_year_id_typo
     # Innermost step first: dashes, then general cleanup
     without_odd_dashes = normalize_dashes(text)
     tidy = clean_string(without_odd_dashes)
     return repair_typo(tidy)