Python clean_if_py3 Examples, juriscraper.lib.string_utils.clean_if_py3 Python Examples

Example #1

0

Show file

File: asbca.py Project: wethepeopleonline/juriscraper

 def _get_case_names(self):
     """Collect the text of the first anchor in every table cell.

     The text fragments of each anchor are joined, stripped, and then
     passed through ``clean_if_py3``.
     """
     names = []
     for anchor in self.html.xpath("//table/tr/td/a[1]"):
         joined = "".join(anchor.itertext())
         names.append(clean_if_py3(joined.strip()))
     return names

Example #2

0

Show file

File: iowa.py Project: voutilad/juriscraper

 def _return_dates(html_tree):
     """Return the header date repeated once per ``wfLabel`` element.

     The first text node matched by the ``wfHeader`` XPath is parsed with
     the ``'%B %d, %Y'`` format; the resulting date is repeated as many
     times as the ``count(...)`` XPath over ``wfLabel`` elements reports.
     """
     header_xpath = "//*[contains(concat(' ',@id,' '),' wfHeader') and not(contains(., 'Iowa'))]/text()"
     label_count_xpath = "count(//*[contains(concat(' ',@id,' '),' wfLabel')])"

     header_text = clean_if_py3(html_tree.xpath(header_xpath)[0])
     parsed = time.strptime(header_text.strip(), '%B %d, %Y')
     case_date = date.fromtimestamp(time.mktime(parsed))

     # XPath count() yields a float, hence the int() conversion.
     repeat = int(html_tree.xpath(label_count_xpath))
     return [case_date] * repeat

Example #3

0

Show file

File: uscfc.py Project: janderse/juriscraper

    def _get_judges(self):
        """Extract the signing judge's name from each feed item body.

        For every ``div.feed-item-body`` the text before ``'Keywords:'`` is
        split on the first matching "Signed by ..." phrase, and the text
        following that phrase is cut just before the first period that
        follows two lowercase letters.

        Returns:
            A list with one string per matched ``div``; the entry is ``''``
            when no splitter phrase or no name-ending period is found.
        """
        path = '//div[@class="feed-item-body"]'
        judges = []
        splitters = [
            'Signed by Chief Judge',
            'Signed by Judge',
            'Signed by Chief Special Master',  # Vaccine courts have odd names for judges
            'Signed by Special Master',
        ]
        for e in self.html.xpath(path):
            t = html.tostring(e, method='text', encoding='unicode')
            t = clean_if_py3(t).split('Keywords:')[0]

            # Default to '' so `judge` is always bound, then take the text
            # after the first splitter phrase that occurs in `t`.
            judge = ''
            for splitter in splitters:
                judge_parts = t.rsplit(splitter)
                if len(judge_parts) > 1:
                    judge = judge_parts[1]
                    break

            # Often the text looks like: 'Judge Susan G. Braden. (jt1) Copy to parties.' In that case we only
            # want the name, not the rest.
            length_of_match = 2
            m = re.search(r'[a-z]{%s}\.' % length_of_match, judge)  # Two lower case letters followed by a period
            if m:
                judge = judge[:m.start() + length_of_match]
            else:
                judge = ''
            # str.strip() returns a new string, so the result has to be
            # assigned back; calling it bare silently did nothing.
            judge = judge.strip('.')
            judges.append(judge)
        return judges

Example #4

0

Show file

File: ohio.py Project: theophile/juriscraper

 def _get_case_dates(self):
     """Parse every text node four cells after the base path as ``%m/%d/%Y``."""
     path = "{base}/following::td[4]//text()".format(base=self.base_path)
     return [
         datetime.strptime(clean_if_py3(node).strip(), "%m/%d/%Y").date()
         for node in self.html.xpath(path)
     ]

Example #5

0

Show file

    def _get_judges(self):
        """Extract the signing judge's name from each feed item body.

        For every ``div.feed-item-body`` the text before ``'Keywords:'`` is
        split on the first matching "Signed by ..." phrase, and the text
        following that phrase is cut just before the first period that
        follows two lowercase letters.

        Returns:
            A list with one string per matched ``div``; the entry is ``''``
            when no splitter phrase or no name-ending period is found.
        """
        path = '//div[@class="feed-item-body"]'
        judges = []
        splitters = [
            'Signed by Chief Judge',
            'Signed by Judge',
            'Signed by Chief Special Master',  # Vaccine courts have odd names for judges
            'Signed by Special Master',
        ]
        for e in self.html.xpath(path):
            t = html.tostring(e, method='text', encoding='unicode')
            t = clean_if_py3(t).split('Keywords:')[0]

            # Default to '' so `judge` is always bound, then take the text
            # after the first splitter phrase that occurs in `t`.
            judge = ''
            for splitter in splitters:
                judge_parts = t.rsplit(splitter)
                if len(judge_parts) > 1:
                    judge = judge_parts[1]
                    break

            # Often the text looks like: 'Judge Susan G. Braden. (jt1) Copy to parties.' In that case we only
            # want the name, not the rest.
            length_of_match = 2
            m = re.search(r'[a-z]{%s}\.' % length_of_match,
                          judge)  # Two lower case letters followed by a period
            if m:
                judge = judge[:m.start() + length_of_match]
            else:
                judge = ''
            # str.strip() returns a new string, so the result has to be
            # assigned back; calling it bare silently did nothing.
            judge = judge.strip('.')
            judges.append(judge)
        return judges

Example #6

0

Show file

 def _get_date_object_from_string(self, date_string):
     """Clean up known quirks in ``date_string`` and convert it to a date."""
     cleaned = clean_if_py3(date_string).strip()
     # Move a space that precedes a comma to after it.
     cleaned = cleaned.replace(" ,", ", ")
     # Rewrite the year "2104" as "2014".
     cleaned = cleaned.replace("2104", "2014")
     return convert_date_string(cleaned)

Example #7

0

Show file

 def _get_case_dates(self):
     """Parse the date that follows ``'about '`` in each item description."""
     # Gather every description first, exactly one text node per item.
     descriptions = [item.xpath('./description/text()')[0] for item in self.items]

     parsed_dates = []
     for description in descriptions:
         after_about = clean_if_py3(description).split('about ', 1)[1]
         # Trailing whitespace/newlines would make strptime fail, so strip.
         parsed_dates.append(
             datetime.strptime(after_about.strip(), '%m/%d/%Y').date())
     return parsed_dates

Example #8

0

Show file

File: mich.py Project: voutilad/juriscraper

 def _get_case_dates(self):
     """Parse the third space-separated token of each release-date item."""
     release_dates = []
     for raw in self.html.xpath('//li[@class="releaseDate"]/text()'):
         # "Release Date: 2/22/2013" -> "2/22/2013"
         token = clean_if_py3(raw).strip().split(' ')[2]
         parsed = time.strptime(token.strip(), '%m/%d/%Y')
         release_dates.append(date.fromtimestamp(time.mktime(parsed)))
     return release_dates

Example #9

0

Show file

File: fladistctapp_2.py Project: wethepeopleonline/juriscraper

 def _return_case_names(html_tree):
     """Return the non-empty, stripped text of header anchors containing '/'."""
     path = "//th//a[contains(., '/')]/text()"
     stripped = (clean_if_py3(node).strip() for node in html_tree.xpath(path))
     return [candidate for candidate in stripped if candidate]

Example #10

0

Show file

File: ca4.py Project: cgruppioni/juriscraper

 def _get_case_dates(self):
     """Parse each non-blank third-column cell as a ``%Y/%m/%d`` date."""
     # Lazy generator keeps cleaning and parsing interleaved per cell.
     cells = (
         clean_if_py3(cell).strip()
         for cell in self.html.xpath('//tr/td[3]/text()')
     )
     return [
         datetime.strptime(cell, '%Y/%m/%d').date()
         for cell in cells
         if cell
     ]

Example #11

0

Show file

 def _get_docket_numbers(self):
     """Return 'ASBCA No. <n>' strings, or None without an 'ASBCA Number' column."""
     if "ASBCA Number" not in self.columns:
         return None

     # Only rows that contain a link are considered.
     path = "//table/tr[td/a]/td[%d]/text()" % self.columns["ASBCA Number"]
     dockets = []
     for cell_text in self.html.xpath(path):
         dockets.append("ASBCA No. " + clean_if_py3(cell_text).strip())
     return dockets

Example #12

0

Show file

File: ca4.py Project: cgruppioni/juriscraper

 def _get_docket_numbers(self):
     """Return every non-blank text node found under the second column."""
     stripped = (
         clean_if_py3(node).strip()
         for node in self.html.xpath('//tr/td[2]//text()')
     )
     return [number for number in stripped if number]

Example #13

0

Show file

File: fladistctapp_2.py Project: voutilad/juriscraper

 def _return_case_names(html_tree):
     """Return the non-empty, stripped text of header anchors containing '/'."""
     anchors = html_tree.xpath("//th//a[contains(., '/')]/text()")
     # filter(None, ...) drops the entries that are empty after stripping.
     return list(filter(None, (clean_if_py3(a).strip() for a in anchors)))

Example #14

0

Show file

File: nev_p.py Project: janderse/juriscraper

    def _get_case_dates(self):
        """Parse each non-blank node under ``self.date_path`` as ``'%b %d, %Y'``."""
        texts = (
            clean_if_py3(str(node)).strip()
            for node in self.html.xpath(self.date_path)
        )
        return [
            datetime.strptime(text, '%b %d, %Y').date()
            for text in texts
            if text
        ]

Example #15

0

Show file

    def _sanitize_docket_name_text(self, text):
        """Swap en dashes for hyphens in the first word of ``text``.

        The replacement is applied via ``str.replace`` on the whole cleaned
        text, so every occurrence of the original first word is rewritten.
        """
        cleaned = clean_if_py3(text).strip()
        leading_word = cleaned.split()[0]

        # An en dash here is a typo for a hyphen and would break regex parsing.
        hyphenated = leading_word.replace(b'\xe2\x80\x93'.decode('utf-8'), '-')
        return cleaned.replace(leading_word, hyphenated)

Example #16

0

Show file

 def _get_case_dates(self):
     """Parse the ``%m-%d-%Y`` date after the colon in each bold description."""
     parsed = []
     for raw in self.html.xpath('//item/description/b/text()'):
         # raw looks like: [Argued:91-1-2015]
         # Drop the square brackets and any whitespace.
         unbracketed = re.sub(r'[\[\]\s]', '', raw)
         # Everything after the first colon is the date.
         value = clean_if_py3(unbracketed).split(':', 1)[1].strip()
         parsed.append(datetime.strptime(value, '%m-%d-%Y').date())
     return parsed

Example #17

0

Show file

    def _get_docket_numbers(self):
        """Return the non-blank text nodes in the first cell of each opinion row."""
        path = '//tr[contains(concat(" ", @class, " "), " sc-opinion ")]/td[1]//text()[normalize-space() != ""]'
        stripped = (
            clean_if_py3(str(node)).strip()
            for node in self.html.xpath(path)
        )
        return [number for number in stripped if number]

Example #18

0

Show file

 def _extract_case_names_from_sub_page(self, html_tree):
     """Return group 1 of the anchor regex for each non-blank anchor text node.

     NOTE(review): the wrapper group is non-capturing, so ``group(1)`` must
     come from a group inside ``self.base_anchor_regex`` -- confirm.
     """
     pattern = r'(?:%s)' % self.base_anchor_regex
     text_path = "{base}//text()".format(base=self.base_anchor_path)
     names = []
     for node in html_tree.xpath(text_path):
         stripped = clean_if_py3(node).strip()
         if not stripped:
             continue
         names.append(re.search(pattern, stripped).group(1))
     return names

Example #19

0

Show file

 def _extract_docket_numbers_from_sub_page(self, html_tree):
     """Return the anchor-regex match for each non-blank anchor text node."""
     pattern = '(%s)' % self.base_anchor_regex
     text_path = "{base}//text()".format(base=self.base_anchor_path)
     # Lazy generator keeps cleaning and matching interleaved per node.
     stripped_nodes = (
         clean_if_py3(node).strip() for node in html_tree.xpath(text_path)
     )
     return [
         re.search(pattern, stripped).group(1)
         for stripped in stripped_nodes
         if stripped
     ]

Example #20

0

Show file

    def _get_case_dates(self):
        """Parse each non-blank node under ``self.date_path`` as ``'%b %d, %Y'``."""
        parsed_dates = []
        for node in self.html.xpath(self.date_path):
            candidate = clean_if_py3(str(node)).strip()
            if not candidate:
                # Skip nodes that are blank after stripping.
                continue
            parsed_dates.append(
                datetime.strptime(candidate, '%b %d, %Y').date())
        return parsed_dates

Example #21

0

Show file

File: mich.py Project: voutilad/juriscraper

 def _get_dispositions(self):
     """Return the disposition text for each odd/even ``<ul>`` row.

     For every row, the first ``li.caseNature`` text node inside that row
     is stripped and everything after its second space is kept.

     Returns:
         One string per row; ``''`` when the row has no such node or the
         text has fewer than three space-separated parts.
     """
     disps = []
     for el in self.html.xpath("//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
         try:
             # The leading "." scopes the search to this <ul>. A bare "//"
             # is an absolute path, which is evaluated from the document
             # root and would return the same first match for every row.
             s = el.xpath('.//li[@class="caseNature"]/text()')
             disps.append(clean_if_py3(s[0]).strip().split(' ', 2)[2])
         except IndexError:
             disps.append('')
     return disps

Example #22

0

Show file

File: mich.py Project: theophile/juriscraper

 def _get_case_dates(self):
     """Parse the third space-separated token of each release-date item."""
     # Release Date: 2/22/2013 --> 2/22/2013
     tokens = (
         clean_if_py3(raw).strip().split(" ")[2]
         for raw in self.html.xpath('//li[@class="releaseDate"]/text()')
     )
     return [
         date.fromtimestamp(
             time.mktime(time.strptime(token.strip(), "%m/%d/%Y")))
         for token in tokens
     ]

Example #23

0

Show file

File: ca4.py Project: cgruppioni/juriscraper

 def _get_case_names(self):
     """Return fourth-column text nodes that are not blank, and log the count.

     The kept values are not stripped; stripping is only used for the
     blank check.
     """
     cleaned = (
         clean_if_py3(cell) for cell in self.html.xpath('//tr/td[4]/text()')
     )
     names = [value for value in cleaned if value.strip()]
     logger.info(str(len(names)))
     return names

Example #24

0

Show file

    def _get_summaries(self):
        """Return each feed item body's text up to the first ``'Keywords:'``."""
        return [
            clean_if_py3(
                html.tostring(node, method='text', encoding='unicode')
            ).split('Keywords:')[0]
            for node in self.html.xpath('//div[@class="feed-item-body"]')
        ]

Example #25

0

Show file

File: idaho_civil.py Project: johnhawkinson/juriscraper

 def _get_case_names(self):
     """Return the non-blank text content of each third-column cell."""
     path = '%s/td[3]' % self.base_path
     # Lazy generator keeps serialising and cleaning interleaved per cell.
     cell_texts = (
         clean_if_py3(
             html.tostring(cell, method='text', encoding='unicode')
         ).strip()
         for cell in self.html.xpath(path)
     )
     return [name for name in cell_texts if name]

Example #26

0

Show file

File: uscfc.py Project: janderse/juriscraper

    def _get_summaries(self):
        """Return each feed item body's text up to the first ``'Keywords:'``."""
        bodies = self.html.xpath('//div[@class="feed-item-body"]')
        collected = []
        for body in bodies:
            # method='text' serialises only the text content of the element.
            body_text = clean_if_py3(
                html.tostring(body, method='text', encoding='unicode'))
            before_keywords = body_text.split('Keywords:')[0]
            collected.append(before_keywords)
        return collected

Example #27

0

Show file

File: asbca.py Project: wethepeopleonline/juriscraper

 def parse_column_names(self):
     """Map each first-row cell's text to its 1-based column position.

     The mapping is stored on ``self.columns`` and also returned.
     """
     self.columns = dict()
     header_cells = self.html.xpath("//table/tr[1]/td")
     for position, cell in enumerate(header_cells, 1):
         header = clean_if_py3(''.join(cell.itertext())).strip()
         self.columns[header] = position
     return self.columns

Example #28

0

Show file

File: mich.py Project: theophile/juriscraper

 def _get_dispositions(self):
     """Return the disposition text for each odd/even ``<ul>`` row.

     For every row, the first ``li.caseNature`` text node inside that row
     is stripped and everything after its second space is kept.

     Returns:
         One string per row; ``""`` when the row has no such node or the
         text has fewer than three space-separated parts.
     """
     disps = []
     for el in self.html.xpath(
             "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
         try:
             # The leading "." scopes the search to this <ul>. A bare "//"
             # is an absolute path, which is evaluated from the document
             # root and would return the same first match for every row.
             s = el.xpath('.//li[@class="caseNature"]/text()')
             disps.append(clean_if_py3(s[0]).strip().split(" ", 2)[2])
         except IndexError:
             disps.append("")
     return disps

Example #29

0

Show file

File: mich.py Project: voutilad/juriscraper

 def _get_lower_courts(self):
     """Return the title-cased lower court name for each odd/even ``<ul>`` row.

     For every row, the first ``li.lowerCourt`` text node under
     ``li.casedetailsleft`` inside that row is stripped, everything after
     its second space is kept, and the result is passed to ``titlecase``.

     Returns:
         One string per row; ``''`` when the row has no such node or the
         text has fewer than three space-separated parts.
     """
     lower_courts = []
     for el in self.html.xpath("//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
         try:
             # The leading "." scopes the search to this <ul>. A bare "//"
             # is an absolute path, which is evaluated from the document
             # root and would return the same first match for every row.
             s = el.xpath('.//li[@class="casedetailsleft"]'
                          '//li[@class="lowerCourt"]/text()')
             lower_courts.append(titlecase(clean_if_py3(s[0]).strip().split(' ', 2)[2]))
         except IndexError:
             lower_courts.append('')
     return lower_courts

Example #30

0

Show file

File: fladistctapp_2.py Project: wethepeopleonline/juriscraper

 def _return_docket_numbers(html_tree):
     """Return whitespace-free docket numbers shaped like ``<word>-<digits>``.

     Only the part of each anchor text before the first '/' is considered.
     """
     path = "//th//a[contains(., '-')]/*/text() | //th//a[contains(text(),'-')]/text()"
     docket_pattern = r'^\w+-\d+$'
     found = []
     for node in html_tree.xpath(path):
         before_slash = clean_if_py3(node).split('/')[0].strip()
         # Remove all internal whitespace before validating the shape.
         compact = ''.join(before_slash.split())
         if re.match(docket_pattern, compact):
             found.append(compact)
     return found

Example #31

0

Show file

File: mich.py Project: voutilad/juriscraper

 def _get_lower_court_numbers(self):
     """Return the lower court number for each odd/even ``<ul>`` row.

     For every row, the first ``li.lowerCourt`` text node under
     ``li.casedetailsright`` inside that row is stripped and the part
     after the first ``'No. '`` is kept.

     Returns:
         One string per row; ``''`` when the row has no such node or the
         text does not contain ``'No. '``.
     """
     nums = []
     for el in self.html.xpath("//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
         try:
             # The leading "." scopes the search to this <ul>. A bare "//"
             # is an absolute path, which is evaluated from the document
             # root and would return the same first match for every row.
             s = el.xpath('.//li[@class = "casedetailsright"]'
                          '//li[@class = "lowerCourt"]/text()')
             nums.append(clean_if_py3(s[0]).strip().split('No. ')[1])
         except IndexError:
             nums.append('')
     return nums

Example #32

0

Show file

File: ind.py Project: freelawproject/juriscraper

 def _get_case_dates(self):
     """Parse each ``%m/%d/%y`` text node, keeping ``''`` for blank ones."""
     results = []
     for node in self.html.xpath('//dl/dd/dd/dd/text()'):
         cleaned = clean_if_py3(node).strip()
         if cleaned != '':
             parsed = time.strptime(cleaned, '%m/%d/%y')
             results.append(date.fromtimestamp(time.mktime(parsed)))
         else:
             # Blank entries are kept as placeholders.
             results.append('')
     return results

Example #33

0

Show file

File: fladistctapp_2.py Project: voutilad/juriscraper

 def _return_docket_numbers(html_tree):
     """Return whitespace-free docket numbers shaped like ``<word>-<digits>``.

     Only the part of each anchor text before the first '/' is considered.
     """
     path = "//th//a[contains(., '-')]/*/text() | //th//a[contains(text(),'-')]/text()"
     # Sanitize each text node and squeeze out all whitespace, lazily.
     compacted = (
         ''.join(clean_if_py3(node).split('/')[0].strip().split())
         for node in list(html_tree.xpath(path))
     )
     return [
         docket for docket in compacted if re.match(r'^\w+-\d+$', docket)
     ]

Example #34

0

Show file

 def _get_case_dates(self):
     """Parse each ``%m/%d/%y`` text node, keeping ``''`` for blank ones."""
     cleaned_nodes = (
         clean_if_py3(node).strip()
         for node in self.html.xpath('//dl/dd/dd/dd/text()')
     )
     # Blank entries stay in the list as '' placeholders.
     return [
         '' if cleaned == '' else date.fromtimestamp(
             time.mktime(time.strptime(cleaned, '%m/%d/%y')))
         for cleaned in cleaned_nodes
     ]

Example #35

0

Show file

File: idaho_civil.py Project: theophile/juriscraper

 def _get_case_names(self):
     """Return the non-blank text content of each third-column cell."""
     names = []
     for cell in self.html.xpath("%s/td[3]" % self.path_base):
         raw = html.tostring(cell, method="text", encoding="unicode")
         stripped = clean_if_py3(raw).strip()
         if not stripped:
             continue
         names.append(stripped)
     return names

Example #36

0

Show file

File: mich.py Project: theophile/juriscraper

 def _get_lower_court_numbers(self):
     """Return the lower court number for each odd/even ``<ul>`` row.

     For every row, the first ``li.lowerCourt`` text node under
     ``li.casedetailsright`` inside that row is stripped and the part
     after the first ``"No. "`` is kept.

     Returns:
         One string per row; ``""`` when the row has no such node or the
         text does not contain ``"No. "``.
     """
     nums = []
     for el in self.html.xpath(
             "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
         try:
             # The leading "." scopes the search to this <ul>. A bare "//"
             # is an absolute path, which is evaluated from the document
             # root and would return the same first match for every row.
             s = el.xpath('.//li[@class = "casedetailsright"]'
                          '//li[@class = "lowerCourt"]/text()')
             nums.append(clean_if_py3(s[0]).strip().split("No. ")[1])
         except IndexError:
             nums.append("")
     return nums

Example #37

0

Show file

 def _get_case_dates(self):
     """Parse each ``%m/%d/%y`` text node, keeping ``""`` for blank ones."""

     def _to_date(value):
         # Blank entries stay in the list as "" placeholders.
         if value == "":
             return ""
         return date.fromtimestamp(
             time.mktime(time.strptime(value, "%m/%d/%y")))

     return [
         _to_date(clean_if_py3(node).strip())
         for node in self.html.xpath("//dl/dd/dd/dd/text()")
     ]

Example #38

0

Show file

File: mich.py Project: theophile/juriscraper

 def _get_lower_courts(self):
     """Return the title-cased lower court name for each odd/even ``<ul>`` row.

     For every row, the first ``li.lowerCourt`` text node under
     ``li.casedetailsleft`` inside that row is stripped, everything after
     its second space is kept, and the result is passed to ``titlecase``.

     Returns:
         One string per row; ``""`` when the row has no such node or the
         text has fewer than three space-separated parts.
     """
     lower_courts = []
     for el in self.html.xpath(
             "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
         try:
             # The leading "." scopes the search to this <ul>. A bare "//"
             # is an absolute path, which is evaluated from the document
             # root and would return the same first match for every row.
             s = el.xpath('.//li[@class="casedetailsleft"]'
                          '//li[@class="lowerCourt"]/text()')
             lower_courts.append(
                 titlecase(clean_if_py3(s[0]).strip().split(" ", 2)[2]))
         except IndexError:
             lower_courts.append("")
     return lower_courts

Example #39

0

Show file

 def _get_docket_numbers(self):
     """Return the non-blank text nodes of column ``2 + self.xpath_adjustment``."""
     cell_path = self.base_path.format(
         table_number=self.table_number,
         i=2 + self.xpath_adjustment,
     )
     path = '{base}//text()[normalize-space(.)]'.format(base=cell_path)
     stripped = (
         clean_if_py3(str(node)).strip() for node in self.html.xpath(path)
     )
     return [number for number in stripped if number]

Example #40

0

Show file

File: idaho_civil.py Project: johnhawkinson/juriscraper

 def _get_case_dates(self):
     """Convert the non-blank text of each first-column cell to a date."""
     converted = []
     for cell in self.html.xpath('%s/td[1]' % self.base_path):
         raw = html.tostring(cell, method='text', encoding='unicode')
         date_string = clean_if_py3(raw).strip()
         if not date_string:
             continue
         if six.PY2:
             # On Python 2, drop any non-ASCII characters.
             date_string = date_string.encode('ascii', 'ignore')
         # The source data uses the nonstandard abbreviation "Sept".
         normalized = date_string.replace('Sept ', 'Sep ')
         converted.append(convert_date_string(normalized))
     return converted

Example #41

0

Show file

 def _get_case_dates(self):
     """Convert the value after the colon in each bold description to a date."""
     converted = []
     for raw in self.html.xpath("//item/description/b/text()"):
         # raw looks like: [Argued:91-1-2015]
         # Drop the square brackets and any whitespace.
         unbracketed = re.sub(r"[\[\]\s]", "", raw)
         value = clean_if_py3(unbracketed).split(":", 1)[1].strip()
         # Sometimes there is a typo like: [Argued:91-1-2015mp3]
         # such as in ca1_example_2.xml
         value = value.replace("mp3", "")
         converted.append(convert_date_string(value))
     return converted

Example #42

0

Show file

File: uscfc.py Project: janderse/juriscraper

    def _get_docket_numbers(self):
        """Return the part of each feed item title before ``' • '``.

        Titles without the bullet separator are kept as they are; text
        nodes that are only whitespace are skipped.
        """
        collected = []
        for node in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
            title = clean_if_py3(node)
            if not title.strip():
                # Nothing but whitespace.
                continue
            if not isinstance(title, six.string_types):
                title = str(title, encoding='utf-8')

            if u' • ' in title:
                title = title.split(u' • ')[0].strip()
            collected.append(title)
        return collected

Example #43

0

Show file

File: nev_p.py Project: janderse/juriscraper

 def _get_docket_numbers(self):
     """Return the non-blank text nodes of column ``2 + self.xpath_adjustment``."""
     column_path = self.base_path.format(
         table_number=self.table_number,
         i=2 + self.xpath_adjustment,
     )
     path = '{base}//text()[normalize-space(.)]'.format(base=column_path)

     collected = []
     for node in self.html.xpath(path):
         stripped = clean_if_py3(str(node)).strip()
         if not stripped:
             continue
         collected.append(stripped)
     return collected

Example #44

0

Show file

File: uscfc.py Project: janderse/juriscraper

    def _get_case_names(self):
        """Return the title-cased part of each feed item title after ``' • '``.

        Titles without the bullet separator are title-cased whole; text
        nodes that are only whitespace are skipped.
        """
        collected = []
        for node in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
            # Collapse every run of whitespace into a single space.
            title = ' '.join(clean_if_py3(node).split())
            if not title.strip():
                # Nothing but whitespace.
                continue
            if not isinstance(title, six.string_types):
                title = str(title, encoding='utf-8')

            if u' • ' in title:
                title = title.split(u' • ')[1].strip()
            collected.append(titlecase(title.lower()))
        return collected

Example #45

0

Show file

File: ca6.py Project: freelawproject/juriscraper

    def get_nth_table_cell_data(self, n, href=False, link_text=False):
        """Return non-blank values from the ``n``-th cell of every table row.

        ``href`` selects the cell's link targets, ``link_text`` the link
        text (``href`` wins when both are set); otherwise the cell's own
        text nodes are used.
        """
        path = '//table/tr/td[%d]' % n
        if href:
            suffix = '/a/@href'
        else:
            suffix = '/a/text()' if link_text else '/text()'
        path = path + suffix

        stripped = (
            clean_if_py3(value).strip() for value in self.html.xpath(path)
        )
        return [value for value in stripped if value]

Example #46

0

Show file

File: nyappterm_1st.py Project: janderse/juriscraper

 def _get_download_urls(self):
     """Return one download URL per row that yields a non-empty URL.

     Within a row's fifth cell, the first href matching
     ``self.href_standard`` wins outright; until then, the first
     ``self.href_js`` match of each href is appended to the URL so far.
     """

     def _url_for_row(row):
         url = ''
         for href in row.xpath('./td[5]//@href'):
             href = clean_if_py3(href)
             # Check for newer standard href
             standard = self.href_standard.match(href)
             if standard:
                 return standard.group(0)
             # Check for presence of legacy JavaScript href
             legacy = self.href_js.findall(href)
             if legacy:
                 url = url + legacy[0]
         return url

     download_urls = []
     for element in self.html.xpath(self.base_path):
         row_url = _url_for_row(element)
         if row_url:
             download_urls.append(row_url)
     return download_urls

Example #47

0

Show file

File: nev_p.py Project: janderse/juriscraper

    def _get_neutral_citations(self):
        """Build ``'<year> NV <num>'`` citations by pairing numbers with dates.

        Numbers come from column ``1 + self.xpath_adjustment``; years come
        from the non-blank ``self.date_path`` nodes parsed as ``'%b %d, %Y'``.
        The two sequences are zipped, so the shorter one limits the result.
        """
        column_path = self.base_path.format(
            table_number=self.table_number,
            i=1 + self.xpath_adjustment,
        )
        neutral_path = '{base}//text()'.format(base=column_path)

        cleaned_dates = (
            clean_if_py3(str(node)).strip()
            for node in self.html.xpath(self.date_path)
        )
        date_strings = [value for value in cleaned_dates if value]

        pairs = zip(self.html.xpath(neutral_path), date_strings)
        return [
            '{year} NV {num}'.format(
                year=datetime.strptime(value.strip(), '%b %d, %Y').year,
                num=number,
            )
            for number, value in pairs
        ]

Example #48

0

Show file

File: iowa.py Project: voutilad/juriscraper

 def _return_docket_numbers(html_tree):
     """Return each docket link's text with the 'No'/'Nos' prefix removed."""
     path = "//*[contains(concat(' ',@id,' '),' wfLabel')]/preceding::tr[2]/td[1]/a/text()"
     numbers = []
     for label in html_tree.xpath(path):
         # NOTE(review): the "." in this pattern is unescaped, so it matches
         # any character after "No"/"Nos", not only a literal period.
         without_prefix = re.sub(r'Nos?.', '', label)
         numbers.append(clean_if_py3(without_prefix.strip()))
     return numbers

Example #49

0

Show file

File: ohio.py Project: freelawproject/juriscraper

 def _get_case_dates(self):
     """Parse every text node four cells after the base path as ``%m/%d/%Y``."""
     path = "{base}/following::td[4]//text()".format(base=self.base_path)
     return [
         datetime.strptime(clean_if_py3(node).strip(), '%m/%d/%Y').date()
         for node in self.html.xpath(path)
     ]

Example #50

0

Show file

File: ca8.py Project: freelawproject/juriscraper

 def _get_case_names(self):
     """Return the part of each item title after the first ``': '``."""
     # Gather every title first, exactly one text node per item.
     titles = [item.xpath('./title/text()')[0] for item in self.items]
     return [clean_if_py3(title).split(': ', 1)[1] for title in titles]

Example #51

0

Show file

File: ca8.py Project: freelawproject/juriscraper

 def _get_docket_numbers(self):
     """Return the part of each item title before the first ``': '``."""
     # Gather every title first, exactly one text node per item.
     titles = [item.xpath('./title/text()')[0] for item in self.items]
     return [clean_if_py3(title).split(': ', 1)[0] for title in titles]