Example #1
0
        def fetcher(url):
            """Fetch and assemble the case name found at ``url``.

            Returns a placeholder string when running in LOCAL (test) mode.
            """
            if self.method == 'LOCAL':
                return "No case names fetched during tests."

            response = requests.get(
                url,
                allow_redirects=True,
                headers={'User-Agent': 'Juriscraper'},
            )
            response.raise_for_status()

            tree = html.fromstring(response.text)
            tree.make_links_absolute(self.url)
            plaintiff = tree.xpath(
                "//text()[contains(., 'Style:')]"
                "/ancestor::div[@class='span2']/following-sibling::div/text()"
            )[0]
            defendant = tree.xpath(
                "//text()[contains(., 'v.:')]"
                "/ancestor::div[@class='span2']/following-sibling::div/text()"
            )[0]

            # Only join with "v." when a defendant is actually present.
            if not defendant.strip():
                return titlecase(plaintiff)
            return titlecase('%s v. %s' % (plaintiff, defendant))
Example #2
0
def split_name_title(judge):
    """Split a value from PACER and return the title and name"""
    words = judge.replace(',', '').lower().split()

    # Junk predicate: parens, leading punctuation, dotted abbreviations
    # (other than jr./sr.), a leading 'j.', or blacklisted tokens.
    def _is_junk(index, word):
        return (
            '(' in word
            or ')' in word
            or word.startswith('-')
            or word.startswith('~')
            or (len(word) > 2 and '.' in word and word not in ['jr.', 'sr.'])
            or (index == 0 and word == 'j.')
            or word in blacklist
        )

    clean_words = [w for i, w in enumerate(words) if not _is_junk(i, w)]

    # Partition remaining tokens into title tokens and name tokens.
    title_words = [w for w in clean_words if w in titles]
    name_words = [w for w in clean_words if w not in titles]

    title = normalize_judge_titles(titlecase(' '.join(title_words)))
    name = titlecase(' '.join(name_words))

    return name, title
Example #3
0
        def fetcher(url):
            """Fetch and assemble the case name found at ``url``.

            Returns a placeholder string when running in LOCAL (test) mode.
            """
            if self.method == 'LOCAL':
                return "No case names fetched during tests."

            response = requests.get(
                url,
                allow_redirects=True,
                headers={'User-Agent': 'Juriscraper'},
            )
            response.raise_for_status()

            tree = html.fromstring(response.text)
            tree.make_links_absolute(self.url)
            plaintiff = tree.xpath(
                "//text()[contains(., 'Style:')]"
                "/ancestor::div[@class='span2']/following-sibling::div/text()"
            )[0]
            defendant = tree.xpath(
                "//text()[contains(., 'v.:')]"
                "/ancestor::div[@class='span2']/following-sibling::div/text()"
            )[0]

            # Only join with "v." when a defendant is actually present.
            if not defendant.strip():
                return titlecase(plaintiff)
            return titlecase('%s v. %s' % (plaintiff, defendant))
Example #4
0
 def _get_case_names(self):
     """Return cleaned case names from the search results table."""
     path = ('//table[@id = "searchResults"]'
             '/tr[position() >= 3]/td[4]/a/text()')
     # Trailing bracketed notes such as "[ERRATA]" are cut off before
     # titlecasing.
     return [
         titlecase(raw.split('[')[0])
         for raw in self.html.xpath(path)
     ]
def get_case_name(complete_html_tree, case_path):
    """Extract a case name from a scraped opinion's <title> text.

    Falls back to the manual-fixes table (and, when DEBUG allows, an
    interactive prompt) when the title text is empty.

    NOTE(review): Python 2 only (raw_input/unicode below).
    """
    path = '//head/title/text()'
    # Text looks like: 'In re 221A Holding Corp., Inc, 1 BR 506 - Dist.
    # Court, ED Pennsylvania 1979'
    # Drop the trailing "- Court ..." part, then the trailing ", citation".
    s = complete_html_tree.xpath(path)[0].rsplit('-', 1)[0].rsplit(',', 1)[0]
    # returns 'In re 221A Holding Corp., Inc.'
    case_name = harmonize(clean_string(titlecase(s)))
    if not s:
        # Empty title text: look for a previously recorded manual fix.
        try:
            case_name = fixes[case_path]['case_name']
        except KeyError:
            if 'input_case_names' in DEBUG:
                if 'firefox' in DEBUG:
                    # Open the source file so a human can read the name off
                    # the rendered page.
                    subprocess.Popen(
                        ['firefox', 'file://%s' % case_path],
                        shell=False).communicate()
                input_case_name = raw_input(
                    '  No case name found. What should be here? ')
                input_case_name = unicode(input_case_name)
                # Persist the manual answer so future runs skip the prompt.
                add_fix(case_path, {'case_name': input_case_name})
                case_name = input_case_name

    if 'case_name' in DEBUG:
        log_print("  Case name: %s" % case_name)
    return case_name
Example #6
0
def normalize_address_info(address_info):
    """Normalize various address components"""

    # Titlecase where appropriate (state codes are left untouched).
    for k, v in address_info.items():
        if k == 'state':
            continue
        address_info[k] = titlecase(v)

    # Abbreviate street designators (Street --> St., Avenue --> Ave., etc.).
    fixes = OrderedDict((
        ('Street', 'St.'),
        ('Avenue', 'Ave.'),
        ('Boulevard', 'Blvd.'),
    ))
    for address_part in ['address1', 'address2']:
        a = address_info.get(address_part)
        if not a:
            continue

        # Whole-word, case-insensitive replacement of each designator.
        for bad, good in fixes.items():
            a = re.sub(r'\b%s\b' % bad, good, a, flags=re.IGNORECASE)

        address_info[address_part] = a

    # Nuke any zip code that's longer than allowed in the DB (usually caused by
    # phone numbers)
    zip_code_field = AttorneyOrganization._meta.get_field('zip_code')
    if len(address_info.get('zip_code', '')) > zip_code_field.max_length:
        address_info['zip_code'] = ''
    return address_info
Example #7
0
        def fetcher(url):
            """Scrape plaintiff/defendant from ``url`` and join as a case name."""
            response = requests.get(url,
                                    allow_redirects=False,
                                    headers={'User-Agent': 'Juriscraper'})
            response.raise_for_status()

            tree = html.fromstring(response.text)
            tree.make_links_absolute(self.url)
            plaintiff = tree.xpath(
                "//text()[contains(., 'Style')]/ancestor::tr[1]/td[2]/text()"
            )[0]
            defendant = tree.xpath(
                "//text()[contains(., 'v.:')]/ancestor::tr[1]/td[2]/text()"
            )[0]

            # Blank defendant text means a single-party case name.
            if not defendant.strip():
                return titlecase(plaintiff)
            return titlecase('%s v. %s' % (plaintiff, defendant))
Example #8
0
 def extract_cases_from_html(self, html):
     """Parse bolded date headers and their sibling case lists from ``html``.

     Each date element is matched with the <li> items in the list that
     follows its parent <p>; one case dict is appended per item.
     """
     paths = '//p/strong | //p/b | //p/font/strong | //p/font/b'
     for date_element in html.xpath(paths):
         string = date_element.xpath('./text()')
         try:
             string = string[0]
             # handle legacy example (ga_example.html)
             string = string.split('SUMMARIES')[0]
             date_string = re.sub(r'\W+', ' ', string)
             # handle legacy example (ga_example.html)
             if len(date_string.split()) != 3:
                 continue
             case_date = convert_date_string(date_string)
         except Exception:
             # Not a parseable date header; skip. (Was a bare except, which
             # also swallowed KeyboardInterrupt/SystemExit.)
             continue
         parent = date_element.xpath('./..')[0]
         # handle legacy example (ga_example.html)
         while parent.tag != 'p':
             parent = parent.xpath('./..')[0]
         for item in parent.getnext().xpath('./li'):
             text = item.text_content()
             if text:
                 split = text.split('.', 1)
                 self.cases.append({
                     'date': case_date,
                     # BUG FIX: use a relative path; '//a[1]' evaluated from
                     # the document root and returned the same first link
                     # for every item.
                     'url': item.xpath('.//a[1]/@href')[0],
                     'docket': split[0].rstrip('.'),
                     'name': titlecase(split[1]),
                 })
Example #9
0
 def _process_html(self):
     """Collect case metadata from anchors whose text looks like a caption."""
     path = ("//a["
             "contains(., 'v.') or "
             "contains(., 'IN RE') or "
             "contains(., 'IN THE') or "
             "contains(., 'vs.') or "
             "contains(., 'VS.')"
             "]")
     for html in self.html:
         for anchor in html.xpath(path):
             text = anchor.text_content()
             # First whitespace-delimited token is the docket number; the
             # remainder is the case name.
             parts = text.split(None, 1)
             summary_lines = anchor.getparent().xpath("./text()")
             case = {
                 "date": self._get_date_above_anchor(anchor),
                 "docket": parts[0],
                 "judge": self._get_judge_above_anchor(anchor),
                 "name": titlecase(parts[1]),
                 # The anchor's own text is removed from the summary.
                 "summary": " ".join(summary_lines).replace(text, ""),
                 "url": anchor.get("href"),
             }
             self.cases.append(case)
Example #10
0
    def _fetch_case_name(self, case_number):
        """Fetch case name for a given docket number + publication year pair.

        Returns the titlecased short title, or False when the case number is
        malformed or the search returns no usable rows.
        """

        # If case_number is not expected 12 characters, skip it, since
        # we can't know how to fix the courts typo. They likely forgot
        # to '0' pad the beginning or the end of the 'number' suffix,
        # but we can't know for sure.
        if len(case_number) != 12:
            return False

        url = "https://appellatepublic.kycourts.net/api/api/v1/cases/search"
        self.request["parameters"] = {
            "params": {
                "queryString": "true",
                "searchFields[0].searchType": "Starts With",
                "searchFields[0].operation": "=",
                "searchFields[0].values[0]": case_number,
                "searchFields[0].indexFieldName": "caseNumber",
            }
        }

        self._request_url_get(url)
        # Renamed from `json` to avoid shadowing the stdlib module name.
        response_json = self.request["response"].json()

        try:
            title = response_json["resultItems"][0]["rowMap"]["shortTitle"]
        except (IndexError, KeyError):
            # No results, or a response missing the expected keys.
            return False
        return titlecase(title)
Example #11
0
 def _get_case_names(self):
     """Return cleaned case names from the search results table."""
     path = ('//table[@id = "searchResults"]'
             '/tr[position() >= 3]/td[4]/a/text()')
     # Trailing bracketed notes such as "[ERRATA]" are cut off before
     # titlecasing.
     return [
         titlecase(raw.split('[')[0])
         for raw in self.html.xpath(path)
     ]
Example #12
0
 def _get_case_names(self):
     """Return titlecased case names from the configured results table."""
     path = '//*[@id="content2col"]/table[%s]/tr/td[3][.//a]' % self.table
     names = []
     for cell in self.html.xpath(path):
         # Collapse internal whitespace, then normalize the casing.
         squashed = ' '.join(cell.text_content().upper().split())
         names.append(titlecase(squashed))
     return names
Example #13
0
def normalize_address_info(address_info):
    """Normalize various address components"""

    # Titlecase where appropriate (state codes are left untouched).
    for k, v in address_info.items():
        if k == "state":
            continue
        address_info[k] = titlecase(v)

    # Abbreviate street designators (Street --> St., Avenue --> Ave., etc.).
    fixes = OrderedDict(
        (("Street", "St."), ("Avenue", "Ave."), ("Boulevard", "Blvd.")))
    for address_part in ["address1", "address2"]:
        a = address_info.get(address_part)
        if not a:
            continue

        # Whole-word, case-insensitive replacement of each designator.
        for bad, good in fixes.items():
            a = re.sub(r"\b%s\b" % bad, good, a, flags=re.IGNORECASE)

        address_info[address_part] = a

    # Nuke any zip code that's longer than allowed in the DB (usually caused by
    # phone numbers)
    zip_code_field = AttorneyOrganization._meta.get_field("zip_code")
    if len(address_info.get("zip_code", "")) > zip_code_field.max_length:
        address_info["zip_code"] = ""
    return address_info
Example #14
0
    def _fetch_case_name(self, case_number):
        """Fetch case name for a given docket number + publication year pair.

        Some resources show 'Public Access Restricted' messages and do not
        provide parseable case name information.  These will be skipped by
        our system by returning False below.  The only other approach would
        be to parse the case name from the raw PDF text itself.
        """
        # Skip malformed docket numbers outright: the court most likely
        # dropped a '0' pad somewhere in the 'number' suffix, and we cannot
        # reconstruct it reliably.
        if len(case_number) != 12:
            return False

        # The site serves a non-chained (bad) certificate, so SSL
        # verification has to be disabled for the request to succeed.
        self.request['verify'] = False

        url = 'https://appellate.kycourts.net/SC/SCDockets/CaseDetails.aspx?cn=%s' % case_number
        page = self._get_html_tree_by_url(url)

        # A dismissible alert on the page signals an invalid sub-resource.
        if page.xpath('//div[contains(@class, "alert-dismissible")]'):
            raise InsanityException('Invalid sub-resource url (%s). Is case number (%s) invalid?' % (url, case_number))

        # Exactly two party cells are expected.
        parties = page.xpath('//td[@class="party"]/text()')
        if len(parties) != 2:
            raise InsanityException('Unexpected party elements. Expected two substrings, got: %s' % ', '.join(parties))

        return titlecase(' v. '.join(parties))
Example #15
0
 def _get_case_names(self):
     """Return case names, stripping the trailing neutral citation."""
     path = "//table[@id = 'ContentPlaceHolder1_PageContent_gvOpinions']//tr[position() > 1]/td/a[contains(@href, 'pdf')]/text()"
     case_names = []
     # FIX: raw string — '\d' in a plain literal is an invalid escape
     # sequence (DeprecationWarning today, an error in future Pythons).
     # Also hoisted the compile out of the loop.
     citation_regex = re.compile(r'(.*)(\d{4} S\.?D\.? \d{1,4})', re.MULTILINE)
     for s in self.html.xpath(path):
         case_name = citation_regex.search(s).group(1)
         case_names.append(titlecase(case_name.upper()))
     return case_names
Example #16
0
 def extract_cases_from_html(self, html):
     """Parse bolded date headers and their sibling case lists from ``html``."""
     paths = '//p/strong | //p/b | //p/font/strong | //p/font/b'
     for date_element in html.xpath(paths):
         string = date_element.xpath('./text()')
         try:
             string = string[0]
             # handle examples where time but no date (ga_example_3.html)
             if ':' in string and ('AM' in string or 'PM' in string):
                 continue
             # handle legacy example (ga_example.html)
             string = string.split('SUMMARIES')[0]
             date_string = re.sub(r'\W+', ' ', string)
             # handle legacy example (ga_example.html)
             if len(date_string.split()) != 3:
                 continue
             case_date = convert_date_string(date_string)
         except Exception:
             # Not a parseable date header; skip. (Was a bare except, which
             # also swallowed KeyboardInterrupt/SystemExit.)
             continue
         parent = date_element.xpath('./..')[0]
         # handle legacy example (ga_example.html)
         while parent.tag != 'p':
             parent = parent.xpath('./..')[0]
         for item in parent.getnext().xpath('./li'):
             text = item.text_content()
             if text:
                 split = text.split('.', 1)
                 self.cases.append({
                     'date': case_date,
                     # BUG FIX: relative './/a' instead of document-wide
                     # '//a', which returned the same first link for every
                     # item.
                     'url': item.xpath('.//a[1]/@href')[0],
                     'docket': split[0].rstrip('.'),
                     'name': titlecase(split[1]),
                 })
    def _return_case_names(self, html_tree):
        """Return titlecased names from non-empty second-column cells."""
        cells = html_tree.xpath("%s/td[2]" % self.base_path)
        stripped = (cell.text_content().strip() for cell in cells)
        return [titlecase(name.lower()) for name in stripped if name]
Example #18
0
 def _get_case_names(self):
     """Return case names extracted via the pre-compiled regex."""
     path = "{base}/following::ul[1]//li//a[1]/text()".format(
         base=self.base_path)
     names = []
     for raw in self.html.xpath(path):
         # Group 2 holds the case-name portion of the anchor text.
         names.append(titlecase(self.regex.search(raw).group(2).lower()))
     return names
Example #19
0
def normalize_address_info(address_info):
    """Normalize various address components"""

    # Titlecase where appropriate (state codes are left untouched).
    for k, v in address_info.items():
        if k == 'state':
            continue
        address_info[k] = titlecase(v)

    # Abbreviate street designators (Street --> St., Avenue --> Ave., etc.).
    fixes = OrderedDict((
        ('Street', 'St.'),
        ('Avenue', 'Ave.'),
        ('Boulevard', 'Blvd.'),
    ))
    for address_part in ['address1', 'address2']:
        a = address_info.get(address_part)
        if not a:
            continue

        # Whole-word, case-insensitive replacement of each designator.
        for bad, good in fixes.items():
            a = re.sub(r'\b%s\b' % bad, good, a, flags=re.IGNORECASE)

        address_info[address_part] = a

    # Nuke any zip code that's longer than allowed in the DB (usually caused by
    # phone numbers)
    zip_code_field = AttorneyOrganization._meta.get_field('zip_code')
    if len(address_info.get('zip_code', '')) > zip_code_field.max_length:
        address_info['zip_code'] = ''
    return address_info
Example #20
0
    def _return_case_names(self, html_tree):
        """Return titlecased names from non-empty second-column cells."""
        cells = html_tree.xpath("%s/td[2]" % self.base_path)
        stripped = (cell.text_content().strip() for cell in cells)
        return [titlecase(name.lower()) for name in stripped if name]
def fixer(simulate=False, verbose=False):
    """Remove leading slashes by running the new and improved harmonize/clean_string scipts"""
    # NOTE(review): Python 2 only (print statements below).
    docs = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk and
                                    Citation.case_name like '(%%';''')

    for doc in docs:
        # Special cases
        if 'Klein' in doc.case_name:
            continue
        elif 'in re' in doc.case_name.lower():
            continue
        elif doc.case_name == "(White) v. Gray":
            # Hand-corrected name; note control flow falls through to the
            # generic fix below (harmless: no parens remain after the fix).
            doc.case_name = "White v. Gray"
            if not simulate:
                doc.save()


        # Otherwise, we nuke the leading parens.
        # NOTE(review): old_case_name is assigned but never used afterwards.
        old_case_name = doc.case_name
        new_case_name = titlecase(harmonize(clean_string(re.sub('\(.*?\)', '', doc.case_name, 1))))

        if verbose:
            print "Fixing document %s: %s" % (doc.pk, doc)
            print "        New for %s: %s\n" % (doc.pk, new_case_name)

        if not simulate:
            # NOTE(review): case_name is set on doc but only citation is
            # saved — presumably case_name proxies to the citation; confirm.
            doc.case_name = new_case_name
            doc.citation.save()
Example #22
0
 def extract_cases_from_html(self, html):
     """Parse bolded date headers and their sibling case lists from ``html``."""
     paths = "//p/strong | //p/b | //p/font/strong | //p/font/b"
     for date_element in html.xpath(paths):
         string = date_element.xpath("./text()")
         try:
             string = string[0]
             # handle examples where time but no date (ga_example_3.html)
             if ":" in string and ("AM" in string or "PM" in string):
                 continue
             # handle legacy example (ga_example.html)
             string = string.split("SUMMARIES")[0]
             date_string = re.sub(r"\W+", " ", string)
             # handle legacy example (ga_example.html)
             if len(date_string.split()) != 3:
                 continue
             case_date = convert_date_string(date_string)
         except Exception:
             # Not a parseable date header; skip. (Was a bare except, which
             # also swallowed KeyboardInterrupt/SystemExit.)
             continue
         parent = date_element.xpath("./..")[0]
         # handle legacy example (ga_example.html)
         while parent.tag != "p":
             parent = parent.xpath("./..")[0]
         for item in parent.getnext().xpath("./li"):
             text = item.text_content()
             if text:
                 split = text.split(".", 1)
                 self.cases.append({
                     "date": case_date,
                     # BUG FIX: relative './/a' instead of document-wide
                     # '//a', which returned the same first link for every
                     # item.
                     "url": item.xpath(".//a[1]/@href")[0],
                     "docket": split[0].rstrip("."),
                     "name": titlecase(split[1]),
                 })
Example #23
0
def fixer(simulate=False, verbose=False):
    """Remove leading slashes by running the new and improved harmonize/clean_string scipts"""
    # NOTE(review): Python 2 only (print statements below).
    docs = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk and
                                    Citation.case_name like '(%%';''')

    for doc in docs:
        # Special cases
        if 'Klein' in doc.case_name:
            continue
        elif 'in re' in doc.case_name.lower():
            continue
        elif doc.case_name == "(White) v. Gray":
            # Hand-corrected name; note control flow falls through to the
            # generic fix below (harmless: no parens remain after the fix).
            doc.case_name = "White v. Gray"
            if not simulate:
                doc.save()

        # Otherwise, we nuke the leading parens.
        # NOTE(review): old_case_name is assigned but never used afterwards.
        old_case_name = doc.case_name
        new_case_name = titlecase(
            harmonize(clean_string(re.sub('\(.*?\)', '', doc.case_name, 1))))

        if verbose:
            print "Fixing document %s: %s" % (doc.pk, doc)
            print "        New for %s: %s\n" % (doc.pk, new_case_name)

        if not simulate:
            # NOTE(review): case_name is set on doc but only citation is
            # saved — presumably case_name proxies to the citation; confirm.
            doc.case_name = new_case_name
            doc.citation.save()
Example #24
0
 def _get_lower_courts(self):
     """Return lower-court names, with '' for rows lacking one."""
     results = []
     for s in self.html.xpath('//li[@class = "casedetailsleft"]//li[@class="lowerCourt"]/text()'):
         # Skip the first two tokens (apparently a label) and keep the rest.
         pieces = s.strip().split(' ', 2)
         try:
             results.append(titlecase(pieces[2]))
         except IndexError:
             results.append('')
     return results
Example #25
0
 def _get_dispositions(self):
     """Return the titlecased disposition text from column 8 of each
     populated row."""
     # FIX: renamed the accumulator from `docket_numbers` — this method
     # collects dispositions, and the old name was misleading.
     dispositions = []
     for e in self.html.xpath(
             '//tr/td[@class="DocumentBrowserCell"][8][../*[4]//text()]//nobr'
     ):
         s = html.tostring(e, method='text', encoding='unicode')
         dispositions.append(titlecase(s))
     return dispositions
Example #26
0
 def _get_case_names(self):
     """Return case names, expanding the 'People of Mi' abbreviation."""
     path = ('//li[@class="title1" and '
             'not(ancestor::ul[@class="result-header"])]/a/text()')
     names = []
     for raw in self.html.xpath(path):
         name = titlecase(raw)
         if 'People of Mi ' in name:
             name = name.replace('People of Mi ', 'People of Michigan ')
         names.append(name)
     return names
Example #27
0
 def _get_case_names(self):
     """Return case names parsed out of PDF link text."""
     path = "%s/a[contains(@href, 'pdf')]/text()" % self.base_path
     names = []
     for link_text in self.html.xpath(path):
         # Fall back to the second regex group when the first is empty.
         case_name = (self.extract_regex_group(1, link_text)
                      or self.extract_regex_group(2, link_text, True))
         names.append(titlecase(case_name.upper()))
     return names
Example #28
0
 def _get_case_names(self):
     """Return case names parsed out of PDF link text."""
     path = "%s/a[contains(@href, 'pdf')]/text()" % self.base_path
     names = []
     for link_text in self.html.xpath(path):
         # Fall back to the second regex group when the first is empty.
         case_name = (self.extract_regex_group(1, link_text)
                      or self.extract_regex_group(2, link_text, True))
         names.append(titlecase(case_name.upper()))
     return names
Example #29
0
 def _get_lower_courts(self):
     """Return the lower court for each case row ('' when absent)."""
     lower_courts = []
     for el in self.html.xpath("//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
         try:
             # BUG FIX: the inner path must be relative ('.//li'); an
             # absolute '//li' evaluates from the document root, so every
             # row got the first matching row's lower court.
             s = el.xpath('.//li[@class="casedetailsleft"]'
                          '//li[@class="lowerCourt"]/text()')
             lower_courts.append(titlecase(clean_if_py3(s[0]).strip().split(' ', 2)[2]))
         except IndexError:
             lower_courts.append('')
     return lower_courts
Example #30
0
 def _get_case_names(self):
     """Return case names from feed item titles.

     Text containing a ' • ' separator keeps only the portion after it
     (presumably the case name). Python 2: encode() yields a byte string.
     """
     case_names = []
     for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
         t = ' '.join(t.split())  # Normalize whitespace
         if t.strip():
             # If there is something other than whitespace...
             t = t.encode('utf8')
             # BUG FIX: guard the split — titles without a ' • ' separator
             # previously raised IndexError (same guard used by the newer
             # version of this scraper).
             if ' • ' in t:
                 t = t.split(' • ')[1].strip()
             t = titlecase(t.lower())
             case_names.append(t)
     return case_names
Example #31
0
 def _get_lower_courts(self):
     """Return the lower court for each case row ('' when absent)."""
     lower_courts = []
     for el in self.html.xpath('//ul[@class="odd" or @class="even"]'):
         try:
             # BUG FIX: relative path; an absolute '//li' evaluated from
             # the document root and returned the same value for every row.
             s = el.xpath('.//li[@class="casedetailsleft"]'
                          '//li[@class="lowerCourt"]/text()')
             lower_courts.append(titlecase(s[0].strip().split(' ', 2)[2]))
         except IndexError:
             lower_courts.append('')
     return lower_courts
Example #32
0
 def _get_lower_courts(self):
     """Return the lower court for each case row ("" when absent)."""
     lower_courts = []
     for el in self.html.xpath(
             "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
         try:
             # BUG FIX: relative path; an absolute '//li' evaluated from
             # the document root and returned the same value for every row.
             s = el.xpath('.//li[@class="casedetailsleft"]'
                          '//li[@class="lowerCourt"]/text()')
             lower_courts.append(
                 titlecase(clean_if_py3(s[0]).strip().split(" ", 2)[2]))
         except IndexError:
             lower_courts.append("")
     return lower_courts
Example #33
0
        def fetcher(url):
            """Scrape plaintiff/defendant from ``url`` and join as a case name."""
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                plaintiff = ''
                defendant = ''
                try:
                    plaintiff = html_tree.xpath(
                        "//text()[contains(., 'Style:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
                    )[0]
                    defendant = html_tree.xpath(
                        "//text()[contains(., 'v.:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
                    )[0]
                except IndexError:
                    # FIX: logger.warn() is deprecated; use warning().
                    logger.warning("No title or defendant found for {}".format(url))

                if defendant.strip():
                    # If there's a defendant
                    return titlecase('%s v. %s' % (plaintiff, defendant))
                else:
                    return titlecase(plaintiff)
Example #34
0
        def fetcher(url):
            """Scrape plaintiff/defendant from ``url`` and join as a case name."""
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                plaintiff = ''
                defendant = ''
                try:
                    plaintiff = html_tree.xpath(
                        "//text()[contains(., 'Style:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
                    )[0]
                    defendant = html_tree.xpath(
                        "//text()[contains(., 'v.:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
                    )[0]
                except IndexError:
                    # FIX: logger.warn() is deprecated; use warning().
                    logger.warning("No title or defendant found for {}".format(url))

                if defendant.strip():
                    # If there's a defendant
                    return titlecase('%s v. %s' % (plaintiff, defendant))
                else:
                    return titlecase(plaintiff)
Example #35
0
 def _get_case_names(self):
     """Return one case name per document URL in each table row."""
     rowpath = "//table[2]//tr[position()>0]"
     cnpath = "./td[2]//text()[preceding-sibling::br]"
     urlpath = "./td[3]/a/@href"
     names = []
     for row in self.html.xpath(rowpath):
         # Each row may carry several URLs; repeat the case name so the
         # name list lines up with the URL list.
         url_count = len(row.xpath(urlpath))
         for rough_name in row.xpath(cnpath):
             names.extend([titlecase(rough_name.lower())] * url_count)
     return names
Example #36
0
    def _get_case_names(self):
        """Return case names from feed item titles.

        Text containing a ' • ' separator keeps only the portion after it
        (presumably the case name).
        """
        case_names = []
        for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
            t = ' '.join(clean_if_py3(t).split())  # Normalize whitespace
            if t.strip():
                # If there is something other than whitespace...
                # Py2/py3 shim: ensure we hold a text (unicode) string.
                if not isinstance(t, six.string_types):
                    t = str(t, encoding='utf-8')

                if u' • ' in t:
                    t = t.split(u' • ')[1].strip()
                t = titlecase(t.lower())
                case_names.append(t)
        return case_names
Example #37
0
 def _get_case_names(self):
     """Return one case name per document URL in each table row."""
     rowpath = '//table[2]//tr[position()>0]'
     cnpath = './td[2]//text()[preceding-sibling::br]'
     urlpath = './td[3]/a/@href'
     names = []
     for row in self.html.xpath(rowpath):
         # Each row may carry several URLs; repeat the case name so the
         # name list lines up with the URL list.
         url_count = len(row.xpath(urlpath))
         for rough_name in row.xpath(cnpath):
             names.extend([titlecase(rough_name.lower())] * url_count)
     return names
Example #38
0
    def _get_case_names(self):
        """Return case names from feed item titles.

        Text containing a ' • ' separator keeps only the portion after it
        (presumably the case name).
        """
        case_names = []
        for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
            t = ' '.join(clean_if_py3(t).split())  # Normalize whitespace
            if t.strip():
                # If there is something other than whitespace...
                # Py2/py3 shim: ensure we hold a text (unicode) string.
                if not isinstance(t, six.string_types):
                    t = str(t, encoding='utf-8')

                if u' • ' in t:
                    t = t.split(u' • ')[1].strip()
                t = titlecase(t.lower())
                case_names.append(t)
        return case_names
Example #39
0
    def _process_html(self):
        """Parse bolded date headers and their sibling case lists from
        self.html, appending one case dict per list item."""
        paths = "//p/strong | //p/b | //p/font/strong | //p/font/b"
        for date_element in self.html.xpath(paths):
            string = date_element.xpath("./text()")
            try:
                string = string[0]
                # handle examples where time but no date (ga_example_3.html)
                if ":" in string and ("AM" in string or "PM" in string):
                    continue
                # handle legacy example (ga_example.html)
                string = string.split("SUMMARIES")[0]
                date_string = re.sub(r"\W+", " ", string)
                # handle legacy example (ga_example.html)
                if len(date_string.split()) != 3:
                    continue
            except Exception:
                # Not a parseable date header; skip. (Was a bare except,
                # which also swallowed KeyboardInterrupt/SystemExit.)
                continue
            parent = date_element.xpath("./..")[0]
            # handle legacy example (ga_example.html)
            while parent.tag != "p":
                parent = parent.xpath("./..")[0]
            for item in parent.getnext().xpath("./li"):
                text = item.text_content()
                if text:
                    # Extract Docket numbers
                    dockets = re.findall(self.regex_docket, text)
                    if not dockets:
                        # BUG FIX: the format string was missing its '%s'
                        # placeholder ("in: 's'"), so applying '%' raised a
                        # TypeError instead of producing this message.
                        raise InsanityException(
                            "Could not find docket numbers in: '%s'" % text)

                    # Extract name substring; I am sure this could
                    # be done with a more slick regex, but its not
                    # my forte...
                    name = text
                    for docket in dockets:
                        name = name.replace(docket, "")
                    name = name.lstrip(" .,")

                    self.cases.append({
                        "date": date_string,
                        "docket": ", ".join(dockets),
                        "name": titlecase(name.lstrip(" .,")),
                        "url": item.xpath(".//a[1]/@href")[0],
                    })
Example #40
0
    def _get_case_names(self):
        """ This example demonstrates how to extract text from an element that
            may contain other elements.

            For example, this will work well on something like:
               <strong>Nadim v. <em>Jenny</em></strong>

            Resulting in text like:
               Nadim v. Jenny

            Note that titlecase() should be used here in the case that the case
            names are provided in uppercase. Use the titlecase function on
            cases where the name is provided in uppercase only.
        """
        return [
            titlecase(html.tostring(e, method='text', encoding='unicode'))
            for e in self.html.xpath('//path/to/an/element/p')
        ]
Example #41
0
        def fetcher(e):
            """This reaches out to a secondary system and scrapes the correct
             info.
             """
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                url = 'http://162.114.92.78/dockets/SearchCaseDetail.asp'
                # Pull the docket-number pieces out of the anchor's text.
                anchor_text = html.tostring(e,
                                            method='text',
                                            encoding='unicode')
                m = self.docket_number_regex.search(anchor_text)

                r = requests.post(
                    url,
                    headers={'User-Agent': 'Juriscraper'},
                    data={
                        'txtyear': m.group('year'),
                        # strip('0') removes zeros from BOTH ends —
                        # presumably de-padding for the remote form; confirm.
                        'txtcasenumber': m.group('docket_num').strip('0'),
                        'cmdnamesearh': 'Search',
                    },
                )

                # Throw an error if a bad status code is returned.
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
                if r.encoding == 'ISO-8859-1':
                    r.encoding = 'cp1252'

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)

                # And finally, we parse out the good stuff.
                parties_path = "//tr[descendant::text()[contains(., 'Appell')]]//td[3]//text()"
                case_name_parts = []
                for s in html_tree.xpath(parties_path):
                    if s.strip():
                        case_name_parts.append(titlecase(s.strip().lower()))
                    # Stop once two non-empty party strings are captured.
                    if len(case_name_parts) == 2:
                        break
                return ' v. '.join(case_name_parts)
Example #42
0
    def _add_case_names(self, html_tree):
        """Extract a titlecased case name from each result link.

        For each anchor returned by ``_get_links_from_html``, the anchor
        text is whitespace-normalized and the case name is taken as
        whatever follows the last docket-number-like token
        (``\\d+-\\w{1,2}-\\d+``).  If nothing follows the docket number,
        the name is recovered from the anchor's parent ``<p>`` text; as
        a last resort the whole anchor text is used.

        :param html_tree: lxml tree of the results page.
        :returns: list of case-name strings.
        """
        case_names = []
        for element in self._get_links_from_html(html_tree):
            text = ' '.join(list(element.xpath(".//text()")))
            text = ' '.join(text.split())
            if not text:
                continue
            try:
                case_name = re.search(
                    r'(\d+ ?-\w{1,2} ?- ?\d+)(?!.*\d+ ?-\w{1,2} ?- ?\d+)\s*(.*)',
                    text,
                ).group(2)
                if not case_name:
                    # The name lives in the parent paragraph, outside
                    # the anchor.  BUG FIX: xpath() returns a *list* of
                    # text nodes, so it must be joined before calling
                    # str.split(); the old code called .split() on the
                    # list and always fell through to the fallback.
                    name_element = element.xpath("./parent::p[1]/text()")
                    parent_text = ' '.join(' '.join(name_element).split())
                    case_name = re.search(r'(\w+.+)+', parent_text).group(1)
            except AttributeError:
                # Either no docket number matched or the parent <p> had
                # no usable text; fall back to the full anchor text.
                case_name = re.search(r'(\w+.+)+', text).group(1)
            case_names.append(titlecase(case_name))

        return case_names
Example #43
0
    def _get_case_names(self):
        """Collect the titlecased case name from each result paragraph.

        Using ``html.tostring(..., method='text')`` extracts the text of
        the element *including* text nested inside child elements, e.g.
        ``<strong>Nadim v. <em>Jenny</em></strong>`` -> ``Nadim v. Jenny``.
        ``titlecase`` normalizes names published in all caps.
        """
        names = []
        for node in self.html.xpath('//path/to/an/element/p'):
            flattened = html.tostring(node, method='text', encoding='unicode')
            names.append(titlecase(flattened))
        return names
Example #44
0
    def _add_case_names(html_tree):
        """Extract a titlecased case name from each candidate link.

        Anchors are taken from paragraphs whose text looks like a case
        caption ('v.', 'vs.', 'IN RE', 'IN THE').  The case name is
        whatever follows the last docket-number-like token; if nothing
        follows it, the parent paragraph's text is used, and the full
        anchor text is the final fallback.

        :param html_tree: lxml tree of the page.
        :returns: list of case-name strings.
        """
        case_names = []
        for element in html_tree.xpath("//p[contains(., 'v.') or contains(., 'IN RE') or "
                                       "contains(., 'IN THE') or contains(., 'vs.')]//a"):
            text = ' '.join(list(element.xpath(".//text()")))
            text = ' '.join(text.split())
            if not text:
                continue
            try:
                # Raw strings added: the old patterns used bare '\d'
                # escapes, which are invalid escape sequences on
                # modern Python.
                case_name = re.search(
                    r'(\d+ ?-\w{1,2} ?- ?\d+)(?!.*\d+ ?-\w{1,2} ?- ?\d+)\s*(.*)',
                    text,
                ).group(2)
                if not case_name:
                    # BUG FIX: xpath() returns a *list* of text nodes;
                    # the old code called .split() on the list, which
                    # always raised AttributeError and skipped this
                    # branch.  Join the nodes first.
                    name_element = element.xpath("./parent::p[1]/text()")
                    parent_text = ' '.join(' '.join(name_element).split())
                    case_name = re.search(r'(\w+.+)+', parent_text).group(1)
            except AttributeError:
                # No docket number matched (or parent text unusable);
                # fall back to the full anchor text.
                case_name = re.search(r'(\w+.+)+', text).group(1)
            case_names.append(titlecase(case_name))

        return case_names
Example #45
0
        def fetcher(e):
            """Look up the case name for anchor ``e`` on the court's
            secondary docket-search system.

            Returns a canned string in LOCAL (test) mode so the test
            suite never makes network requests.
            """
            if self.method == 'LOCAL':
                return "No case names fetched during tests."

            match = self.docket_number_regex.search(
                html.tostring(e, method='text', encoding='unicode'))

            search_form = {
                'txtyear': match.group('year'),
                'txtcasenumber': match.group('docket_num').strip('0'),
                'cmdnamesearh': 'Search',
            }
            response = requests.post(
                'http://162.114.92.78/dockets/SearchCaseDetail.asp',
                headers={'User-Agent': 'Juriscraper'},
                data=search_form,
            )
            # Raise on HTTP error statuses.
            response.raise_for_status()

            # cp1252 is a superset of iso-8859-1; switching avoids
            # mojibake for characters like smart quotes.
            if response.encoding == 'ISO-8859-1':
                response.encoding = 'cp1252'

            cleaned = self._clean_text(response.text)
            html_tree = html.fromstring(cleaned)

            # The first two non-blank party cells form the caption.
            parties_path = ("//tr[descendant::text()"
                            "[contains(., 'Appell')]]//td[3]//text()")
            name_parts = []
            for cell in html_tree.xpath(parties_path):
                if cell.strip():
                    name_parts.append(titlecase(cell.strip().lower()))
                if len(name_parts) == 2:
                    break
            return ' v. '.join(name_parts)
Example #46
0
def get_case_name(complete_html_tree, case_path):
    """Pull the case name out of the page's ``<title>`` element.

    Title text looks like:
        'In re 221A Holding Corp., Inc, 1 BR 506 - Dist. Court, ED Pennsylvania 1979'
    Everything after the last ' - ' and the last ',' is trimmed, leaving
    just the caption.  When the title yields nothing, fall back to the
    fixes file, or — in interactive debug mode — prompt the operator
    and record the answer as a fix.  (Python 2 code: uses ``raw_input``
    and ``unicode``.)
    """
    path = "//head/title/text()"
    title_text = complete_html_tree.xpath(path)[0]
    s = title_text.rsplit("-", 1)[0].rsplit(",", 1)[0]
    case_name = harmonize(clean_string(titlecase(s)))
    if not s:
        try:
            # Prefer a previously-recorded manual fix.
            case_name = fixes[case_path]["case_name"]
        except KeyError:
            if "input_case_names" in DEBUG:
                if "firefox" in DEBUG:
                    # Show the page so the operator can read the name.
                    subprocess.Popen(
                        ["firefox", "file://%s" % case_path],
                        shell=False,
                    ).communicate()
                input_case_name = raw_input("  No case name found. What should be here? ")
                input_case_name = unicode(input_case_name)
                add_fix(case_path, {"case_name": input_case_name})
                case_name = input_case_name

    if "case_name" in DEBUG:
        log_print("  Case name: %s" % case_name)
    return case_name
Example #47
0
    def _extract_cases_from_html(self, html):
        """Build list of data dictionaries, one dictionary per case
        (table row).

        Rows missing any of date, docket, url, name or status are
        skipped entirely.
        """
        self.cases = []

        for row in html.xpath('//table/tbody/tr'):
            date = convert_date_string(row.xpath('td[1]/span/text()')[0])
            docket = row.xpath('td[2]/text()')[0].strip()

            url = False
            url_raw = row.xpath('td[4]/a/@href')
            if url_raw:
                url = url_raw[0]

            name = False
            name_raw = row.xpath('td[4]/a/text()')
            if name_raw:
                # Keep only the part of the link text before any '['
                # (presumably a bracketed annotation — TODO confirm).
                name = titlecase(name_raw[0].split('[')[0].strip())

            status = False
            status_raw = row.xpath('td[5]/text()')
            if status_raw:
                label = status_raw[0].strip().lower()
                # Test 'nonprecedential' first since 'precedential' is
                # a substring of it.
                if 'nonprecedential' in label:
                    status = 'Unpublished'
                elif 'precedential' in label:
                    status = 'Published'
                else:
                    status = 'Unknown'

            if date and docket and url and name and status:
                self.cases.append({
                    'date': date,
                    'docket': docket,
                    'url': url,
                    'name': name,
                    'status': status,
                })
Example #48
0
 def _get_case_names(self):
     """Return the titlecased link text from the first table cell of
     each row under ``self.base``."""
     xpath = '{base}/td[1]/a/text()'.format(base=self.base)
     names = []
     for raw in self.html.xpath(xpath):
         names.append(titlecase(raw))
     return names
Example #49
0
 def _get_dispositions(self):
     """Return the disposition text (fourth cell of each row),
     titlecased."""
     xpath = "{base}/td[4]/text()".format(base=self.base_path)
     return list(map(titlecase, self.html.xpath(xpath)))
Example #50
0
 def _get_case_names(self):
     """Return the titlecased case name from the second cell of each
     row under ``self.base_path``."""
     cells = self.html.xpath(
         "{base}/td[2]/text()".format(base=self.base_path))
     return [titlecase(cell) for cell in cells]
Example #51
0
 def _get_judges(self):
     """Return the panel members, lowercased then titlecased so
     all-caps judge names render normally."""
     xpath = "//*[contains(concat(' ',@id,' '),' case_panel')]/text()"
     judges = []
     for raw in self.html.xpath(xpath):
         judges.append(titlecase(raw.lower()))
     return judges
Example #52
0
 def test_titlecase(self):
     """Tests various inputs for the titlecase function.

     Each pair is [raw_input, expected_titlecased_output]; the expected
     values are unicode literals, and several pairs deliberately probe
     unicode quotes, acronyms (AT&T, LLC), 'v.'/'vs.' handling, and
     small-word capitalization rules.
     """
     test_pairs = [
         ["Q&A with steve jobs: 'that's what happens in technology'",
          u"Q&A With Steve Jobs: 'That's What Happens in Technology'"],
         ["What is AT&T's problem?",
          u"What is AT&T's Problem?"],
         ['Apple deal with AT&T falls through',
          u'Apple Deal With AT&T Falls Through'],
         ['this v that',
          u'This v That'],
         ['this v. that',
          u'This v. That'],
         ['this vs that',
          u'This vs That'],
         ['this vs. that',
          u'This vs. That'],
         ["The SEC's Apple Probe: What You Need to Know",
          u"The SEC's Apple Probe: What You Need to Know"],
         ["'by the Way, small word at the start but within quotes.'",
          u"'By the Way, Small Word at the Start but Within Quotes.'"],
         ['Small word at end is nothing to be afraid of',
          u'Small Word at End is Nothing to Be Afraid Of'],
         ['Starting Sub-Phrase With a Small Word: a Trick, Perhaps?',
          u'Starting Sub-Phrase With a Small Word: A Trick, Perhaps?'],
         ["Sub-Phrase With a Small Word in Quotes: 'a Trick, Perhaps?'",
          u"Sub-Phrase With a Small Word in Quotes: 'A Trick, Perhaps?'"],
         ['Sub-Phrase With a Small Word in Quotes: "a Trick, Perhaps?"',
          u'Sub-Phrase With a Small Word in Quotes: "A Trick, Perhaps?"'],
         ['"Nothing to Be Afraid of?"',
          u'"Nothing to Be Afraid Of?"'],
         ['"Nothing to be Afraid Of?"',
          u'"Nothing to Be Afraid Of?"'],
         ['a thing',
          u'A Thing'],
         ["2lmc Spool: 'gruber on OmniFocus and vapo(u)rware'",
          u"2lmc Spool: 'Gruber on OmniFocus and Vapo(u)rware'"],
         ['this is just an example.com',
          u'This is Just an example.com'],
         ['this is something listed on del.icio.us',
          u'This is Something Listed on del.icio.us'],
         ['iTunes should be unmolested',
          u'iTunes Should Be Unmolested'],
         ['Reading between the lines of steve jobs’s ‘thoughts on music’',
          # Tests unicode
          u'Reading Between the Lines of Steve Jobs’s ‘thoughts on Music’'],
         ['seriously, ‘repair permissions’ is voodoo',  # Tests unicode
          u'Seriously, ‘repair Permissions’ is Voodoo'],
         [
             'generalissimo francisco franco: still dead; kieren McCarthy: '
             'still a jackass',
             u'Generalissimo Francisco Franco: Still Dead; Kieren McCarthy:'
             u' Still a Jackass'],
         ['Chapman v. u.s. Postal Service',
          u'Chapman v. U.S. Postal Service'],
         ['Spread Spectrum Screening Llc. v. Eastman Kodak Co.',
          u'Spread Spectrum Screening LLC. v. Eastman Kodak Co.'],
         [
             'Consolidated Edison Co. of New York, Inc. v. Entergy Nuclear '
             'Indian Point 2, Llc.',
             u'Consolidated Edison Co. of New York, Inc. v. Entergy Nuclear'
             u' Indian Point 2, LLC.'],
         ['Infosint s.a. v. H. Lundbeck A/s',
          u'Infosint S.A. v. H. Lundbeck A/S'],
         ["KEVIN O'CONNELL v. KELLY HARRINGTON",
          u"Kevin O'Connell v. Kelly Harrington"],
         ['International Union of Painter v. J&r Flooring, Inc',
          u'International Union of Painter v. J&R Flooring, Inc'],
         [
             'DOROTHY L. BIERY, and JERRAMY and ERIN PANKRATZ v. THE UNITED'
             ' STATES 07-693L And',
             u'Dorothy L. Biery, and Jerramy and Erin Pankratz v. the '
             u'United States 07-693l And'],
         ['CARVER v. US',
          u'Carver v. US']]
     for raw, expected in test_pairs:
         self.assertEqual(titlecase(force_unicode(raw)), expected)
Example #53
0
 def _get_case_names(self):
     """Titlecase each case name found in the second table cell."""
     raw_names = self.html.xpath('//table/td[2]/text()')
     return [titlecase(name) for name in raw_names]
Example #54
0
def format_case_name(n):
    """Lowercase the name, harmonize it (standard case-name cleanup),
    then titlecase the result."""
    lowered = n.lower()
    return titlecase(harmonize(lowered))