def fetcher(url):
    # Note: defined inside a method, so `self` comes from the enclosing scope.
    if self.method == 'LOCAL':
        return "No case names fetched during tests."
    else:
        r = requests.get(
            url,
            allow_redirects=True,
            headers={'User-Agent': 'Juriscraper'},
        )
        r.raise_for_status()

        html_tree = html.fromstring(r.text)
        html_tree.make_links_absolute(self.url)

        plaintiff = html_tree.xpath(
            "//text()[contains(., 'Style:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
        )[0]
        defendant = html_tree.xpath(
            "//text()[contains(., 'v.:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
        )[0]

        if defendant.strip():
            # If there's a defendant
            return titlecase('%s v. %s' % (plaintiff, defendant))
        else:
            return titlecase(plaintiff)
def split_name_title(judge):
    """Split a value from PACER and return the title and name"""
    judge = judge.replace(',', '')
    words = judge.lower().split()

    # Nuke bad junk (punct., j, blacklist, etc.)
    clean_words = []
    for i, w in enumerate(words):
        if any(['(' in w,
                ')' in w,
                w.startswith('-'),
                w.startswith('~'),
                (len(w) > 2 and '.' in w and w not in ['jr.', 'sr.']),
                (i == 0 and w == 'j.'),
                w in blacklist]):
            continue
        clean_words.append(w)

    title_words = []
    name_words = []
    for w in clean_words:
        if w in titles:
            title_words.append(w)
        else:
            name_words.append(w)
    title = normalize_judge_titles(titlecase(' '.join(title_words)))
    name = titlecase(' '.join(name_words))

    return name, title
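# A minimal doctest-style usage sketch for split_name_title above, not part of
# the original source: it assumes the module-level `titles` and `blacklist`
# collections and the titlecase/normalize_judge_titles helpers are in scope,
# and the sample judge string is hypothetical.
#
#   >>> split_name_title('Henry F. Floyd, Chief Judge')
#   ('Henry F. Floyd', 'Chief Judge')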
def _get_case_names(self):
    case_names = []
    for case_string in self.html.xpath(
            '//table[@id = "searchResults"]/tr[position() >= 3]/td[4]/a/text()'):
        # Takes care of things like [ERRATA] that are often on the end of
        # case names.
        case_names.append(titlecase(case_string.split('[')[0]))
    return case_names
def get_case_name(complete_html_tree, case_path):
    path = '//head/title/text()'
    # Text looks like: 'In re 221A Holding Corp., Inc, 1 BR 506 - Dist.
    # Court, ED Pennsylvania 1979'
    s = complete_html_tree.xpath(path)[0].rsplit('-', 1)[0].rsplit(',', 1)[0]
    # returns 'In re 221A Holding Corp., Inc.'
    case_name = harmonize(clean_string(titlecase(s)))
    if not s:
        try:
            case_name = fixes[case_path]['case_name']
        except KeyError:
            if 'input_case_names' in DEBUG:
                if 'firefox' in DEBUG:
                    subprocess.Popen(
                        ['firefox', 'file://%s' % case_path],
                        shell=False).communicate()
                input_case_name = raw_input(
                    '  No case name found. What should be here? ')
                input_case_name = unicode(input_case_name)
                add_fix(case_path, {'case_name': input_case_name})
                case_name = input_case_name

    if 'case_name' in DEBUG:
        log_print("  Case name: %s" % case_name)
    return case_name
def normalize_address_info(address_info): """Normalize various address components""" # Titlecase where appropriate for k, v in address_info.items(): if k == 'state': continue address_info[k] = titlecase(v) # Normalize street abbreviations (St --> Street, etc.) fixes = OrderedDict(( ('Street', 'St.'), ('Avenue', 'Ave.'), ('Boulevard', 'Blvd.'), )) for address_part in ['address1', 'address2']: a = address_info.get(address_part) if not a: continue for bad, good in fixes.items(): a = re.sub(r'\b%s\b' % bad, good, a, flags=re.IGNORECASE) address_info[address_part] = a # Nuke any zip code that's longer than allowed in the DB (usually caused by # phone numbers) zip_code_field = AttorneyOrganization._meta.get_field('zip_code') if len(address_info.get('zip_code', '')) > zip_code_field.max_length: address_info['zip_code'] = '' return address_info
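# Illustrative doctest-style sketch (not from the original source) of what
# normalize_address_info is expected to produce, assuming the hypothetical
# AttorneyOrganization model allows a typical short zip_code field; the
# address values are made up.
#
#   >>> normalize_address_info({'address1': '1600 pennsylvania avenue',
#   ...                         'city': 'washington', 'state': 'DC',
#   ...                         'zip_code': '20500'})
#   {'address1': '1600 Pennsylvania Ave.', 'city': 'Washington',
#    'state': 'DC', 'zip_code': '20500'}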
def fetcher(url):
    # Note: defined inside a method, so `self` comes from the enclosing scope.
    r = requests.get(url,
                     allow_redirects=False,
                     headers={'User-Agent': 'Juriscraper'})
    r.raise_for_status()

    html_tree = html.fromstring(r.text)
    html_tree.make_links_absolute(self.url)

    plaintiff = html_tree.xpath(
        "//text()[contains(., 'Style')]/ancestor::tr[1]/td[2]/text()")[0]
    defendant = html_tree.xpath(
        "//text()[contains(., 'v.:')]/ancestor::tr[1]/td[2]/text()")[0]

    if defendant.strip():
        # If there's a defendant
        return titlecase('%s v. %s' % (plaintiff, defendant))
    else:
        return titlecase(plaintiff)
def extract_cases_from_html(self, html):
    paths = '//p/strong | //p/b | //p/font/strong | //p/font/b'
    for date_element in html.xpath(paths):
        string = date_element.xpath('./text()')
        try:
            string = string[0]
            # handle legacy example (ga_example.html)
            string = string.split('SUMMARIES')[0]
            date_string = re.sub(r'\W+', ' ', string)
            # handle legacy example (ga_example.html)
            if len(date_string.split()) != 3:
                continue
            case_date = convert_date_string(date_string)
        except:
            continue
        parent = date_element.xpath('./..')[0]
        # handle legacy example (ga_example.html)
        while parent.tag != 'p':
            parent = parent.xpath('./..')[0]
        for item in parent.getnext().xpath('./li'):
            text = item.text_content()
            if text:
                split = text.split('.', 1)
                self.cases.append({
                    'date': case_date,
                    'url': item.xpath('//a[1]/@href')[0],
                    'docket': split[0].rstrip('.'),
                    'name': titlecase(split[1]),
                })
def _process_html(self): path = ("//a[" "contains(., 'v.') or " "contains(., 'IN RE') or " "contains(., 'IN THE') or " "contains(., 'vs.') or " "contains(., 'VS.')" "]") for html in self.html: for anchor in html.xpath(path): date_string = self._get_date_above_anchor(anchor) text = anchor.text_content() parts = text.split(None, 1) summary_lines = anchor.getparent().xpath("./text()") self.cases.append({ "date": date_string, "docket": parts[0], "judge": self._get_judge_above_anchor(anchor), "name": titlecase(parts[1]), "summary": " ".join(summary_lines).replace(text, ""), "url": anchor.get("href"), })
def _fetch_case_name(self, case_number): """Fetch case name for a given docket number + publication year pair.""" # If case_number is not expected 12 characters, skip it, since # we can't know how to fix the courts typo. They likely forgot # to '0' pad the beginning or the end of the 'number' suffix, # but we can't know for sure. if len(case_number) != 12: return False url = "https://appellatepublic.kycourts.net/api/api/v1/cases/search" self.request["parameters"] = { "params": { "queryString": "true", "searchFields[0].searchType": "Starts With", "searchFields[0].operation": "=", "searchFields[0].values[0]": case_number, "searchFields[0].indexFieldName": "caseNumber", } } self._request_url_get(url) json = self.request["response"].json() try: title = json["resultItems"][0]["rowMap"]["shortTitle"] except IndexError: return False return titlecase(title)
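# Hedged sketch (not from the original source) of the JSON shape the search
# endpoint above appears to return, inferred solely from the keys accessed
# in _fetch_case_name; the title value is invented:
#
#   {"resultItems": [{"rowMap": {"shortTitle": "Smith v. Jones", ...}}, ...]}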
def _get_case_names(self):
    titles = []
    path = '//*[@id="content2col"]/table[%s]/tr/td[3][.//a]' % self.table
    for element in self.html.xpath(path):
        title = ' '.join(element.text_content().upper().split())
        titles.append(titlecase(title))
    return titles
def normalize_address_info(address_info): """Normalize various address components""" # Titlecase where appropriate for k, v in address_info.items(): if k == "state": continue address_info[k] = titlecase(v) # Normalize street abbreviations (St --> Street, etc.) fixes = OrderedDict( (("Street", "St."), ("Avenue", "Ave."), ("Boulevard", "Blvd."))) for address_part in ["address1", "address2"]: a = address_info.get(address_part) if not a: continue for bad, good in fixes.items(): a = re.sub(r"\b%s\b" % bad, good, a, flags=re.IGNORECASE) address_info[address_part] = a # Nuke any zip code that's longer than allowed in the DB (usually caused by # phone numbers) zip_code_field = AttorneyOrganization._meta.get_field("zip_code") if len(address_info.get("zip_code", "")) > zip_code_field.max_length: address_info["zip_code"] = "" return address_info
def _fetch_case_name(self, case_number): """Fetch case name for a given docket number + publication year pair. Some resources show 'Public Access Restricted' messages and do not provide parseable case name information. These will be skipped by our system by returning False below. The only other approach would be to parse the case name from the raw PDF text itself. """ # If case_number is not expected 12 characters, skip it, since # we can't know how to fix the courts typo. They likely forgot # to '0' pad the beginning or the end of the 'number' suffix, # but we can't know for sure. if len(case_number) != 12: return False # Site has non-chained, bad certificate, need to # ignore ssl verification for now for scraper to work self.request['verify'] = False url = 'https://appellate.kycourts.net/SC/SCDockets/CaseDetails.aspx?cn=%s' % case_number html = self._get_html_tree_by_url(url) # Halt if there is a (dismissible) error/warning on the page path_error_warning = '//div[contains(@class, "alert-dismissible")]' if html.xpath(path_error_warning): raise InsanityException('Invalid sub-resource url (%s). Is case number (%s) invalid?' % (url, case_number)) # Ensure that only two substrings are present path_party = '//td[@class="party"]/text()' parties = html.xpath(path_party) if len(parties) != 2: raise InsanityException('Unexpected party elements. Expected two substrings, got: %s' % ', '.join(parties)) return titlecase(' v. '.join(parties))
def _get_case_names(self): path = "//table[@id = 'ContentPlaceHolder1_PageContent_gvOpinions']//tr[position() > 1]/td/a[contains(@href, 'pdf')]/text()" case_names = [] for s in self.html.xpath(path): case_name = re.search('(.*)(\d{4} S\.?D\.? \d{1,4})', s, re.MULTILINE).group(1) case_names.append(titlecase(case_name.upper())) return case_names
def extract_cases_from_html(self, html):
    paths = '//p/strong | //p/b | //p/font/strong | //p/font/b'
    for date_element in html.xpath(paths):
        string = date_element.xpath('./text()')
        try:
            string = string[0]
            # handle examples where time but no date (ga_example_3.html)
            if ':' in string and ('AM' in string or 'PM' in string):
                continue
            # handle legacy example (ga_example.html)
            string = string.split('SUMMARIES')[0]
            date_string = re.sub(r'\W+', ' ', string)
            # handle legacy example (ga_example.html)
            if len(date_string.split()) != 3:
                continue
            case_date = convert_date_string(date_string)
        except:
            continue
        parent = date_element.xpath('./..')[0]
        # handle legacy example (ga_example.html)
        while parent.tag != 'p':
            parent = parent.xpath('./..')[0]
        for item in parent.getnext().xpath('./li'):
            text = item.text_content()
            if text:
                split = text.split('.', 1)
                self.cases.append({
                    'date': case_date,
                    'url': item.xpath('//a[1]/@href')[0],
                    'docket': split[0].rstrip('.'),
                    'name': titlecase(split[1]),
                })
def _return_case_names(self, html_tree):
    path = "%s/td[2]" % self.base_path
    cells = html_tree.xpath(path)
    names = [c.text_content().strip() for c in cells]
    # Return formatted text for non-empty cells
    return [titlecase(n.lower()) for n in names if n]
def _get_case_names(self): path = "{base}/following::ul[1]//li//a[1]/text()".format( base=self.base_path) return [ titlecase(self.regex.search(s).group(2).lower()) for s in self.html.xpath(path) ]
def fixer(simulate=False, verbose=False):
    """Remove leading parens by running the new and improved
    harmonize/clean_string scripts"""
    docs = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk
                                      and Citation.case_name like '(%%';''')
    for doc in docs:
        # Special cases
        if 'Klein' in doc.case_name:
            continue
        elif 'in re' in doc.case_name.lower():
            continue
        elif doc.case_name == "(White) v. Gray":
            doc.case_name = "White v. Gray"
            if not simulate:
                doc.save()

        # Otherwise, we nuke the leading parens.
        old_case_name = doc.case_name
        new_case_name = titlecase(harmonize(clean_string(
            re.sub('\(.*?\)', '', doc.case_name, 1))))
        if verbose:
            print "Fixing document %s: %s" % (doc.pk, doc)
            print "    New for %s: %s\n" % (doc.pk, new_case_name)

        if not simulate:
            doc.case_name = new_case_name
            doc.citation.save()
def extract_cases_from_html(self, html): paths = "//p/strong | //p/b | //p/font/strong | //p/font/b" for date_element in html.xpath(paths): string = date_element.xpath("./text()") try: string = string[0] # handle examples where time but no date (ga_example_3.html) if ":" in string and ("AM" in string or "PM" in string): continue # handle legacy example (ga_example.html) string = string.split("SUMMARIES")[0] date_string = re.sub(r"\W+", " ", string) # handle legacy example (ga_example.html) if len(date_string.split()) != 3: continue case_date = convert_date_string(date_string) except: continue parent = date_element.xpath("./..")[0] # handle legacy example (ga_example.html) while parent.tag != "p": parent = parent.xpath("./..")[0] for item in parent.getnext().xpath("./li"): text = item.text_content() if text: split = text.split(".", 1) self.cases.append({ "date": case_date, "url": item.xpath("//a[1]/@href")[0], "docket": split[0].rstrip("."), "name": titlecase(split[1]), })
def fixer(simulate=False, verbose=False):
    """Remove leading parens by running the new and improved
    harmonize/clean_string scripts"""
    docs = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk
                                      and Citation.case_name like '(%%';''')
    for doc in docs:
        # Special cases
        if 'Klein' in doc.case_name:
            continue
        elif 'in re' in doc.case_name.lower():
            continue
        elif doc.case_name == "(White) v. Gray":
            doc.case_name = "White v. Gray"
            if not simulate:
                doc.save()

        # Otherwise, we nuke the leading parens.
        old_case_name = doc.case_name
        new_case_name = titlecase(
            harmonize(clean_string(re.sub('\(.*?\)', '', doc.case_name, 1))))
        if verbose:
            print "Fixing document %s: %s" % (doc.pk, doc)
            print "    New for %s: %s\n" % (doc.pk, new_case_name)

        if not simulate:
            doc.case_name = new_case_name
            doc.citation.save()
def _get_lower_courts(self):
    lower_courts = []
    for s in self.html.xpath(
            '//li[@class = "casedetailsleft"]//li[@class="lowerCourt"]/text()'):
        try:
            lower_courts.append(titlecase(s.strip().split(' ', 2)[2]))
        except IndexError:
            lower_courts.append('')
    return lower_courts
def _get_dispositions(self):
    docket_numbers = []
    for e in self.html.xpath(
            '//tr/td[@class="DocumentBrowserCell"][8][../*[4]//text()]//nobr'):
        s = html.tostring(e, method='text', encoding='unicode')
        docket_numbers.append(titlecase(s))
    return docket_numbers
def _get_case_names(self):
    case_names = []
    for case_name in self.html.xpath(
            '//li[@class="title1" and not(ancestor::ul[@class="result-header"])]/a/text()'):
        case_name = titlecase(case_name)
        if 'People of Mi ' in case_name:
            case_name = case_name.replace('People of Mi ',
                                          'People of Michigan ')
        case_names.append(case_name)
    return case_names
def _get_case_names(self): path = "%s/a[contains(@href, 'pdf')]/text()" % self.base_path case_names = [] for s in self.html.xpath(path): case_name = self.extract_regex_group(1, s) if not case_name: case_name = self.extract_regex_group(2, s, True) case_names.append(titlecase(case_name.upper())) return case_names
def _get_lower_courts(self): lower_courts = [] for el in self.html.xpath("//ul[contains(@class, 'odd') or contains(@class, 'even')]"): try: s = el.xpath('//li[@class="casedetailsleft"]' '//li[@class="lowerCourt"]/text()') lower_courts.append(titlecase(clean_if_py3(s[0]).strip().split(' ', 2)[2])) except IndexError: lower_courts.append('') return lower_courts
def _get_case_names(self):
    case_names = []
    for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
        t = ' '.join(t.split())  # Normalize whitespace
        if t.strip():
            # If there is something other than whitespace...
            t = t.encode('utf8').split(' • ')[1].strip()
            t = titlecase(t.lower())
            case_names.append(t)
    return case_names
def _get_lower_courts(self):
    lower_courts = []
    for el in self.html.xpath('//ul[@class="odd" or @class="even"]'):
        try:
            s = el.xpath('//li[@class="casedetailsleft"]'
                         '//li[@class="lowerCourt"]/text()')
            lower_courts.append(titlecase(s[0].strip().split(' ', 2)[2]))
        except IndexError:
            lower_courts.append('')
    return lower_courts
def _get_lower_courts(self):
    lower_courts = []
    for el in self.html.xpath(
            "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
        try:
            s = el.xpath('//li[@class="casedetailsleft"]'
                         '//li[@class="lowerCourt"]/text()')
            lower_courts.append(
                titlecase(clean_if_py3(s[0]).strip().split(" ", 2)[2]))
        except IndexError:
            lower_courts.append("")
    return lower_courts
def fetcher(url): if self.method == 'LOCAL': return "No case names fetched during tests." else: html_tree = self._get_html_tree_by_url(url, self.request_dict) plaintiff = '' defendant = '' try: plaintiff = html_tree.xpath( "//text()[contains(., 'Style:')]/ancestor::div[@class='span2']/following-sibling::div/text()" )[0] defendant = html_tree.xpath( "//text()[contains(., 'v.:')]/ancestor::div[@class='span2']/following-sibling::div/text()" )[0] except IndexError: logger.warn("No title or defendant found for {}".format(url)) if defendant.strip(): # If there's a defendant return titlecase('%s v. %s' % (plaintiff, defendant)) else: return titlecase(plaintiff)
def _get_case_names(self): casenames = [] rowpath = "//table[2]//tr[position()>0]" cnpath = "./td[2]//text()[preceding-sibling::br]" urlpath = "./td[3]/a/@href" for row in self.html.xpath(rowpath): case_list = row.xpath(cnpath) for rough_case_name in case_list: case_name = titlecase(rough_case_name.lower()) # Determine the number of urls in each row and pad the case # name list sufficiently count = len(row.xpath(urlpath)) casenames.extend([case_name] * count) return casenames
def _get_case_names(self): case_names = [] for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'): t = ' '.join(clean_if_py3(t).split()) # Normalize whitespace if t.strip(): # If there is something other than whitespace... if not isinstance(t, six.string_types): t = str(t, encoding='utf-8') if u' • ' in t: t = t.split(u' • ')[1].strip() t = titlecase(t.lower()) case_names.append(t) return case_names
def _get_case_names(self):
    casenames = []
    rowpath = '//table[2]//tr[position()>0]'
    cnpath = './td[2]//text()[preceding-sibling::br]'
    urlpath = './td[3]/a/@href'
    for row in self.html.xpath(rowpath):
        case_list = row.xpath(cnpath)
        for rough_case_name in case_list:
            case_name = titlecase(rough_case_name.lower())
            # Determine the number of urls in each row and pad the case
            # name list sufficiently
            count = len(row.xpath(urlpath))
            casenames.extend([case_name] * count)
    return casenames
def _process_html(self): paths = "//p/strong | //p/b | //p/font/strong | //p/font/b" for date_element in self.html.xpath(paths): string = date_element.xpath("./text()") try: string = string[0] # handle examples where time but no date (ga_example_3.html) if ":" in string and ("AM" in string or "PM" in string): continue # handle legacy example (ga_example.html) string = string.split("SUMMARIES")[0] date_string = re.sub(r"\W+", " ", string) # handle legacy example (ga_example.html) if len(date_string.split()) != 3: continue except: continue parent = date_element.xpath("./..")[0] # handle legacy example (ga_example.html) while parent.tag != "p": parent = parent.xpath("./..")[0] for item in parent.getnext().xpath("./li"): text = item.text_content() if text: # Extract Docket numbers dockets = re.findall(self.regex_docket, text) if not dockets: raise InsanityException( "Could not find docket numbers in: 's'" % text) # Extract name substring; I am sure this could # be done with a more slick regex, but its not # my forte... name = text for docket in dockets: name = name.replace(docket, "") name = name.lstrip(" .,") self.cases.append({ "date": date_string, "docket": ", ".join(dockets), "name": titlecase(name.lstrip(" .,")), "url": item.xpath(".//a[1]/@href")[0], })
def _get_case_names(self): """ This example demonstrates how to extract text from an element that may contain other elements. For example, this will work well on something like: <strong>Nadim v. <em>Jenny</em></strong> Resulting in text like: Nadim v. Jenny Note that titlecase() should be used here in the case that the case names are provided in uppercase. Use the titlecase function on cases where the name is provided in uppercase only. """ case_names = [] for e in self.html.xpath('//path/to/an/element/p'): s = html.tostring(e, method='text', encoding='unicode') case_names.append(titlecase(s)) return case_names
def fetcher(e): """This reaches out to a secondary system and scrapes the correct info. """ if self.method == 'LOCAL': return "No case names fetched during tests." else: url = 'http://162.114.92.78/dockets/SearchCaseDetail.asp' anchor_text = html.tostring(e, method='text', encoding='unicode') m = self.docket_number_regex.search(anchor_text) r = requests.post( url, headers={'User-Agent': 'Juriscraper'}, data={ 'txtyear': m.group('year'), 'txtcasenumber': m.group('docket_num').strip('0'), 'cmdnamesearh': 'Search', }, ) # Throw an error if a bad status code is returned. r.raise_for_status() # If the encoding is iso-8859-1, switch it to cp1252 (a superset) if r.encoding == 'ISO-8859-1': r.encoding = 'cp1252' # Grab the content text = self._clean_text(r.text) html_tree = html.fromstring(text) # And finally, we parse out the good stuff. parties_path = "//tr[descendant::text()[contains(., 'Appell')]]//td[3]//text()" case_name_parts = [] for s in html_tree.xpath(parties_path): if s.strip(): case_name_parts.append(titlecase(s.strip().lower())) if len(case_name_parts) == 2: break return ' v. '.join(case_name_parts)
def _add_case_names(self, html_tree):
    case_names = []
    for element in self._get_links_from_html(html_tree):
        text = ' '.join(list(element.xpath(".//text()")))
        text = ' '.join(text.split())
        if text:
            try:
                case_name = re.search(
                    r'(\d+ ?-\w{1,2} ?- ?\d+)(?!.*\d+ ?-\w{1,2} ?- ?\d+)\s*(.*)',
                    text).group(2)
                if not case_name:
                    # Fall back on the parent paragraph's text. xpath()
                    # returns a list, so join it before splitting.
                    name_element = element.xpath("./parent::p[1]/text()")
                    text = ' '.join(' '.join(name_element).split())
                    case_name = re.search(r'(\w+.+)+', text).group(1)
            except AttributeError:
                case_name = re.search(r'(\w+.+)+', text).group(1)
            case_names.append(titlecase(case_name))
    return case_names
def _add_case_names(html_tree):
    case_names = []
    for element in html_tree.xpath(
            "//p[contains(., 'v.') or contains(., 'IN RE') or "
            "contains(., 'IN THE') or contains(., 'vs.')]//a"):
        text = ' '.join(list(element.xpath(".//text()")))
        text = ' '.join(text.split())
        if text:
            try:
                case_name = re.search(
                    r'(\d+ ?-\w{1,2} ?- ?\d+)(?!.*\d+ ?-\w{1,2} ?- ?\d+)\s*(.*)',
                    text).group(2)
                if not case_name:
                    # Fall back on the parent paragraph's text. xpath()
                    # returns a list, so join it before splitting.
                    name_element = element.xpath("./parent::p[1]/text()")
                    text = ' '.join(' '.join(name_element).split())
                    case_name = re.search(r'(\w+.+)+', text).group(1)
            except AttributeError:
                case_name = re.search(r'(\w+.+)+', text).group(1)
            case_names.append(titlecase(case_name))
    return case_names
def get_case_name(complete_html_tree, case_path): path = "//head/title/text()" # Text looks like: 'In re 221A Holding Corp., Inc, 1 BR 506 - Dist. Court, ED Pennsylvania 1979' s = complete_html_tree.xpath(path)[0].rsplit("-", 1)[0].rsplit(",", 1)[0] # returns 'In re 221A Holding Corp., Inc.' case_name = harmonize(clean_string(titlecase(s))) if not s: try: case_name = fixes[case_path]["case_name"] except KeyError: if "input_case_names" in DEBUG: if "firefox" in DEBUG: subprocess.Popen(["firefox", "file://%s" % case_path], shell=False).communicate() input_case_name = raw_input(" No case name found. What should be here? ") input_case_name = unicode(input_case_name) add_fix(case_path, {"case_name": input_case_name}) case_name = input_case_name if "case_name" in DEBUG: log_print(" Case name: %s" % case_name) return case_name
def _extract_cases_from_html(self, html):
    """Build list of data dictionaries, one dictionary per case (table row)."""
    self.cases = []
    for row in html.xpath('//table/tbody/tr'):
        date, docket, url, name, status = False, False, False, False, False
        date = convert_date_string(row.xpath('td[1]/span/text()')[0])
        docket = row.xpath('td[2]/text()')[0].strip()
        url_raw = row.xpath('td[4]/a/@href')
        if url_raw:
            url = url_raw[0]
        name_raw = row.xpath('td[4]/a/text()')
        if name_raw:
            name = titlecase(name_raw[0].split('[')[0].strip())
        status_raw = row.xpath('td[5]/text()')
        if status_raw:
            status_raw = status_raw[0].strip().lower()
            if 'nonprecedential' in status_raw:
                status = 'Unpublished'
            elif 'precedential' in status_raw:
                status = 'Published'
            else:
                status = 'Unknown'
        if date and docket and url and name and status:
            self.cases.append({
                'date': date,
                'docket': docket,
                'url': url,
                'name': name,
                'status': status,
            })
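# Hedged illustration (not from the original source) of the table-row shape
# _extract_cases_from_html expects; the markup and values are invented to
# match the xpath expressions above.
from lxml import html as lxml_html

sample = lxml_html.fromstring(
    '<table><tbody><tr>'
    '<td><span>Jan. 2, 2020</span></td>'
    '<td> 2020-123 </td>'
    '<td></td>'
    '<td><a href="/opinions/2020-123.pdf">SMITH v. JONES [ERRATA]</a></td>'
    '<td> Precedential </td>'
    '</tr></tbody></table>')
row = sample.xpath('//table/tbody/tr')[0]
# td[2] holds the docket; td[4]/a holds the url and name; td[5] the status.
assert row.xpath('td[2]/text()')[0].strip() == '2020-123'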
def _get_case_names(self):
    path = '{base}/td[1]/a/text()'.format(base=self.base)
    return [titlecase(text) for text in self.html.xpath(path)]
def _get_dispositions(self): path = "{base}/td[4]/text()".format(base=self.base_path) return [titlecase(e) for e in self.html.xpath(path)]
def _get_case_names(self): path = "{base}/td[2]/text()".format(base=self.base_path) return [titlecase(e) for e in self.html.xpath(path)]
def _get_judges(self):
    path = "//*[contains(concat(' ', @id, ' '), ' case_panel')]/text()"
    return [titlecase(s.lower()) for s in self.html.xpath(path)]
def test_titlecase(self):
    """Tests various inputs for the titlecase function"""
    test_pairs = [
        ["Q&A with steve jobs: 'that's what happens in technology'",
         u"Q&A With Steve Jobs: 'That's What Happens in Technology'"],
        ["What is AT&T's problem?",
         u"What is AT&T's Problem?"],
        ['Apple deal with AT&T falls through',
         u'Apple Deal With AT&T Falls Through'],
        ['this v that',
         u'This v That'],
        ['this v. that',
         u'This v. That'],
        ['this vs that',
         u'This vs That'],
        ['this vs. that',
         u'This vs. That'],
        ["The SEC's Apple Probe: What You Need to Know",
         u"The SEC's Apple Probe: What You Need to Know"],
        ["'by the Way, small word at the start but within quotes.'",
         u"'By the Way, Small Word at the Start but Within Quotes.'"],
        ['Small word at end is nothing to be afraid of',
         u'Small Word at End is Nothing to Be Afraid Of'],
        ['Starting Sub-Phrase With a Small Word: a Trick, Perhaps?',
         u'Starting Sub-Phrase With a Small Word: A Trick, Perhaps?'],
        ["Sub-Phrase With a Small Word in Quotes: 'a Trick, Perhaps?'",
         u"Sub-Phrase With a Small Word in Quotes: 'A Trick, Perhaps?'"],
        ['Sub-Phrase With a Small Word in Quotes: "a Trick, Perhaps?"',
         u'Sub-Phrase With a Small Word in Quotes: "A Trick, Perhaps?"'],
        ['"Nothing to Be Afraid of?"',
         u'"Nothing to Be Afraid Of?"'],
        ['"Nothing to be Afraid Of?"',
         u'"Nothing to Be Afraid Of?"'],
        ['a thing',
         u'A Thing'],
        ["2lmc Spool: 'gruber on OmniFocus and vapo(u)rware'",
         u"2lmc Spool: 'Gruber on OmniFocus and Vapo(u)rware'"],
        ['this is just an example.com',
         u'This is Just an example.com'],
        ['this is something listed on del.icio.us',
         u'This is Something Listed on del.icio.us'],
        ['iTunes should be unmolested',
         u'iTunes Should Be Unmolested'],
        # Tests unicode
        ['Reading between the lines of steve jobs’s ‘thoughts on music’',
         u'Reading Between the Lines of Steve Jobs’s ‘thoughts on Music’'],
        # Tests unicode
        ['seriously, ‘repair permissions’ is voodoo',
         u'Seriously, ‘repair Permissions’ is Voodoo'],
        ['generalissimo francisco franco: still dead; kieren McCarthy: '
         'still a jackass',
         u'Generalissimo Francisco Franco: Still Dead; Kieren McCarthy:'
         u' Still a Jackass'],
        ['Chapman v. u.s. Postal Service',
         u'Chapman v. U.S. Postal Service'],
        ['Spread Spectrum Screening Llc. v. Eastman Kodak Co.',
         u'Spread Spectrum Screening LLC. v. Eastman Kodak Co.'],
        ['Consolidated Edison Co. of New York, Inc. v. Entergy Nuclear '
         'Indian Point 2, Llc.',
         u'Consolidated Edison Co. of New York, Inc. v. Entergy Nuclear'
         u' Indian Point 2, LLC.'],
        ['Infosint s.a. v. H. Lundbeck A/s',
         u'Infosint S.A. v. H. Lundbeck A/S'],
        ["KEVIN O'CONNELL v. KELLY HARRINGTON",
         u"Kevin O'Connell v. Kelly Harrington"],
        ['International Union of Painter v. J&r Flooring, Inc',
         u'International Union of Painter v. J&R Flooring, Inc'],
        ['DOROTHY L. BIERY, and JERRAMY and ERIN PANKRATZ v. THE UNITED'
         ' STATES 07-693L And',
         u'Dorothy L. Biery, and Jerramy and Erin Pankratz v. the '
         u'United States 07-693l And'],
        ['CARVER v. US',
         u'Carver v. US'],
    ]
    for pair in test_pairs:
        self.assertEqual(titlecase(force_unicode(pair[0])), pair[1])
def _get_case_names(self):
    return [titlecase(t) for t in self.html.xpath('//table/td[2]/text()')]
def format_case_name(n):
    """Applies standard harmonization methods after normalizing with
    lowercase."""
    return titlecase(harmonize(n.lower()))
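# A doctest-style sketch of format_case_name, not part of the original
# source; it assumes harmonize() normalizes 'United States of America' to
# 'United States'. The input is hypothetical.
#
#   >>> format_case_name('UNITED STATES OF AMERICA v. LISSNER')
#   u'United States v. Lissner'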