Esempio n. 1
0
 def _get_case_dates(self):
     """Return a date for each results-table row (rows 3 and later).

     One known-bad value published by the court ('2011-09-00') is
     mapped to a hard-coded correction before parsing.
     """
     path = '//table[@id = "searchResults"]/tr[position() >= 3]/td[1]/text()'
     case_dates = []
     for raw in self.html.xpath(path):
         if clean_string(raw) == '2011-09-00':
             raw = '2011-09-02'
         parsed = time.strptime(clean_string(raw), '%Y-%m-%d')
         case_dates.append(date.fromtimestamp(time.mktime(parsed)))
     return case_dates
Esempio n. 2
0
 def _get_case_dates(self):
     """Return a datetime.date for each results-table row (rows 3+).

     One known-bad value on the source site ('2011-09-00') is replaced
     with a hard-coded correction before parsing.
     """
     dates = []
     for date_string in self.html.xpath('//table[@id = "searchResults"]/tr[position() >= 3]/td[1]/text()'):
         # Work around a single malformed date published by the court.
         if clean_string(date_string) == '2011-09-00':
             date_string = '2011-09-02'
         dates.append(date.fromtimestamp(time.mktime(time.strptime(clean_string(date_string), '%Y-%m-%d'))))
     return dates
Esempio n. 3
0
 def _get_case_dates(self):
     """Return a datetime.date for the fifth cell of each linked row.

     A placeholder date ('00-00-0000') seen on one backscrape page is
     replaced with its known correct value.
     """
     dates = []
     for date_string in self.html.xpath('//tr[./td[1]/a//text()]/td[5]//text()'):
         s = clean_string(date_string)
         if s == '00-00-0000' and 'begin=21160' in self.url:
             # Bad data found during backscrape.
             s = '12-13-2006'
         dates.append(datetime.strptime(clean_string(s), '%m-%d-%Y').date())
     return dates
Esempio n. 4
0
 def _get_case_dates(self):
     """Return a date for the fifth cell of every linked table row."""
     path = "//tr[./td[1]/a//text()]/td[5]//text()"
     results = []
     for raw in self.html.xpath(path):
         cleaned = clean_string(raw)
         # One known-bad backscrape record carries a placeholder date.
         if cleaned == "00-00-0000" and "begin=21160" in self.url:
             cleaned = "12-13-2006"
         parsed = datetime.strptime(clean_string(cleaned), "%m-%d-%Y")
         results.append(parsed.date())
     return results
Esempio n. 5
0
 def _get_case_dates(self):
     """Return the converted date (regex group 3) of each matched row."""
     return [
         convert_date_string(
             self.grouping_regex.search(clean_string(row)).group(3))
         for row in self.html.xpath(self.base_path)
     ]
def get_case_name(complete_html_tree, case_path):
    """Extract a case name from the page <title> text.

    Falls back to the manual `fixes` table, then (in debug mode) to
    interactive operator input, when the title text is empty.

    :param complete_html_tree: parsed tree of the whole page
    :param case_path: path of the case file, used as key into `fixes`
    :return: the harmonized case name (possibly from `fixes` or input)
    """
    path = '//head/title/text()'
    # Text looks like: 'In re 221A Holding Corp., Inc, 1 BR 506 - Dist.
    # Court, ED Pennsylvania 1979'
    s = complete_html_tree.xpath(path)[0].rsplit('-', 1)[0].rsplit(',', 1)[0]
    # returns 'In re 221A Holding Corp., Inc.'
    case_name = harmonize(clean_string(titlecase(s)))
    if not s:
        # No usable title text: look up a stored fix, or in debug mode
        # ask the operator (optionally opening the page in a browser).
        try:
            case_name = fixes[case_path]['case_name']
        except KeyError:
            if 'input_case_names' in DEBUG:
                if 'firefox' in DEBUG:
                    subprocess.Popen(
                        ['firefox', 'file://%s' % case_path],
                        shell=False).communicate()
                input_case_name = raw_input(
                    '  No case name found. What should be here? ')
                input_case_name = unicode(input_case_name)
                add_fix(case_path, {'case_name': input_case_name})
                case_name = input_case_name

    if 'case_name' in DEBUG:
        log_print("  Case name: %s" % case_name)
    return case_name
Esempio n. 7
0
 def _get_case_dates(self):
     """Return the converted date (regex group 3) from each matched row."""
     dates = []
     for s in self.html.xpath(self.base_path):
         s = clean_string(s)
         # Group 3 of the grouping regex holds the date substring.
         date_string = self.grouping_regex.search(s).group(3)
         dates.append(convert_date_string(date_string))
     return dates
Esempio n. 8
0
    def _extract_case_data_from_html(self, html):
        """Append one case dict to self.cases per item in `html`.

        The cleaned item title normally matches self.regex as
        "(name) (docket)"; when it does not, the whole cleaned title is
        used as the name and the docket is derived from the item URL.
        """
        for item in html.xpath(self.base):
            creator = item.xpath('./creator')[0].text_content()
            pubdate = item.xpath('./pubdate')[0].text_content()
            pubdate_sanitized = self.sanitize_text(pubdate)
            title = item.xpath('./title')[0].text_content()
            title_sanitized = self.sanitize_text(title)
            title_clean = clean_string(title_sanitized)
            search = self.regex.search(title_clean)
            url = item.xpath('.//@href')[0]

            if search:
                name = search.group(1)
                docket = search.group(2)
            else:
                # Title didn't match the expected pattern; fall back.
                name = title_clean
                docket = self._extract_docket_from_url(url)

            self.cases.append({
                'name': name,
                'date': convert_date_string(pubdate_sanitized),
                'docket': docket,
                'judge': self.sanitize_text(creator),
                'url': url,
            })
Esempio n. 9
0
    def _process_html(self):
        """Populate self.cases from each list item that links to a PDF.

        Item text is expected to be "<date> - <rest>", where <rest> may
        or may not begin with docket number(s).
        """
        for item in self.html.xpath('//li[contains(.//a/@href, ".pdf")]'):
            text = clean_string(item.text_content())
            text_parts = text.split('-', 1)

            if len(text_parts) != 2:
                raise InsanityException('Unexpected text format: "%s"' % text)

            # sometimes the records include a docket number(s) as the
            # first words in the second half of the hyphenated string,
            # but some don't include a docket at all.  So we test to see
            # if the first word is numeric (minus the slash characters
            # used to conjoin multiple docket numbers).
            docket_name = text_parts[1].split(None, 1)
            first_word = docket_name[0].replace('/', '')
            if first_word.isnumeric():
                docket = docket_name[0]
                name = docket_name[1]
            else:
                # No docket present; the whole remainder is the name.
                docket = ''
                name = text_parts[1]

            self.cases.append({
                'date': text_parts[0],
                'docket': docket,
                'name': name,
                'url': item.xpath('.//a/@href')[0],
            })
def fixer(simulate=False, verbose=False):
    """Remove leading parentheses by running the new and improved
    harmonize/clean_string scripts.

    :param simulate: when True, make no database writes
    :param verbose: when True, print each change being made
    """
    docs = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk and
                                    Citation.case_name like '(%%';''')

    for doc in docs:
        # Special cases
        if 'Klein' in doc.case_name:
            continue
        elif 'in re' in doc.case_name.lower():
            continue
        elif doc.case_name == "(White) v. Gray":
            # NOTE(review): no `continue` here, so this doc also falls
            # through to the generic paren-stripping below — confirm
            # that is intended.
            doc.case_name = "White v. Gray"
            if not simulate:
                doc.save()


        # Otherwise, we nuke the leading parens.
        old_case_name = doc.case_name
        new_case_name = titlecase(harmonize(clean_string(re.sub('\(.*?\)', '', doc.case_name, 1))))

        if verbose:
            print "Fixing document %s: %s" % (doc.pk, doc)
            print "        New for %s: %s\n" % (doc.pk, new_case_name)

        if not simulate:
            # NOTE(review): assigns doc.case_name but saves doc.citation
            # — verify the new name is actually persisted (compare the
            # sibling fixer that sets doc.citation.case_name).
            doc.case_name = new_case_name
            doc.citation.save()
Esempio n. 11
0
    def _process_html(self):
        """Populate self.cases from each list item that links to a PDF.

        Item text is "<month> <day>, <year> - <rest>"; <rest> may begin
        with slash-joined docket number(s), or contain only a name.
        """
        for item in self.html.xpath('//li[contains(.//a/@href, ".pdf")]'):
            text = clean_string(item.text_content())
            # The first three whitespace-separated tokens form the date.
            date_string = " ".join(text.split()[0:3])
            try:
                convert_date_string(date_string)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit are no longer swallowed.
                raise InsanityException('Unexpected text format: "%s"' % text)
            docket_name = text.replace(date_string, "").strip().lstrip("-")

            # sometimes the records include a docket number(s) as the
            # first words in the second half of the hyphenated string,
            # but some don't include a docket at all.  So we test to see
            # if the first word is numeric (minus the slash characters
            # used to conjoin multiple docket numbers).
            parts = docket_name.split(None, 1)
            # Bug fixes vs. the original: examine the whole first WORD
            # (the original checked only its first character), and don't
            # crash when the text after the date is a single word.
            if len(parts) == 2 and parts[0].replace("/", "").isnumeric():
                docket, name = parts
            else:
                docket = ""
                name = docket_name

            self.cases.append(
                {
                    "date": date_string,
                    "docket": docket,
                    "name": name,
                    "url": item.xpath(".//a/@href")[0],
                }
            )
Esempio n. 12
0
def fixer(simulate=False, verbose=False):
    """Remove leading parentheses by running the new and improved
    harmonize/clean_string scripts.

    :param simulate: when True, make no database writes
    :param verbose: when True, print each change being made
    """
    docs = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk and
                                    Citation.case_name like '(%%';''')

    for doc in docs:
        # Special cases
        if 'Klein' in doc.case_name:
            continue
        elif 'in re' in doc.case_name.lower():
            continue
        elif doc.case_name == "(White) v. Gray":
            # NOTE(review): no `continue`, so this doc also falls
            # through to the generic fix below — confirm intended.
            doc.case_name = "White v. Gray"
            if not simulate:
                doc.save()

        # Otherwise, we nuke the leading parens.
        old_case_name = doc.case_name
        new_case_name = titlecase(
            harmonize(clean_string(re.sub('\(.*?\)', '', doc.case_name, 1))))

        if verbose:
            print "Fixing document %s: %s" % (doc.pk, doc)
            print "        New for %s: %s\n" % (doc.pk, new_case_name)

        if not simulate:
            # NOTE(review): assigns doc.case_name but saves doc.citation
            # — verify the new name is actually persisted.
            doc.case_name = new_case_name
            doc.citation.save()
Esempio n. 13
0
    def _extract_case_data_from_html(self, html):
        """Append one case dict to self.cases per item in `html`.

        The cleaned item title normally matches self.regex as
        "(name) (docket)"; when it does not, the whole cleaned title is
        used as the name and the docket is derived from the item URL.
        """
        for item in html.xpath(self.base):
            creator = item.xpath("./creator")[0].text_content()
            pubdate = item.xpath("./pubdate")[0].text_content()
            pubdate_sanitized = self.sanitize_text(pubdate)
            title = item.xpath("./title")[0].text_content()
            title_sanitized = self.sanitize_text(title)
            title_clean = clean_string(title_sanitized)
            search = self.regex.search(title_clean)
            url = item.xpath(".//@href")[0]

            if search:
                name = search.group(1)
                docket = search.group(2)
            else:
                # Title didn't match the expected pattern; fall back.
                name = title_clean
                docket = self._extract_docket_from_url(url)

            self.cases.append(
                {
                    "name": name,
                    "date": convert_date_string(pubdate_sanitized),
                    "docket": docket,
                    "judge": self.sanitize_text(creator),
                    "url": url,
                }
            )
Esempio n. 14
0
 def _get_case_names(self):
     """Collect the cleaned text of every list entry containing a dash."""
     path = '//*[@id="AutoNumber1"]/tr[2]/td/table/tr/td//ul//text()'
     return [clean_string(text)
             for text in self.html.xpath(path)
             if '-' in text]
Esempio n. 15
0
 def _get_case_names(self):
     """Return the cleaned text of every list entry containing a dash."""
     case_names = []
     path = '//*[@id="AutoNumber1"]/tr[2]/td/table/tr/td//ul//text()'
     for s in self.html.xpath(path):
         # Entries without a dash are headers/noise, not case names.
         if '-' in s:
             case_names.append(clean_string(s))
     return case_names
Esempio n. 16
0
    def _get_summaries(self):
        """Collect per-case summary HTML from the opinion paragraphs.

        Title paragraphs (those with ./a/b text) delimit summary items;
        font[@size="2"] paragraphs between titles are accumulated into
        the current summary as <p> elements.
        """
        path_to_all_paras = '//div[@id="opinion"]/p'
        summaries = []
        summary_parts = ''
        for elem in self.html.xpath(path_to_all_paras):
            # Check if it's a title paragraph
            if elem.xpath('./a/b//text()'):
                # If so, append previous values and start a new summary item.
                if summary_parts:
                    summaries.append(summary_parts)
                summary_parts = ''
                continue
            # Check if it has a descendant with font[@size="2"].
            elif elem.xpath('./font[@size="2"]'):
                # If so, then it's a summary paragraph.
                summary_parts += '<p>%s</p>\n' % clean_string(html.tostring(elem, method='text', encoding='unicode'))
            else:
                # Something else...
                continue

        # Append the tailing summary
        if summary_parts:
            # On days with no content, this winds up blank and shouldn't be appended.
            summaries.append(summary_parts)
        return summaries
Esempio n. 17
0
def get_judge(html, case_path=None):
    """Find the judge in the first paragraphs of an opinion page.

    Scans text of up to the first 60 <p> elements and returns the first
    string accepted by get_judge_from_str().  Falls back to the manual
    `fixes` table, then (in debug mode) to interactive operator input.

    :param html: parsed tree of the opinion page
    :param case_path: file path, used as key into `fixes`
    :return: the judge string, or None when nothing was identified
    """
    path = "//p[position() <= 60]//text()[not(parent::span)][not(ancestor::center)][not(ancestor::i)]"
    text_elements = html.xpath(path)

    # Get the first paragraph that starts with two uppercase letters after we've stripped out any star pagination.
    judge = None
    for t in text_elements:
        t = clean_string(t)
        judge, reason = get_judge_from_str(t)
        if judge:
            break
        if reason == "TOO_LONG":
            # We've begun doing paragraphs...
            break

    if not judge:
        try:
            judge = fixes[case_path]["judge"]
        except KeyError:
            if "input_judge" in DEBUG:
                subprocess.Popen(["firefox", "file://%s" % case_path], shell=False).communicate()
                judge = raw_input("No judge identified! What should be here? ")
                add_fix(case_path, {"judge": judge})
            if "log_bad_judges" in DEBUG:
                with open("missing_judges.txt", "a") as out:
                    out.write("%s\n" % case_path)

    if "judge" in DEBUG:
        log_print("  Judge: %s" % judge)

    return judge
Esempio n. 18
0
    def _extract_case_data_from_html(self, html):
        """Append one case dict to self.cases per item in `html`.

        The cleaned item title normally matches self.regex as
        "(name) (docket)"; otherwise the whole cleaned title is used as
        the name and the docket is derived from the item URL.
        """
        for item in html.xpath(self.base):
            creator = item.xpath('./creator')[0].text_content()
            pubdate = item.xpath('./pubdate')[0].text_content()
            pubdate_sanitized = self.sanitize_text(pubdate)
            title = item.xpath('./title')[0].text_content()
            title_sanitized = self.sanitize_text(title)
            title_clean = clean_string(title_sanitized)
            search = self.regex.search(title_clean)
            url = item.xpath('.//@href')[0]

            if search:
                name = search.group(1)
                docket = search.group(2)
            else:
                # Title didn't match the expected pattern; fall back.
                name = title_clean
                docket = self._extract_docket_from_url(url)

            self.cases.append({
                'name': name,
                'date': convert_date_string(pubdate_sanitized),
                'docket': docket,
                'judge': self.sanitize_text(creator),
                'url': url,
            })
Esempio n. 19
0
def get_judge(html, case_path=None):
    """Find the judge in the first paragraphs of an opinion page.

    Scans text of up to the first 60 <p> elements and returns the first
    string accepted by get_judge_from_str().  Falls back to the manual
    `fixes` table, then (in debug mode) to interactive operator input.

    :param html: parsed tree of the opinion page
    :param case_path: file path, used as key into `fixes`
    :return: the judge string, or None when nothing was identified
    """
    path = '//p[position() <= 60]//text()[not(parent::span)][not(ancestor::center)][not(ancestor::i)]'
    text_elements = html.xpath(path)

    # Get the first paragraph that starts with two uppercase letters after we've stripped out any star pagination.
    judge = None
    for t in text_elements:
        t = clean_string(t)
        judge, reason = get_judge_from_str(t)
        if judge:
            break
        if reason == 'TOO_LONG':
            # We've begun doing paragraphs...
            break

    if not judge:
        try:
            judge = fixes[case_path]['judge']
        except KeyError:
            if 'input_judge' in DEBUG:
                subprocess.Popen(
                    ['firefox', 'file://%s' % case_path],
                    shell=False).communicate()
                judge = raw_input("No judge identified! What should be here? ")
                add_fix(case_path, {'judge': judge})
            if 'log_bad_judges' in DEBUG:
                with open('missing_judges.txt', 'a') as out:
                    out.write('%s\n' % case_path)

    if 'judge' in DEBUG:
        log_print('  Judge: %s' % judge)

    return judge
Esempio n. 20
0
 def _get_case_names(self):
     """Extract each case name (regex group 1); ';' becomes ' / '."""
     results = []
     for row in self.html.xpath(self.base_path):
         cleaned = clean_string(row)
         raw_name = self.grouping_regex.search(cleaned).group(1)
         results.append(raw_name.replace(';', ' / '))
     return results
Esempio n. 21
0
 def _get_case_names(self):
     """Return each case name (regex group 1), with ';' → ' / '."""
     names = []
     for s in self.html.xpath(self.base_path):
         s = clean_string(s)
         name_raw = self.grouping_regex.search(s).group(1)
         # Multiple names are joined with ';' at the source.
         name = name_raw.replace(';', ' / ')
         names.append(name)
     return names
Esempio n. 22
0
 def _get_case_dates(self):
     """Parse the first-column cell of each row as an %m/%d/%Y date."""
     results = []
     for cell in self.html.xpath('//table/tbody/tr/td[1]/text()'):
         parsed = time.strptime(clean_string(cell), '%m/%d/%Y')
         results.append(date.fromtimestamp(time.mktime(parsed)))
     return results
Esempio n. 23
0
 def _get_docket_numbers(self):
     """Extract dockets (regex group 2), normalizing the separators."""
     results = []
     court = self.court_identifier
     for row in self.html.xpath(self.base_path):
         raw = self.grouping_regex.search(clean_string(row)).group(2)
         # ';' becomes ',', then a stray comma directly after the court
         # identifier is dropped.
         results.append(raw.replace(';', ',').replace(court + ',', court))
     return results
Esempio n. 24
0
 def _get_docket_numbers(self):
     """Return dockets (regex group 2) with normalized separators."""
     dockets = []
     for s in self.html.xpath(self.base_path):
         s = clean_string(s)
         docket_raw = self.grouping_regex.search(s).group(2)
         # ';' becomes ',', then a stray comma after the court id is
         # collapsed away.
         docket = docket_raw.replace(';', ',').replace(
             self.court_identifier + ',', self.court_identifier)
         dockets.append(docket)
     return dockets
Esempio n. 25
0
 def _get_case_dates(self):
     """Parse the '%b %d, %Y' strings in the first table column."""
     parsed_dates = []
     for cell in self.html.xpath("//form/table[1]/tr//td[1]/text()"):
         struct = time.strptime(clean_string(cell), "%b %d, %Y")
         parsed_dates.append(date.fromtimestamp(time.mktime(struct)))
     return parsed_dates
Esempio n. 26
0
    def _get_anchor_docket_name_pairs(self):
        """The court has some ugly HTML practices that we need to handle.

        Most anchor links include single line strings with a single docket
        number and a single case name.  However, there are two other formats
        we've seen and must work around.

        (CASE 1)
        The anchor has multiple lines broken with <br> tag(s), and each
        line contains "<docket> <name>". In this case we need to combine
        the docket numbers and name strings respectively.
        [EXAMPLE: February 18, 2016 in nh_example_2.html]

        (CASE 2)
        The anchor has multiple lines broken with <br> tag(s), and the
        second line is a continuation of a long case name started on the first
        line.  So, the second line does not lead with a docket number, thus
        this line's string should be glued onto the <name> substring extracted
        from the previous line.
        [EXAMPLE: September 18, 2018 in nh_example_6.html]

        :return: list of dicts, each with "docket" and "name" strings
        """
        pairs = []
        for anchor in self.html.xpath(self.link_path):
            # i counts successfully parsed "<docket> <name>" substrings,
            # so i - 1 indexes the substring a continuation glues onto.
            i = 0
            dockets = []
            name_substrings = []
            text_anchor = anchor.text_content()
            text_clean = text_anchor.replace("\n", "")

            for text in text_clean.split(";"):
                text = clean_string(text)
                match = self.link_text_regex.search(text)
                try:
                    docket = match.group(1)
                    dockets.append(docket)
                    name = match.group(2)
                    name = " ".join(name.split())
                    name_substrings.append(name)
                    i += 1
                except AttributeError:
                    if i == 0:
                        # docket and name (root) should be contained in first substring
                        error = "Invalid anchor root string format: %s" % text
                        raise InsanityException(error)
                    # no docket in the substring, it's a trailing name
                    # substring that they broke over multiple lines, so
                    # glue it to the previous name substring
                    name_substrings[i - 1] += " %s" % text
            pairs.append({
                "docket": ", ".join(dockets),
                "name": " and ".join(name_substrings),
            })
        return pairs
def fixer(simulate=False, verbose=False):
    """Remove leading slashes by running the new and improved
    harmonize/clean_string scripts.

    :param simulate: when True, make no database writes
    :param verbose: when True, print each document being fixed
    """
    docs = Document.objects.raw(r"""select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk and
                                    Citation.case_name like '/%%';""")
    for doc in docs:
        if verbose:
            print "Fixing document %s: %s" % (doc.pk, doc)

        if not simulate:
            # NOTE(review): assigns doc.case_name but saves doc.citation
            # — compare the sibling fixer that sets doc.citation.case_name;
            # verify the change is actually persisted.
            doc.case_name = harmonize(clean_string(doc.case_name))
            doc.citation.save()
def fixer(simulate=False, verbose=False):
    """Remove leading slashes by running the new and improved
    harmonize/clean_string scripts.

    :param simulate: when True, make no database writes
    :param verbose: when True, print each document being fixed
    """
    docs = Document.objects.raw(r'''select Document.pk
                                    from Document, Citation
                                    where Document.citation_id = Citation.pk and
                                    Citation.case_name like '/%%';''')
    for doc in docs:
        if verbose:
            print "Fixing document %s: %s" % (doc.pk, doc)

        if not simulate:
            # Clean the citation's name in place and persist it.
            doc.citation.case_name = harmonize(clean_string(doc.citation.case_name))
            doc.citation.save()
Esempio n. 29
0
 def _get_case_dates(self):
     """Parse a '%B %d, %Y' or '%B %d %Y' date out of each matched row.

     Stops at the first format that matches, and fails loudly when none
     do — the original left `case_date` unbound on the first row and
     stale (silently reused) on later rows.
     """
     case_dates = []
     date_formats = ['%B %d, %Y', '%B %d %Y']
     for e in self.html.xpath(self.base_path):
         s = html.tostring(e, method='text', encoding='unicode')
         s = re.search('(.*[0-9]{4})', s).group(1)
         cleaned = clean_string(s)
         for format in date_formats:
             try:
                 case_date = date.fromtimestamp(
                     time.mktime(time.strptime(cleaned, format)))
                 break
             except ValueError:
                 continue
         else:
             # No format matched: raise instead of appending bad data.
             raise ValueError('Unparseable date string: "%s"' % cleaned)
         case_dates.append(case_date)
     return case_dates
Esempio n. 30
0
 def _get_case_dates(self):
     """Parse a '%B %d, %Y' or '%B %d %Y' date out of each list item.

     NOTE(review): if neither format matches, `case_date` is unbound on
     the first row (NameError) or stale from a previous row — confirm
     the source data always parses.
     """
     case_dates = []
     for e in self.html.xpath('//div[@id = "block-system-main"]//div[contains(concat(" ", @class, " "), " field-items ")]//li'):
         s = html.tostring(e, method='text', encoding='unicode')
         # Keep everything up to (and including) the 4-digit year.
         s = re.search('(.*[0-9]{4})', s).group(1)
         date_formats = ['%B %d, %Y',
                         '%B %d %Y']
         for format in date_formats:
             try:
                 case_date = date.fromtimestamp(time.mktime(time.strptime(clean_string(s), format)))
             except ValueError:
                 continue
         case_dates.append(case_date)
     return case_dates
def parse_dates(raw_dates):
    """Parses the dates from a list of strings.

    Returns a list of lists of (string, datetime.date) tuples: for each
    input string, every date found in it, paired with the cleaned text
    that preceded the date (or None when there was none).

    :param raw_dates: A list of (probably) date-containing strings
    """
    months = re.compile("january|february|march|april|may|june|july|august|"
                        "september|october|november|december")
    dates = []
    for raw_date in raw_dates:
        # there can be multiple years in a string, so we split on possible
        # indicators
        raw_parts = re.split(r'(?<=[0-9][0-9][0-9][0-9])(\s|.)', raw_date)

        # index over split line and add dates
        inner_dates = []
        for raw_part in raw_parts:
            # consider any string without either a month or year not a date
            no_month = False
            if re.search(months, raw_part.lower()) is None:
                no_month = True
                if re.search('[0-9][0-9][0-9][0-9]', raw_part) is None:
                    continue
            # strip parenthesis from the raw string (this messes with the date
            # parser)
            raw_part = raw_part.replace('(', '').replace(')', '')
            # try to grab a date from the string using an intelligent library
            try:
                d = dparser.parse(raw_part, fuzzy=True).date()
            except (ValueError, OverflowError):
                # Narrowed from a bare `except:` — dateutil raises
                # ValueError (ParserError) on unparseable text and
                # OverflowError on out-of-range numbers; a bare except
                # also swallowed KeyboardInterrupt/SystemExit.
                continue

            # split on either the month or the first number (e.g. for a
            # 1/1/2016 date) to get the text before it
            if no_month:
                text = re.compile(r'(\d+)').split(raw_part.lower())[0].strip()
            else:
                text = months.split(raw_part.lower())[0].strip()
            # remove footnotes and non-alphanumeric characters
            text = re.sub(r'(\[fn.?\])', '', text)
            text = re.sub('[^A-Za-z ]', '', text).strip()
            # if we ended up getting some text, add it, else ignore it
            if text:
                inner_dates.append((clean_string(text), d))
            else:
                inner_dates.append((None, d))
        dates.append(inner_dates)
    return dates
Esempio n. 32
0
def parse_dates(raw_dates):
    """Parses the dates from a list of strings.

    Returns a list of lists of (string, datetime) tuples if there is a string
    before the date (or None).

    :param raw_dates: A list of (probably) date-containing strings
    """
    months = re.compile("january|february|march|april|may|june|july|august|"
                        "september|october|november|december")
    dates = []
    for raw_date in raw_dates:
        # there can be multiple years in a string, so we split on possible
        # indicators
        raw_parts = re.split('(?<=[0-9][0-9][0-9][0-9])(\s|.)', raw_date)

        # index over split line and add dates
        inner_dates = []
        for raw_part in raw_parts:
            # consider any string without either a month or year not a date
            no_month = False
            if re.search(months, raw_part.lower()) is None:
                no_month = True
                if re.search('[0-9][0-9][0-9][0-9]', raw_part) is None:
                    continue
            # strip parenthesis from the raw string (this messes with the date
            # parser)
            raw_part = raw_part.replace('(', '').replace(')', '')
            # try to grab a date from the string using an intelligent library
            try:
                d = dparser.parse(raw_part, fuzzy=True).date()
            # NOTE(review): bare except silently skips any failure here,
            # including KeyboardInterrupt — consider narrowing to
            # (ValueError, OverflowError).
            except:
                continue

            # split on either the month or the first number (e.g. for a
            # 1/1/2016 date) to get the text before it
            if no_month:
                text = re.compile('(\d+)').split(raw_part.lower())[0].strip()
            else:
                text = months.split(raw_part.lower())[0].strip()
            # remove footnotes and non-alphanumeric characters
            text = re.sub('(\[fn.?\])', '', text)
            text = re.sub('[^A-Za-z ]', '', text).strip()
            # if we ended up getting some text, add it, else ignore it
            if text:
                inner_dates.append((clean_string(text), d))
            else:
                inner_dates.append((None, d))
        dates.append(inner_dates)
    return dates
Esempio n. 33
0
 def test_normalize_phrase(self):
     """Tests normalization of case titles."""
     test_pairs = (
         ("Commissioner v. Palin", "palin"),
         ("Commr v. Palin", "palin"),
         ("Comm'r v. Palin", "palin"),
         ("United States v. Learned Hand et. al.",
          "unitedstateslearnedhand"),
         ("Baker, Plaintiff v. Palin, Defendant", "bakerpalin"),
     )
     for raw_title, expected in test_pairs:
         normalized = normalize_phrase(harmonize(clean_string(raw_title)))
         self.assertEqual(normalized, expected)
Esempio n. 34
0
 def _get_case_dates(self):
     """Parse a '%B %d, %Y' or '%B %d %Y' date from each matched row.

     NOTE(review): if neither format matches, `case_date` is unbound on
     the first row (NameError) or stale from a previous row — confirm
     the source data always parses.
     """
     case_dates = []
     for e in self.html.xpath(self.base_path):
         s = html.tostring(e, method='text', encoding='unicode')
         # Keep everything up to (and including) the 4-digit year.
         s = re.search('(.*[0-9]{4})', s).group(1)
         date_formats = ['%B %d, %Y',
                         '%B %d %Y']
         for format in date_formats:
             try:
                 case_date = date.fromtimestamp(
                     time.mktime(time.strptime(clean_string(s), format)))
             except ValueError:
                 continue
         case_dates.append(case_date)
     return case_dates
 def _clean_attributes(self):
     """Iterate over attribute values and clean them.

     Download URLs are merely stripped; other string values go through
     clean_string(), and case names are additionally harmonized.
     """
     for attr in self._all_attrs:
         item = getattr(self, attr)
         if item is not None:
             cleaned_item = []
             for sub_item in item:
                 if attr == 'download_urls':
                     sub_item = sub_item.strip()
                 else:
                     if isinstance(sub_item, basestring):
                         sub_item = clean_string(sub_item)
                     if attr == 'case_names':
                         sub_item = harmonize(sub_item)
                 cleaned_item.append(sub_item)
             self.__setattr__(attr, cleaned_item)
Esempio n. 36
0
 def _clean_attributes(self):
     """Iterate over attribute values and clean them.

     Download URLs are merely stripped; other string values go through
     clean_string(), and case names and docket numbers are additionally
     harmonized.
     """
     for attr in self._all_attrs:
         item = getattr(self, attr)
         if item is not None:
             cleaned_item = []
             for sub_item in item:
                 if attr == "download_urls":
                     sub_item = sub_item.strip()
                 else:
                     if isinstance(sub_item, basestring):
                         sub_item = clean_string(sub_item)
                     if attr in ["case_names", "docket_numbers"]:
                         sub_item = harmonize(sub_item)
                 cleaned_item.append(sub_item)
             self.__setattr__(attr, cleaned_item)
Esempio n. 37
0
 def _get_case_dates(self):
     """Parse the date before the first dash of each list-item link.

     NOTE(review): if no format matches, `case_date` is unbound on the
     first row (NameError) or stale from a previous row — confirm the
     source data always parses.
     """
     case_dates = []
     for e in self.html.xpath('//li[@class="MsoNormal"]/span/a[1]'):
         s = html.tostring(e, method="text", encoding="unicode")
         # Cleanup...
         s = s.split("-")[0]
         s = s.split("–")[0]
         date_formats = ["%B %d, %Y", "%B %d %Y", "%B %d , %Y"]
         for format in date_formats:
             try:
                 case_date = date.fromtimestamp(
                     time.mktime(time.strptime(clean_string(s), format)))
             except ValueError:
                 continue
         case_dates.append(case_date)
     return case_dates
Esempio n. 38
0
 def _get_case_dates(self):
     """Parse a '%B %d, %Y' or '%B %d %Y' date out of each list item.

     NOTE(review): if neither format matches, `case_date` is unbound on
     the first row (NameError) or stale from a previous row — confirm
     the source data always parses.
     """
     case_dates = []
     for e in self.html.xpath(
             '//div[@id = "block-system-main"]//div[contains(concat(" ", @class, " "), " field-items ")]//li'
     ):
         s = html.tostring(e, method='text', encoding='unicode')
         # Keep everything up to (and including) the 4-digit year.
         s = re.search('(.*[0-9]{4})', s).group(1)
         date_formats = ['%B %d, %Y', '%B %d %Y']
         for format in date_formats:
             try:
                 case_date = date.fromtimestamp(
                     time.mktime(time.strptime(clean_string(s), format)))
             except ValueError:
                 continue
         case_dates.append(case_date)
     return case_dates
 def _get_case_dates(self):
     """Parse the date before the first dash of each list-item link.

     NOTE(review): if no format matches, `case_date` is unbound on the
     first row (NameError) or stale from a previous row — confirm the
     source data always parses.
     """
     case_dates = []
     for e in self.html.xpath('//li[@class="MsoNormal"]/span/a[1]'):
         s = html.tostring(e, method='text', encoding='unicode')
         # Cleanup...
         s = s.split(u'-')[0]
         s = s.split(u'–')[0]
         date_formats = ['%B %d, %Y',
                         '%B %d %Y',
                         '%B %d , %Y']
         for format in date_formats:
             try:
                 case_date = date.fromtimestamp(time.mktime(time.strptime(clean_string(s), format)))
             except ValueError:
                 continue
         case_dates.append(case_date)
     return case_dates
Esempio n. 40
0
 def _clean_attributes(self):
     """Iterate over attribute values and clean them.

     Download URLs are stripped; strings are cleaned (and harmonized
     for case names / docket numbers); datetimes are reduced to dates.
     """
     for attr in self._all_attrs:
         item = getattr(self, attr)
         if item is not None:
             cleaned_item = []
             for sub_item in item:
                 if attr == 'download_urls':
                     sub_item = sub_item.strip()
                 else:
                     if isinstance(sub_item, six.string_types):
                         sub_item = clean_string(sub_item)
                     elif isinstance(sub_item, datetime):
                         sub_item = sub_item.date()
                     if attr in ['case_names', 'docket_numbers']:
                         sub_item = harmonize(sub_item)
                 cleaned_item.append(sub_item)
             self.__setattr__(attr, cleaned_item)
Esempio n. 41
0
 def _clean_attributes(self):
     """Iterate over attribute values and clean them.

     Download URLs are stripped; strings are cleaned (and harmonized
     for case names / docket numbers); datetimes are reduced to dates.
     """
     for attr in self._all_attrs:
         item = getattr(self, attr)
         if item is not None:
             cleaned_item = []
             for sub_item in item:
                 if attr == "download_urls":
                     sub_item = sub_item.strip()
                 else:
                     if isinstance(sub_item, six.string_types):
                         sub_item = clean_string(sub_item)
                     elif isinstance(sub_item, datetime):
                         sub_item = sub_item.date()
                     if attr in ["case_names", "docket_numbers"]:
                         sub_item = harmonize(sub_item)
                 cleaned_item.append(sub_item)
             self.__setattr__(attr, cleaned_item)
Esempio n. 42
0
def get_case_name(complete_html_tree, case_path):
    """Extract and normalize the case name from the page's <title> text.

    Falls back to the manual ``fixes`` table and, in the relevant DEBUG
    modes, an interactive prompt, when the title yields nothing.
    """
    path = "//head/title/text()"
    # Text looks like: 'In re 221A Holding Corp., Inc, 1 BR 506 - Dist. Court, ED Pennsylvania 1979'
    # Keep the text before the last "-" and then before the last "," of that,
    # which drops the court/year tail and the trailing citation fragment.
    s = complete_html_tree.xpath(path)[0].rsplit("-", 1)[0].rsplit(",", 1)[0]
    # returns 'In re 221A Holding Corp., Inc.'
    case_name = harmonize(clean_string(titlecase(s)))
    # NOTE(review): this tests the raw fragment ``s`` rather than the computed
    # ``case_name`` — presumably equivalent when the title is empty; confirm.
    if not s:
        try:
            # Manual override previously recorded for this file, if any.
            case_name = fixes[case_path]["case_name"]
        except KeyError:
            if "input_case_names" in DEBUG:
                if "firefox" in DEBUG:
                    # Open the page so the operator can read the name off it.
                    subprocess.Popen(["firefox", "file://%s" % case_path], shell=False).communicate()
                input_case_name = raw_input("  No case name found. What should be here? ")
                input_case_name = unicode(input_case_name)
                # Remember the manual answer for future runs.
                add_fix(case_path, {"case_name": input_case_name})
                case_name = input_case_name

    if "case_name" in DEBUG:
        log_print("  Case name: %s" % case_name)
    return case_name
Esempio n. 43
0
def make_line_to_dict(row):
    """Convert one tab-separated line of case metadata into a dict.

    Expects ten tab-delimited columns in a fixed order; ``date_argued`` is
    parsed as an ISO (%Y-%m-%d) date. String fields are cleaned with
    ``clean_string`` (except ``url``, which is only stripped), and the
    case name and docket number are additionally passed through
    ``harmonize``.
    """
    columns = row.split('\t')
    item = {
        'court_code': columns[0],
        'docket_number': columns[1],
        'case_name': columns[2],
        'url': columns[3],
        'size': columns[4],
        'counsel': columns[5],
        'issues': columns[6],
        'judges': columns[7],
        'date_argued': datetime.strptime(columns[8], '%Y-%m-%d').date(),
        'orig_url': columns[9],
    }

    for key, value in item.iteritems():
        if key == 'url':
            # URLs are only whitespace-stripped, never rewritten.
            item['url'] = value.strip()
        else:
            if isinstance(value, basestring):
                item[key] = clean_string(value)
                if key in ['case_name', 'docket_number']:
                    # Harmonize the *cleaned* value. (Bug fix: previously the
                    # raw ``value`` was harmonized, silently discarding the
                    # clean_string() result stored one line above.)
                    item[key] = harmonize(item[key])
    return item
Esempio n. 44
0
def make_line_to_dict(row):
    """Convert one tab-separated line of case metadata into a dict.

    Expects ten tab-delimited columns in a fixed order; ``date_argued`` is
    parsed as an ISO (%Y-%m-%d) date. String fields are cleaned with
    ``clean_string`` (except ``url``, which is only stripped), and the
    case name and docket number are additionally passed through
    ``harmonize``.
    """
    columns = row.split('\t')
    item = {
        'court_code':    columns[0],
        'docket_number': columns[1],
        'case_name':     columns[2],
        'url':           columns[3],
        'size':          columns[4],
        'counsel':       columns[5],
        'issues':        columns[6],
        'judges':        columns[7],
        'date_argued':   datetime.strptime(columns[8], '%Y-%m-%d').date(),
        'orig_url':      columns[9],
    }

    for key, value in item.iteritems():
        if key == 'url':
            # URLs are only whitespace-stripped, never rewritten.
            item['url'] = value.strip()
        else:
            if isinstance(value, basestring):
                item[key] = clean_string(value)
                if key in ['case_name', 'docket_number']:
                    # Harmonize the *cleaned* value. (Bug fix: previously the
                    # raw ``value`` was harmonized, silently discarding the
                    # clean_string() result stored one line above.)
                    item[key] = harmonize(item[key])
    return item
Esempio n. 45
0
 def sanitize_text(self, text):
     """Clean scraped text: apply ``clean_string``, convert literal
     backslash-n sequences into real newlines, and replace en dashes
     with plain hyphens."""
     cleaned = clean_string(text)
     cleaned = cleaned.replace('\\n', '\n')
     return cleaned.replace(u'\u2013', '-')
Esempio n. 46
0
def get_court_object(html, citations=None, case_path=None, judge=None):
    """Parse out the court string, somehow, and then map it back to our
    internal ids.

    Strategies are tried in order: (1) reporter citations, (2) regexes over
    the centered/bold text elements, (3) the joined text elements, (4) judge
    disambiguation, (5) the manual ``fixes`` table and the interactive /
    logging behaviors selected via ``DEBUG``.
    """

    def string_to_key(str):  # NOTE(review): parameter shadows builtin ``str``.
        """Given a string, tries to map it to a court key."""
        # State
        for regex, value in state_pairs:
            if re.search(regex, str):
                return value

        # Supreme Court
        if re.search("Supreme Court of (the )?United States", str) or re.search("United States Supreme Court", str):
            return "scotus"

        # Federal appeals
        if re.search("Court,? of Appeal", str) or "Circuit of Appeals" in str:
            if "First Circuit" in str or "First District" in str:
                return "ca1"
            elif "Second Circuit" in str or "Second District" in str:
                return "ca2"
            elif "Third Circuit" in str:
                return "ca3"
            elif "Fourth Circuit" in str:
                return "ca4"
            elif "Fifth Circuit" in str:
                return "ca5"
            elif "Sixth Circuit" in str:
                return "ca6"
            elif "Seventh Circuit" in str:
                return "ca7"
            elif "Eighth" in str:  # Aka, apparently, "Eighth Court"
                return "ca8"
            elif re.search("Ninth (Judicial )?Circuit", str):
                return "ca9"
            elif "Tenth Circuit" in str:
                return "ca10"
            elif "Eleventh Circuit" in str:
                return "ca11"
            elif "District of Columbia" in str:
                return "cadc"
            elif "Federal Circuit" in str:
                return "cafc"
            elif "Emergency" in str:
                return "eca"
            elif "Columbia" in str:
                return "cadc"
        elif "Judicial Council of the Eighth Circuit" in str:
            return "ca8"
        elif "Judicial Council of the Ninth Circuit" in str or re.search("Ninth Judicial Circuit", str):
            return "ca9"

        # Federal district
        elif re.search("(^| )Distr?in?ct", str, re.I):
            for regex, value in fd_pairs:
                if re.search(regex, str):
                    return value
        elif "D. Virgin Islands" in str:
            return "vid"
        elif "Territorial Court" in str:
            if "Virgin Islands" in str:
                return "vid"

        # Federal special
        elif "United States Judicial Conference Committee" in str or "U.S. Judicial Conference Committee" in str:
            return "usjc"
        elif re.search("Judicial Panel ((on)|(of)) Multidistrict Litigation", str, re.I):
            return "jpml"
        elif "Court of Customs and Patent Appeals" in str:
            return "ccpa"
        elif "Court of Claims" in str or "Claims Court" in str:
            return "cc"  # Cannot change
        elif "United States Foreign Intelligence Surveillance Court" in str:
            return "fiscr"  # Cannot change
        elif re.search("Court,? of,? International ?Trade", str):
            return "cit"
        elif "United States Customs Court" in str:
            return "cusc"  # Cannot change?
        elif re.search("Special Court(\.|,)? Regional Rail Reorganization Act", str):
            return "reglrailreorgct"
        elif re.search("Military Commission Review", str):
            return "mc"

        # Bankruptcy Courts
        elif re.search("bankrup?tcy", str, re.I):
            # Bankruptcy Appellate Panels
            if re.search("Appellan?te Panel", str, re.I):
                if "First Circuit" in str:
                    return "bap1"
                elif "Second Circuit" in str:
                    return "bap2"
                elif "Sixth Circuit" in str:
                    return "bap6"
                elif "Eighth Circuit" in str:
                    return "bap8"
                elif "Ninth Circuit" in str:
                    return "bap9"
                elif "Tenth Circuit" in str:
                    return "bap10"
                elif "Maine" in str:
                    return "bapme"
                elif "Massachusetts" in str:
                    return "bapma"

            # Bankruptcy District Courts
            else:
                for regex, value in fb_pairs:
                    if re.search(regex, str):
                        return value
        else:
            return False

    path = "//center/p/b/text()"
    text_elements = html.xpath(path)
    court = None

    # 1: try using the citations as a clue (necessary first because calctapp calls itself simply, "Court of Appeal,
    # Second District")
    if citations:
        reporter_keys = [citation.canonical_reporter for citation in citations]
        if "Cal. Rptr." in reporter_keys or "Cal. App." in reporter_keys:
            # It's a california court, but which?
            for text_element in text_elements:
                text_element = clean_string(text_element).strip(".")
                if re.search("court of appeal", text_element, re.I):
                    court = "calctapp"
                else:
                    court = "cal"
        elif "U.S." in reporter_keys:
            court = "scotus"

    # 2: Try using a bunch of regular expressions (this catches 95% of items)
    if not court:
        for text_element in text_elements:
            text_element = clean_string(text_element).strip(".")
            court = string_to_key(text_element)
            if court:
                break

    # 3: try the text elements joined together (works if there were line break problems)
    if not court:
        t = clean_string(" ".join(text_elements)).strip(".")
        court = string_to_key(t)

    # 4: Disambiguate by judge
    if not court and judge:
        court = disambiguate_by_judge(judge)
        if court and "log_judge_disambiguations" in DEBUG:
            with open("disambiguated_by_judge.txt", "a") as f:
                f.write("%s\t%s\t%s\n" % (case_path, court, judge.encode("ISO-8859-1")))

    # 5: give up.
    if not court:
        try:
            court = fixes[case_path]["court"]
        except KeyError:
            if "input_court" in DEBUG:
                if "firefox" in DEBUG:
                    subprocess.Popen(["firefox", "file://%s" % case_path], shell=False).communicate()
                court = raw_input("No court identified! What should be here? ")
                # Bug fix: record the court the operator just typed.
                # Previously the builtin ``input`` *function object* was
                # stored here instead of the entered string.
                add_fix(case_path, {"court": court})
            if "log_bad_courts" in DEBUG:
                # Write the failed case out to file.
                court = "test"
                with open("missing_courts.txt", "a") as out:
                    out.write("%s\n" % case_path)

    if "court" in DEBUG:
        log_print("  Court: %s" % court)

    return court
Esempio n. 47
0
def get_docket_number(html, case_path=None, court=None):
    """Find a docket number in the case page's centered text elements.

    Scans each ``<center>`` text node (or ``html`` itself if it is a plain
    string) against a battery of docket-number regexes, strips common
    "No."/"Nos."/"Docket No." prefixes and wrapping parens, filters known
    false positives, and finally falls back to the manual ``fixes`` table
    or the interactive/logging behaviors selected via ``DEBUG``.
    """
    try:
        path = "//center/text()"
        text_elements = html.xpath(path)
    except AttributeError:
        # Not an HTML element, instead it's a string
        text_elements = [html]
    # Leading labels that commonly precede a docket number.
    docket_no_formats = [
        "Bankruptcy",
        "C.A.",
        "Case",
        "Civ",
        "Civil",
        "Civil Action",
        "Crim",
        "Criminal Action",
        "Docket",
        "Misc",
        "Record",
    ]
    regexes = [
        re.compile("((%s)( Nos?\.)?)|(Nos?(\.| )?)" % "|".join(map(re.escape, docket_no_formats)), re.IGNORECASE),
        re.compile("\d{2}-\d{2,5}"),  # WY-03-071, 01-21574
        re.compile("[A-Z]{2}-[A-Z]{2}"),  # CA-CR 5158
        re.compile("[A-Z]\d{2} \d{4}[A-Z]"),  # C86 1392M
        re.compile("\d{2} [A-Z] \d{4}"),  # 88 C 4330
        re.compile("[A-Z]-\d{2,4}"),  # M-47B (VLB), S-5408
        re.compile("[A-Z]\d{3,}"),
        re.compile("[A-Z]{4,}"),  # SCBD #4983
        re.compile("\d{5,}"),  # 95816
        re.compile("\d{2},\d{3}"),  # 86,782
        re.compile("([A-Z]\.){4}"),  # S.C.B.D. 3020
        re.compile("\d{2}-[a-z]{2}-\d{4}"),
    ]

    docket_number = None
    outer_break = False
    for t in text_elements:
        if outer_break:
            # Allows breaking the outer loop from the inner loop
            break
        t = clean_string(t).strip(".")
        for regex in regexes:
            if re.search(regex, t):
                # First element matching any regex wins.
                docket_number = t
                outer_break = True
                break

    if docket_number:
        # Strip leading labels; slice lengths also skip the following space.
        if docket_number.startswith("No."):
            docket_number = docket_number[4:]
        elif docket_number.startswith("Nos."):
            docket_number = docket_number[5:]
        elif docket_number.startswith("Docket No."):
            docket_number = docket_number[11:]
        if re.search("^\(.*\)$", docket_number):
            # Starts and ends with parens. Nuke 'em.
            docket_number = docket_number[1:-1]

    if docket_number and re.search("submitted|reversed", docket_number, re.I):
        # False positive. Happens when there's no docket number and the date is incorrectly interpreted.
        docket_number = None
    elif docket_number == "Not in Source":
        docket_number = None

    if not docket_number:
        try:
            # Manual override recorded for this file, if any.
            docket_number = fixes[case_path]["docket_number"]
        except KeyError:
            # These sources are known to frequently lack docket numbers, so
            # only prompt/log for paths/courts outside those buckets.
            if (
                "northeastern" not in case_path
                and "federal_reporter/2d" not in case_path
                and court not in ["or", "orctapp", "cal"]
                and ("unsorted" not in case_path and court not in ["ind"])
                and ("pacific_reporter/2d" not in case_path and court not in ["calctapp"])
            ):
                # Lots of missing docket numbers here.
                if "input_docket_number" in DEBUG:
                    subprocess.Popen(["firefox", "file://%s" % case_path], shell=False).communicate()
                    docket_number = raw_input("  No docket number found. What should be here? ")
                    add_fix(case_path, {"docket_number": docket_number})
                if "log_bad_docket_numbers" in DEBUG:
                    with open("missing_docket_numbers.txt", "a") as out:
                        out.write("%s\n" % case_path)

    if "docket_number" in DEBUG:
        log_print("  Docket Number: %s" % docket_number)
    return docket_number
Esempio n. 48
0
    def test_harmonize_and_clean_string_tests(self):
        """Tests various inputs for the clean_string and harmonize functions.

        Each entry of ``test_pairs`` is ``[raw, expected]``: the expected
        value is what ``harmonize(clean_string(raw))`` should return.
        """
        test_pairs = [
            # Et al
            ['Lissner, et. al.',
             u'Lissner'],
            ['Lissner, et. al',
             u'Lissner'],
            ['Lissner, et al.',
             u'Lissner'],
            ['Lissner, et al',
             u'Lissner'],
            ['Lissner et. al.',
             u'Lissner'],
            ['Lissner et. al',
             u'Lissner'],
            ['Lissner et al.',
             u'Lissner'],
            ['Lissner et al',
             u'Lissner'],

            # US --> United States
            ['US v. Lissner, Plaintiff',
             u'United States v. Lissner'],
            ['US v. Lissner, Petitioner-appellant',
             u'United States v. Lissner'],
            ['United States, Petitioner, v. Lissner',
             u'United States v. Lissner'],
            [
                'United States of America, Plaintiff-Appellee, v. Orlando B. '
                'Pino, Defendant-Appellant, Joseph',
                u'United States v. Orlando B. Pino, Joseph'],
            ['Herring v. U.S. **',
             u'Herring v. United States'],
            ['Test v. U.S',
             u'Test v. United States'],
            ['The United States v. Lissner',
             u'United States v. Lissner'],
            # Tests the output from a titlecased word containing
            # US to ensure it gets harmonized.
            ['Carver v. US',
             u'Carver v. United States'],
            # US Steel --> US Steel
            ['US Steel v.  US',
             u'US Steel v. United States'],
            ['US v. V.Vivack',
             u'United States v. V.Vivack'],
            ['US vs. Lissner',
             u'United States v. Lissner'],
            ['[email protected] vs. USA',
             u'[email protected] v. United States'],
            ['US v. US',
             u'United States v. United States'],
            ['US  Steel v.  US',
             u'US Steel v. United States'],
            ['U.S.A. v. Mr. v.',
             u'United States v. Mr. v.'],
            ['U.S.S. v. Lissner',
             u'U.S.S. v. Lissner'],
            ['USC v. Lissner',
             u'USC v. Lissner'],
            ['U.S.C. v. Lissner',
             u'U.S.C. v. Lissner'],
            ['U.S. Steel v. Colgate',
             u'U.S. Steel v. Colgate'],
            ['U.S.A. v. Lissner',
             u'United States v. Lissner'],
            ['U.S. v. Lissner',
             u'United States v. Lissner'],
            ['U. S. v. Lissner',
             u'United States v. Lissner'],
            ['United States v. Lissner',
             u'United States v. Lissner'],
            ['Usa v. Lissner',
             u'United States v. Lissner'],
            ['USA v. Lissner',
             u'United States v. Lissner'],
            ['United States of America v. Lissner',
             u'United States v. Lissner'],
            ['Lissner v. United States of America',
             u'Lissner v. United States'],

            # tests no period in v.
            ['USA v White',
             u'United States v. White'],
            # tests no period in vs.
            ['USA vs White',
             u'United States v. White'],
            ['V.Vivack and Associates v. US',
             u'V.Vivack and Associates v. United States'],
            ['v.v. Hendricks & Sons v. James v. Smith',
             u'v.v. Hendricks & Sons v. James v. Smith'],

            # Normalize "The State"
            ['Aimee v. The State',
             u'Aimee v. State'],

            # Nuke Pet (short for petitioners)
            ['Commonwealth v. Mickle, V., Pet.',
             u'Commonwealth v. Mickle v.'],
            # Unchanged, despite having the word Pet
            ['Pet Doctors inc. v. Spoon',
             u'Pet Doctors inc. v. Spoon'],

            # Nukes the No. and Nos., but not
            ['No. 23423',
             u'23423'],
            ['Nos. 23 and 232',
             u'23 and 232'],
            ['No Expletives Inc.',
             u'No Expletives Inc.'],
            # Tests that "Nothing" doesn't get nuked.
            ['No. 232 Nothing 232',
             '232 Nothing 232'],

            # Garbage
            # leading slash.
            ['/USA vs White',
             u'United States v. White'],
            # unicode input
            ['12–1438-cr',
             u'12–1438-cr'],

            # Randoms
            ['clarinet alibi',
             u'clarinet alibi'],
            ['papusa',
             u'papusa'],
            ['CUSANO',
             u'CUSANO'],

             # Filter out invalid XML characters
             [u'Special Counsel ex rel. Karla Saunders',
              u'Special Counsel ex rel. Karla Saunders'],
        ]
        # Exercise the full clean_string -> harmonize pipeline on each pair.
        for pair in test_pairs:
            self.assertEqual(harmonize(clean_string(pair[0])), pair[1])
Esempio n. 49
0
def get_docket_number(html, case_path=None, court=None):
    """Find a docket number in the case page's centered text elements.

    Falls back to the manual ``fixes`` table or the interactive/logging
    behaviors selected via ``DEBUG`` when nothing matches.
    """
    try:
        text_elements = html.xpath('//center/text()')
    except AttributeError:
        # Not an HTML element, instead it's a string
        text_elements = [html]

    # Labels that commonly precede a docket number.
    docket_no_formats = ['Bankruptcy', 'C.A.', 'Case', 'Civ', 'Civil',
                         'Civil Action', 'Crim', 'Criminal Action',
                         'Docket', 'Misc', 'Record']
    regexes = [
        re.compile('((%s)( Nos?\.)?)|(Nos?(\.| )?)' % "|".join(map(re.escape, docket_no_formats)), re.IGNORECASE),
        re.compile('\d{2}-\d{2,5}'),          # WY-03-071, 01-21574
        re.compile('[A-Z]{2}-[A-Z]{2}'),      # CA-CR 5158
        re.compile('[A-Z]\d{2} \d{4}[A-Z]'),  # C86 1392M
        re.compile('\d{2} [A-Z] \d{4}'),      # 88 C 4330
        re.compile('[A-Z]-\d{2,4}'),          # M-47B (VLB), S-5408
        re.compile('[A-Z]\d{3,}'),
        re.compile('[A-Z]{4,}'),              # SCBD #4983
        re.compile('\d{5,}'),                 # 95816
        re.compile('\d{2},\d{3}'),            # 86,782
        re.compile('([A-Z]\.){4}'),           # S.C.B.D. 3020
        re.compile('\d{2}-[a-z]{2}-\d{4}'),
    ]

    # First cleaned element matching any regex wins.
    docket_number = None
    for element in text_elements:
        candidate = clean_string(element).strip('.')
        if any(regex.search(candidate) for regex in regexes):
            docket_number = candidate
            break

    if docket_number:
        # Chop off a leading label (slice also skips the following space).
        for label, length in (('No.', 4), ('Nos.', 5), ('Docket No.', 11)):
            if docket_number.startswith(label):
                docket_number = docket_number[length:]
                break
        if re.search('^\(.*\)$', docket_number):
            # Starts and ends with parens. Nuke 'em.
            docket_number = docket_number[1:-1]

    if docket_number and re.search('submitted|reversed', docket_number, re.I):
        # False positive. Happens when there's no docket number and the date is incorrectly interpreted.
        docket_number = None
    elif docket_number == 'Not in Source':
        docket_number = None

    if not docket_number:
        try:
            docket_number = fixes[case_path]['docket_number']
        except KeyError:
            # Sources where missing docket numbers are expected and ignored.
            missing_expected = (
                'northeastern' in case_path or
                'federal_reporter/2d' in case_path or
                court in ['or', 'orctapp', 'cal'] or
                ('unsorted' in case_path or court in ['ind']) or
                ('pacific_reporter/2d' in case_path or court in ['calctapp'])
            )
            if not missing_expected:
                # Lots of missing docket numbers here.
                if 'input_docket_number' in DEBUG:
                    subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                    docket_number = raw_input('  No docket number found. What should be here? ')
                    add_fix(case_path, {'docket_number': docket_number})
                if 'log_bad_docket_numbers' in DEBUG:
                    with open('missing_docket_numbers.txt', 'a') as out:
                        out.write('%s\n' % case_path)

    if 'docket_number' in DEBUG:
        log_print('  Docket Number: %s' % docket_number)
    return docket_number
Esempio n. 50
0
 def test_harmonize_and_clean_string_tests(self):
     """Tests various inputs for the clean_string and harmonize functions.

     Each entry of ``test_pairs`` is ``[raw, expected]``: the expected
     value is what ``harmonize(clean_string(raw))`` should return.
     """
     test_pairs = [
         # Et al
         ["Lissner, et. al.", u"Lissner"],
         ["Lissner, et. al", u"Lissner"],
         ["Lissner, et al.", u"Lissner"],
         ["Lissner, et al", u"Lissner"],
         ["Lissner et. al.", u"Lissner"],
         ["Lissner et. al", u"Lissner"],
         ["Lissner et al.", u"Lissner"],
         ["Lissner et al", u"Lissner"],
         # US --> United States
         ["US v. Lissner, Plaintiff", u"United States v. Lissner"],
         ["US v. Lissner, Petitioner-appellant", u"United States v. Lissner"],
         ["United States, Petitioner, v. Lissner", u"United States v. Lissner"],
         [
             "United States of America, Plaintiff-Appellee, v. Orlando B. " "Pino, Defendant-Appellant, Joseph",
             u"United States v. Orlando B. Pino, Joseph",
         ],
         ["Herring v. U.S. **", u"Herring v. United States"],
         ["Test v. U.S", u"Test v. United States"],
         ["The United States v. Lissner", u"United States v. Lissner"],
         # Tests the output from a titlecased word containing
         # US to ensure it gets harmonized.
         ["Carver v. US", u"Carver v. United States"],
         # US Steel --> US Steel
         ["US Steel v.  US", u"US Steel v. United States"],
         ["US v. V.Vivack", u"United States v. V.Vivack"],
         ["US vs. Lissner", u"United States v. Lissner"],
         ["[email protected] vs. USA", u"[email protected] v. United States"],
         ["US v. US", u"United States v. United States"],
         ["US  Steel v.  US", u"US Steel v. United States"],
         ["U.S.A. v. Mr. v.", u"United States v. Mr. v."],
         ["U.S.S. v. Lissner", u"U.S.S. v. Lissner"],
         ["USC v. Lissner", u"USC v. Lissner"],
         ["U.S.C. v. Lissner", u"U.S.C. v. Lissner"],
         ["U.S. Steel v. Colgate", u"U.S. Steel v. Colgate"],
         ["U.S.A. v. Lissner", u"United States v. Lissner"],
         ["U.S. v. Lissner", u"United States v. Lissner"],
         ["U. S. v. Lissner", u"United States v. Lissner"],
         ["United States v. Lissner", u"United States v. Lissner"],
         ["Usa v. Lissner", u"United States v. Lissner"],
         ["USA v. Lissner", u"United States v. Lissner"],
         ["United States of America v. Lissner", u"United States v. Lissner"],
         ["Lissner v. United States of America", u"Lissner v. United States"],
         # tests no period in v.
         ["USA v White", u"United States v. White"],
         # tests no period in vs.
         ["USA vs White", u"United States v. White"],
         ["V.Vivack and Associates v. US", u"V.Vivack and Associates v. United States"],
         ["v.v. Hendricks & Sons v. James v. Smith", u"v.v. Hendricks & Sons v. James v. Smith"],
         # Normalize "The State"
         ["Aimee v. The State", u"Aimee v. State"],
         # Nuke Pet (short for petitioners)
         ["Commonwealth v. Mickle, V., Pet.", u"Commonwealth v. Mickle v."],
         # Unchanged, despite having the word Pet
         ["Pet Doctors inc. v. Spoon", u"Pet Doctors inc. v. Spoon"],
         # Nukes the No. and Nos., but not
         ["No. 23423", u"23423"],
         ["Nos. 23 and 232", u"23 and 232"],
         ["No Expletives Inc.", u"No Expletives Inc."],
         # Tests that "Nothing" doesn't get nuked.
         ["No. 232 Nothing 232", "232 Nothing 232"],
         # Garbage
         # leading slash.
         ["/USA vs White", u"United States v. White"],
         # unicode input
         ["12–1438-cr", u"12–1438-cr"],
         # Randoms
         ["clarinet alibi", u"clarinet alibi"],
         ["papusa", u"papusa"],
         ["CUSANO", u"CUSANO"],
         # Filter out invalid XML characters
         [u"Special Counsel ex rel. Karla Saunders", u"Special Counsel ex rel. Karla Saunders"],
     ]
     # Exercise the full clean_string -> harmonize pipeline on each pair.
     for pair in test_pairs:
         self.assertEqual(harmonize(clean_string(pair[0])), pair[1])
Esempio n. 51
0
def get_court_object(html, citations=None, case_path=None, judge=None):
    """Parse out the court string, somehow, and then map it back to our
    internal ids.

    Strategies are tried in order: (1) reporter citations, (2) regexes over
    the centered/bold text elements, (3) the joined text elements, (4) judge
    disambiguation, (5) the manual ``fixes`` table and the interactive /
    logging behaviors selected via ``DEBUG``.
    """
    def string_to_key(str):  # NOTE(review): parameter shadows builtin ``str``.
        """Given a string, tries to map it to a court key."""
        # State
        for regex, value in state_pairs:
            if re.search(regex, str):
                return value

        # Supreme Court
        if re.search('Supreme Court of (the )?United States', str) or \
            re.search('United States Supreme Court', str):
            return 'scotus'

        # Federal appeals
        if re.search('Court,? of Appeal', str) or \
                                 'Circuit of Appeals' in str:
            if 'First Circuit' in str or \
                    'First District' in str:
                return 'ca1'
            elif 'Second Circuit' in str or \
                    'Second District' in str:
                return 'ca2'
            elif 'Third Circuit' in str:
                return 'ca3'
            elif 'Fourth Circuit' in str:
                return 'ca4'
            elif 'Fifth Circuit' in str:
                return 'ca5'
            elif 'Sixth Circuit' in str:
                return 'ca6'
            elif 'Seventh Circuit' in str:
                return 'ca7'
            elif 'Eighth' in str:  # Aka, apparently, "Eighth Court"
                return 'ca8'
            elif re.search('Ninth (Judicial )?Circuit', str):
                return 'ca9'
            elif 'Tenth Circuit' in str:
                return 'ca10'
            elif 'Eleventh Circuit' in str:
                return 'ca11'
            elif 'District of Columbia' in str:
                return 'cadc'
            elif 'Federal Circuit' in str:
                return 'cafc'
            elif 'Emergency' in str:
                return 'eca'
            elif 'Columbia' in str:
                return 'cadc'
        elif 'Judicial Council of the Eighth Circuit' in str:
            return 'ca8'
        elif 'Judicial Council of the Ninth Circuit' in str or \
                re.search('Ninth Judicial Circuit', str):
            return 'ca9'

        # Federal district
        elif re.search('(^| )Distr?in?ct', str, re.I):
            for regex, value in fd_pairs:
                if re.search(regex, str):
                    return value
        elif 'D. Virgin Islands' in str:
            return 'vid'
        elif 'Territorial Court' in str:
            if 'Virgin Islands' in str:
                return 'vid'

        # Federal special
        elif 'United States Judicial Conference Committee' in str or \
                'U.S. Judicial Conference Committee' in str:
            return 'usjc'
        elif re.search('Judicial Panel ((on)|(of)) Multidistrict Litigation', str, re.I):
            return 'jpml'
        elif 'Court of Customs and Patent Appeals' in str:
            return 'ccpa'
        elif 'Court of Claims' in str or \
            'Claims Court' in str:
            return 'cc'  # Cannot change
        elif 'United States Foreign Intelligence Surveillance Court' in str:
            return 'fiscr'  # Cannot change
        elif re.search('Court,? of,? International ?Trade', str):
            return 'cit'
        elif 'United States Customs Court' in str:
            return 'cusc'  # Cannot change?
        elif re.search('Special Court(\.|,)? Regional Rail Reorganization Act', str):
            return 'reglrailreorgct'
        elif re.search('Military Commission Review', str):
            return 'mc'

        # Bankruptcy Courts
        elif re.search('bankrup?tcy', str, re.I):
            # Bankruptcy Appellate Panels
            if re.search('Appellan?te Panel', str, re.I):
                if 'First Circuit' in str:
                    return 'bap1'
                elif 'Second Circuit' in str:
                    return 'bap2'
                elif 'Sixth Circuit' in str:
                    return 'bap6'
                elif 'Eighth Circuit' in str:
                    return 'bap8'
                elif 'Ninth Circuit' in str:
                    return 'bap9'
                elif 'Tenth Circuit' in str:
                    return 'bap10'
                elif 'Maine' in str:
                    return 'bapme'
                elif 'Massachusetts' in str:
                    return 'bapma'

            # Bankruptcy District Courts
            else:
                for regex, value in fb_pairs:
                    if re.search(regex, str):
                        return value
        else:
            return False

    path = '//center/p/b/text()'
    text_elements = html.xpath(path)
    court = None

    # 1: try using the citations as a clue (necessary first because calctapp calls itself simply, "Court of Appeal,
    # Second District")
    if citations:
        reporter_keys = [citation.canonical_reporter for citation in citations]
        if 'Cal. Rptr.' in reporter_keys or 'Cal. App.' in reporter_keys:
            # It's a california court, but which?
            for text_element in text_elements:
                text_element = clean_string(text_element).strip('.')
                if re.search('court of appeal', text_element, re.I):
                    court = 'calctapp'
                else:
                    court = 'cal'
        elif 'U.S.' in reporter_keys:
            court = 'scotus'

    # 2: Try using a bunch of regular expressions (this catches 95% of items)
    if not court:
        for text_element in text_elements:
            text_element = clean_string(text_element).strip('.')
            court = string_to_key(text_element)
            if court:
                break

    # 3: try the text elements joined together (works if there were line break problems)
    if not court:
        t = clean_string(' '.join(text_elements)).strip('.')
        court = string_to_key(t)

    # 4: Disambiguate by judge
    if not court and judge:
        court = disambiguate_by_judge(judge)
        if court and 'log_judge_disambiguations' in DEBUG:
            with open('disambiguated_by_judge.txt', 'a') as f:
                f.write('%s\t%s\t%s\n' % (case_path, court, judge.encode('ISO-8859-1')))

    # 5: give up.
    if not court:
        try:
            court = fixes[case_path]['court']
        except KeyError:
            if 'input_court' in DEBUG:
                if 'firefox' in DEBUG:
                    subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                court = raw_input("No court identified! What should be here? ")
                # Bug fix: record the court the operator just typed.
                # Previously the builtin ``input`` *function object* was
                # stored here instead of the entered string.
                add_fix(case_path, {'court': court})
            if 'log_bad_courts' in DEBUG:
                # Write the failed case out to file.
                court = 'test'
                with open('missing_courts.txt', 'a') as out:
                    out.write('%s\n' % case_path)

    if 'court' in DEBUG:
        log_print('  Court: %s' % court)

    return court
Esempio n. 52
0
 def test_harmonize_and_clean_string_tests(self):
     """Tests various inputs for the clean_string and harmonize functions.

     Each pair is (raw input, expected result of harmonize(clean_string(raw))).
     """
     test_pairs = [
         # "United States" normalization of the first party.
         ('U.S.A. v. Lissner', u'United States v. Lissner'),
         ('U.S. v. Lissner', u'United States v. Lissner'),
         ('U. S. v. Lissner', u'United States v. Lissner'),
         ('United States v. Lissner', u'United States v. Lissner'),
         ('Usa v. Lissner', u'United States v. Lissner'),
         ('USA v. Lissner', u'United States v. Lissner'),
         ('United States of America v. Lissner',
          u'United States v. Lissner'),
         ('Lissner v. United States of America',
          u'Lissner v. United States'),
         ('V.Vivack and Associates v. US',
          u'V.Vivack and Associates v. United States'),
         ('v.v. Hendricks & Sons v. James v. Smith',
          u'v.v. Hendricks & Sons v. James v. Smith'),
         ('U.S.A. v. Mr. v.', u'United States v. Mr. v.'),
         # Strings that merely resemble "US" are left alone.
         ('U.S.S. v. Lissner', u'U.S.S. v. Lissner'),
         ('USC v. Lissner', u'USC v. Lissner'),
         ('U.S.C. v. Lissner', u'U.S.C. v. Lissner'),
         ('U.S. Steel v. Colgate', u'U.S. Steel v. Colgate'),
         ('papusa', u'papusa'),
         ('CUSANO', u'CUSANO'),
         ('US Steel v.  US', u'US Steel v. United States'),
         ('US v. V.Vivack', u'United States v. V.Vivack'),
         ('US vs. Lissner', u'United States v. Lissner'),
         ('[email protected] vs. USA', u'[email protected] v. United States'),
         ('US v. US', u'United States v. United States'),
         ('US  Steel v.  US', u'US Steel v. United States'),
         # "et al." variants get stripped.
         ('Lissner, et. al.', u'Lissner'),
         ('Lissner, et. al', u'Lissner'),
         ('Lissner, et al.', u'Lissner'),
         ('Lissner, et al', u'Lissner'),
         ('Lissner et. al.', u'Lissner'),
         ('Lissner et. al', u'Lissner'),
         ('Lissner et al.', u'Lissner'),
         ('Lissner et al', u'Lissner'),
         ('clarinet alibi', u'clarinet alibi'),
         # Party-role labels are removed.
         ('US v. Lissner, Plaintiff', u'United States v. Lissner'),
         ('US v. Lissner, Petitioner-appellant',
          u'United States v. Lissner'),
         ('United States, Petitioner, v. Lissner',
          u'United States v. Lissner'),
         ('United States of America, Plaintiff-Appellee, v. Orlando B. Pino, Defendant-Appellant, Joseph',
          u'United States v. Orlando B. Pino, Joseph'),
         ('Herring v. U.S. **', u'Herring v. United States'),
         ('Test v. U.S', u'Test v. United States'),
         ('The United States v. Lissner', u'United States v. Lissner'),
         ('USA v White',  # tests no period in v.
          u'United States v. White'),
         ('USA vs White',  # tests no period in vs.
          u'United States v. White'),
         ('/USA vs White',  # tests leading slash.
          u'United States v. White'),
         ('12–1438-cr',  # tests unicode input
          u'12–1438-cr'),
         ('Carver v. US',
          # Tests the output from a titlecased word containing US to ensure
          # it gets harmonized.
          u'Carver v. United States'),
         ('Aimee v. The State',  # Normalize "The State"
          u'Aimee v. State'),
         # Nuke Pet (short for petitioners)
         ('Commonwealth v. Mickle, V., Pet.',
          u'Commonwealth v. Mickle v.'),
         # Unchanged, despite having the word Pet
         ('Pet Doctors inc. v. Spoon', u'Pet Doctors inc. v. Spoon'),
     ]
     for raw_name, expected in test_pairs:
         self.assertEqual(harmonize(clean_string(raw_name)), expected)
Esempio n. 53
0
def get_court_object(html, citations=None, case_path=None, judge=None):
    """Parse out the court string, somehow, and then map it back to our
    internal ids.

    Strategies are tried in order:
      1. Citation reporters (needed first because calctapp calls itself
         simply "Court of Appeal, Second District").
      2. Regexes against each header text element.
      3. Regexes against all header text elements joined together.
      4. Disambiguation by judge.
      5. The manual fixes file, or operator input (depending on DEBUG flags).

    :param html: parsed lxml tree of the case document.
    :param citations: citation objects; their canonical_reporter values can
        hint at the court.
    :param case_path: path to the case file, used for logging and fixes.
    :param judge: judge name, used as a last-ditch disambiguator.
    :return: a court key string, or a falsy value if nothing was found.
    """
    def string_to_key(text):
        """Given a string, tries to map it to a court key."""
        # State courts
        for regex, value in state_pairs:
            if re.search(regex, text):
                return value

        # Supreme Court
        if re.search(r'Supreme Court of (the )?United States', text) or \
                re.search(r'United States Supreme Court', text):
            return 'scotus'

        # Federal appeals
        if re.search(r'Court,? of Appeal', text) or \
                'Circuit of Appeals' in text:
            if 'First Circuit' in text or \
                    'First District' in text:
                return 'ca1'
            elif 'Second Circuit' in text or \
                    'Second District' in text:
                return 'ca2'
            elif 'Third Circuit' in text:
                return 'ca3'
            elif 'Fourth Circuit' in text:
                return 'ca4'
            elif 'Fifth Circuit' in text:
                return 'ca5'
            elif 'Sixth Circuit' in text:
                return 'ca6'
            elif 'Seventh Circuit' in text:
                return 'ca7'
            elif 'Eighth' in text:  # Aka, apparently, "Eighth Court"
                return 'ca8'
            elif re.search(r'Ninth (Judicial )?Circuit', text):
                return 'ca9'
            elif 'Tenth Circuit' in text:
                return 'ca10'
            elif 'Eleventh Circuit' in text:
                return 'ca11'
            elif 'District of Columbia' in text:
                return 'cadc'
            elif 'Federal Circuit' in text:
                return 'cafc'
            elif 'Emergency' in text:
                return 'eca'
            elif 'Columbia' in text:
                return 'cadc'
        elif 'Judicial Council of the Eighth Circuit' in text:
            return 'ca8'
        elif 'Judicial Council of the Ninth Circuit' in text or \
                re.search(r'Ninth Judicial Circuit', text):
            return 'ca9'

        # Federal district
        elif re.search(r'(^| )Distr?in?ct', text, re.I):
            for regex, value in fd_pairs:
                if re.search(regex, text):
                    return value
        elif 'D. Virgin Islands' in text:
            return 'vid'
        elif 'Territorial Court' in text:
            if 'Virgin Islands' in text:
                return 'vid'

        # Federal special
        elif 'United States Judicial Conference Committee' in text or \
                'U.S. Judicial Conference Committee' in text:
            return 'usjc'
        elif re.search(r'Judicial Panel ((on)|(of)) Multidistrict Litigation',
                       text, re.I):
            return 'jpml'
        elif 'Court of Customs and Patent Appeals' in text:
            return 'ccpa'
        elif 'Court of Claims' in text or \
                'Claims Court' in text:
            return 'cc'  # Cannot change
        elif 'United States Foreign Intelligence Surveillance Court' in text:
            return 'fiscr'  # Cannot change
        elif re.search(r'Court,? of,? International ?Trade', text):
            return 'cit'
        elif 'United States Customs Court' in text:
            return 'cusc'  # Cannot change?
        elif re.search(r'Special Court(\.|,)? Regional Rail Reorganization Act',
                       text):
            return 'reglrailreorgct'
        elif re.search(r'Military Commission Review', text):
            return 'mc'

        # Bankruptcy Courts
        elif re.search(r'bankrup?tcy', text, re.I):
            # Bankruptcy Appellate Panels
            if re.search(r'Appellan?te Panel', text, re.I):
                if 'First Circuit' in text:
                    return 'bap1'
                elif 'Second Circuit' in text:
                    return 'bap2'
                elif 'Sixth Circuit' in text:
                    return 'bap6'
                elif 'Eighth Circuit' in text:
                    return 'bap8'
                elif 'Ninth Circuit' in text:
                    return 'bap9'
                elif 'Tenth Circuit' in text:
                    return 'bap10'
                elif 'Maine' in text:
                    return 'bapme'
                elif 'Massachusetts' in text:
                    return 'bapma'

            # Bankruptcy District Courts
            else:
                for regex, value in fb_pairs:
                    if re.search(regex, text):
                        return value
        else:
            return False

    path = '//center/p/b/text()'
    text_elements = html.xpath(path)
    court = None

    # 1: try using the citations as a clue (necessary first because calctapp
    # calls itself simply, "Court of Appeal, Second District")
    if citations:
        reporter_keys = [citation.canonical_reporter for citation in citations]
        if 'Cal. Rptr.' in reporter_keys or 'Cal. App.' in reporter_keys:
            # It's a california court, but which?
            for text_element in text_elements:
                text_element = clean_string(text_element).strip('.')
                if re.search('court of appeal', text_element, re.I):
                    court = 'calctapp'
                else:
                    court = 'cal'
        elif 'U.S.' in reporter_keys:
            court = 'scotus'

    # 2: Try using a bunch of regular expressions (this catches 95% of items)
    if not court:
        for text_element in text_elements:
            text_element = clean_string(text_element).strip('.')
            court = string_to_key(text_element)
            if court:
                break

    # 3: try the text elements joined together (works if there were line
    # break problems)
    if not court:
        t = clean_string(' '.join(text_elements)).strip('.')
        court = string_to_key(t)

    # 4: Disambiguate by judge
    if not court and judge:
        court = disambiguate_by_judge(judge)
        if court and 'log_judge_disambiguations' in DEBUG:
            with open('disambiguated_by_judge.txt', 'a') as f:
                f.write('%s\t%s\t%s\n' %
                        (case_path, court, judge.encode('ISO-8859-1')))

    # 5: give up.
    if not court:
        try:
            court = fixes[case_path]['court']
        except KeyError:
            if 'input_court' in DEBUG:
                if 'firefox' in DEBUG:
                    subprocess.Popen(
                        ['firefox', 'file://%s' % case_path],
                        shell=False).communicate()
                court = raw_input("No court identified! What should be here? ")
                # BUG FIX: previously saved the builtin `input` function
                # instead of the operator-supplied court string.
                add_fix(case_path, {'court': court})
            if 'log_bad_courts' in DEBUG:
                # Write the failed case out to file.
                court = 'test'
                with open('missing_courts.txt', 'a') as out:
                    out.write('%s\n' % case_path)

    if 'court' in DEBUG:
        log_print('  Court: %s' % court)

    return court
Esempio n. 54
0
 def sanitize_text(self, text):
     """Prevent non-standard characters and typos from breaking regex.

     Dashes are normalized first, then general cleanup is applied, and
     finally known court/year id typos are repaired.
     """
     normalized = normalize_dashes(text)
     cleaned = clean_string(normalized)
     return self.fix_court_year_id_typo(cleaned)
Esempio n. 55
0
    def _get_case_name_and_status(self):
        case_name = self.url_element.get('title').lower()
        ca1regex = re.compile(
            '(unpublished disposition )?notice: first circuit local rule 36.2\(b\)6 states unpublished opinions may be cited only in related cases.?'
        )
        ca2regex = re.compile(
            '(unpublished disposition )?notice: second circuit local rule 0.23 states unreported opinions shall not be cited or otherwise used in unrelated cases.?'
        )
        ca2regex2 = re.compile(
            '(unpublished disposition )?notice: this summary order may not be cited as precedential authority, but may be called to the attention of the court in a subsequent stage of this case, in a related case, or in any case for purposes of collateral estoppel or res judicata. see second circuit rule 0.23.?'
        )
        ca3regex = re.compile(
            '(unpublished disposition )?notice: third circuit rule 21\(i\) states citations to federal decisions which have not been formally reported should identify the court, docket number and date.?'
        )
        ca4regex = re.compile(
            '(unpublished disposition )?notice: fourth circuit (local rule 36\(c\)|i.o.p. 36.6) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the fourth circuit.?'
        )
        ca5regex = re.compile(
            '(unpublished disposition )?notice: fifth circuit local rule 47.5.3 states that unpublished opinions should normally be cited only when they establish the law of the case, are relied upon as a basis for res judicata or collateral estoppel, or involve related facts. if an unpublished opinion is cited, a copy shall be attached to each copy of the brief.?'
        )
        ca6regex = re.compile(
            '(unpublished disposition )?notice: sixth circuit rule 24\(c\) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the sixth circuit.?'
        )
        ca7regex = re.compile(
            '(unpublished disposition )?notice: seventh circuit rule 53\(b\)\(2\) states unpublished orders shall not be cited or used as precedent except to support a claim of res judicata, collateral estoppel or law of the case in any federal court within the circuit.?'
        )
        ca8regex = re.compile(
            '(unpublished disposition )?notice: eighth circuit rule 28a\(k\) governs citation of unpublished opinions and provides that (no party may cite an opinion not intended for publication unless the cases are related by identity between the parties or the causes of action|they are not precedent and generally should not be cited unless relevant to establishing the doctrines of res judicata, collateral estoppel, the law of the case, or if the opinion has persuasive value on a material issue and no published opinion would serve as well).?'
        )
        ca9regex = re.compile(
            '(unpublished disposition )?notice: ninth circuit rule 36-3 provides that dispositions other than opinions or orders designated for publication are not precedential and should not be cited except when relevant under the doctrines of law of the case, res judicata, or collateral estoppel.?'
        )
        ca10regex = re.compile(
            '(unpublished disposition )?notice: tenth circuit rule 36.3 states that unpublished opinions and orders and judgments have no precedential value and shall not be cited except for purposes of establishing the doctrines of the law of the case, res judicata, or collateral estoppel.?'
        )
        cadcregex = re.compile(
            '(unpublished disposition )?notice: d.c. circuit local rule 11\(c\) states that unpublished orders, judgments, and explanatory memoranda may not be cited as precedents, but counsel may refer to unpublished dispositions when the binding or preclusive effect of the disposition, rather than its quality as precedent, is relevant.?'
        )
        cafcregex = re.compile(
            '(unpublished disposition )?notice: federal circuit local rule 47.(6|8)\(b\) states that opinions and orders which are designated as not citable as precedent shall not be employed or cited as precedent. this does not preclude assertion of issues of claim preclusion, issue preclusion, judicial estoppel, law of the case or the like based on a decision of the court rendered in a nonprecedential opinion or order.?'
        )
        # Clean off special cases
        if 'first circuit' in case_name:
            case_name = re.sub(ca1regex, '', case_name)
            status = 'Unpublished'
        elif 'second circuit' in case_name:
            case_name = re.sub(ca2regex, '', case_name)
            case_name = re.sub(ca2regex2, '', case_name)
            status = 'Unpublished'
        elif 'third circuit' in case_name:
            case_name = re.sub(ca3regex, '', case_name)
            status = 'Unpublished'
        elif 'fourth circuit' in case_name:
            case_name = re.sub(ca4regex, '', case_name)
            status = 'Unpublished'
        elif 'fifth circuit' in case_name:
            case_name = re.sub(ca5regex, '', case_name)
            status = 'Unpublished'
        elif 'sixth circuit' in case_name:
            case_name = re.sub(ca6regex, '', case_name)
            status = 'Unpublished'
        elif 'seventh circuit' in case_name:
            case_name = re.sub(ca7regex, '', case_name)
            status = 'Unpublished'
        elif 'eighth circuit' in case_name:
            case_name = re.sub(ca8regex, '', case_name)
            status = 'Unpublished'
        elif 'ninth circuit' in case_name:
            case_name = re.sub(ca9regex, '', case_name)
            status = 'Unpublished'
        elif 'tenth circuit' in case_name:
            case_name = re.sub(ca10regex, '', case_name)
            status = 'Unpublished'
        elif 'd.c. circuit' in case_name:
            case_name = re.sub(cadcregex, '', case_name)
            status = 'Unpublished'
        elif 'federal circuit' in case_name:
            case_name = re.sub(cafcregex, '', case_name)
            status = 'Unpublished'
        else:
            status = 'Published'

        case_name = titlecase(harmonize(clean_string(case_name)))

        if case_name == '' or case_name == 'unpublished disposition':
            # No luck getting the case name
            saved_case_name = self._check_fix_list(self.sha1_hash,
                                                   self.case_name_dict)
            if saved_case_name:
                case_name = saved_case_name
            else:
                print self.url
                if BROWSER:
                    subprocess.Popen([BROWSER, self.url],
                                     shell=False).communicate()
                case_name = raw_input("Short case name: ")
                self.case_name_fix_file.write("%s|%s\n" %
                                              (self.sha1_hash, case_name))

        return case_name, status
Esempio n. 56
0
    def _get_case_name_and_status(self):
        case_name = self.url_element.get('title').lower()
        ca1regex = re.compile('(unpublished disposition )?notice: first circuit local rule 36.2\(b\)6 states unpublished opinions may be cited only in related cases.?')
        ca2regex = re.compile('(unpublished disposition )?notice: second circuit local rule 0.23 states unreported opinions shall not be cited or otherwise used in unrelated cases.?')
        ca2regex2 = re.compile('(unpublished disposition )?notice: this summary order may not be cited as precedential authority, but may be called to the attention of the court in a subsequent stage of this case, in a related case, or in any case for purposes of collateral estoppel or res judicata. see second circuit rule 0.23.?')
        ca3regex = re.compile('(unpublished disposition )?notice: third circuit rule 21\(i\) states citations to federal decisions which have not been formally reported should identify the court, docket number and date.?')
        ca4regex = re.compile('(unpublished disposition )?notice: fourth circuit (local rule 36\(c\)|i.o.p. 36.6) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the fourth circuit.?')
        ca5regex = re.compile('(unpublished disposition )?notice: fifth circuit local rule 47.5.3 states that unpublished opinions should normally be cited only when they establish the law of the case, are relied upon as a basis for res judicata or collateral estoppel, or involve related facts. if an unpublished opinion is cited, a copy shall be attached to each copy of the brief.?')
        ca6regex = re.compile('(unpublished disposition )?notice: sixth circuit rule 24\(c\) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the sixth circuit.?')
        ca7regex = re.compile('(unpublished disposition )?notice: seventh circuit rule 53\(b\)\(2\) states unpublished orders shall not be cited or used as precedent except to support a claim of res judicata, collateral estoppel or law of the case in any federal court within the circuit.?')
        ca8regex = re.compile('(unpublished disposition )?notice: eighth circuit rule 28a\(k\) governs citation of unpublished opinions and provides that (no party may cite an opinion not intended for publication unless the cases are related by identity between the parties or the causes of action|they are not precedent and generally should not be cited unless relevant to establishing the doctrines of res judicata, collateral estoppel, the law of the case, or if the opinion has persuasive value on a material issue and no published opinion would serve as well).?')
        ca9regex = re.compile('(unpublished disposition )?notice: ninth circuit rule 36-3 provides that dispositions other than opinions or orders designated for publication are not precedential and should not be cited except when relevant under the doctrines of law of the case, res judicata, or collateral estoppel.?')
        ca10regex = re.compile('(unpublished disposition )?notice: tenth circuit rule 36.3 states that unpublished opinions and orders and judgments have no precedential value and shall not be cited except for purposes of establishing the doctrines of the law of the case, res judicata, or collateral estoppel.?')
        cadcregex = re.compile('(unpublished disposition )?notice: d.c. circuit local rule 11\(c\) states that unpublished orders, judgments, and explanatory memoranda may not be cited as precedents, but counsel may refer to unpublished dispositions when the binding or preclusive effect of the disposition, rather than its quality as precedent, is relevant.?')
        cafcregex = re.compile('(unpublished disposition )?notice: federal circuit local rule 47.(6|8)\(b\) states that opinions and orders which are designated as not citable as precedent shall not be employed or cited as precedent. this does not preclude assertion of issues of claim preclusion, issue preclusion, judicial estoppel, law of the case or the like based on a decision of the court rendered in a nonprecedential opinion or order.?')
        # Clean off special cases
        if 'first circuit' in case_name:
            case_name = re.sub(ca1regex, '', case_name)
            status = 'Unpublished'
        elif 'second circuit' in case_name:
            case_name = re.sub(ca2regex, '', case_name)
            case_name = re.sub(ca2regex2, '', case_name)
            status = 'Unpublished'
        elif 'third circuit' in case_name:
            case_name = re.sub(ca3regex, '', case_name)
            status = 'Unpublished'
        elif 'fourth circuit' in case_name:
            case_name = re.sub(ca4regex, '', case_name)
            status = 'Unpublished'
        elif 'fifth circuit' in case_name:
            case_name = re.sub(ca5regex, '', case_name)
            status = 'Unpublished'
        elif 'sixth circuit' in case_name:
            case_name = re.sub(ca6regex, '', case_name)
            status = 'Unpublished'
        elif 'seventh circuit' in case_name:
            case_name = re.sub(ca7regex, '', case_name)
            status = 'Unpublished'
        elif 'eighth circuit' in case_name:
            case_name = re.sub(ca8regex, '', case_name)
            status = 'Unpublished'
        elif 'ninth circuit' in case_name:
            case_name = re.sub(ca9regex, '', case_name)
            status = 'Unpublished'
        elif 'tenth circuit' in case_name:
            case_name = re.sub(ca10regex, '', case_name)
            status = 'Unpublished'
        elif 'd.c. circuit' in case_name:
            case_name = re.sub(cadcregex, '', case_name)
            status = 'Unpublished'
        elif 'federal circuit' in case_name:
            case_name = re.sub(cafcregex, '', case_name)
            status = 'Unpublished'
        else:
            status = 'Published'

        case_name = titlecase(harmonize(clean_string(case_name)))

        if case_name == '' or case_name == 'unpublished disposition':
            # No luck getting the case name
            saved_case_name = self._check_fix_list(self.sha1_hash, self.case_name_dict)
            if saved_case_name:
                case_name = saved_case_name
            else:
                print self.url
                if BROWSER:
                    subprocess.Popen([BROWSER, self.url], shell=False).communicate()
                case_name = raw_input("Short case name: ")
                self.case_name_fix_file.write("%s|%s\n" % (self.sha1_hash, case_name))

        return case_name, status
Esempio n. 57
0
 def _get_case_dates(self):
     """Return the filing date for each row of the results table."""
     path = '//table/tbody/tr/td[1]/text()'
     return [
         date.fromtimestamp(
             time.mktime(
                 time.strptime(clean_string(raw_date), '%m/%d/%Y')))
         for raw_date in self.html.xpath(path)
     ]