def extract_base_citation(words, reporter_index):
    """Construct and return a citation object from a list of "words".

    Given a list of words and the index of a federal reporter, look at the
    word before the reporter for a volume and the word after it for a page.
    If both are found, construct and return a Citation object.

    Args:
        words: List of word tokens.
        reporter_index: Index in ``words`` of the reporter token.

    Returns:
        A Citation built from the reporter, page and volume, or None when
        the reporter has no neighbouring word on both sides, the volume is
        not numeric, or the page is not in a recognised format.
    """
    # The reporter needs a word on each side. Without this guard an index of
    # 0 would silently read the *last* word as the volume (negative
    # indexing), and a reporter in final position would raise IndexError.
    if reporter_index < 1 or reporter_index + 1 >= len(words):
        return None

    volume = strip_punct(words[reporter_index - 1])
    if not volume.isdigit():
        # No volume, therefore not a valid citation
        return None
    volume = int(volume)

    page = strip_punct(words[reporter_index + 1])
    if page.isdigit():
        # Most page numbers will be digits.
        page = int(page)
    elif isroman(page):
        # Some places like Nebraska have Roman numerals, e.g. in
        # '250 Neb. xxiv (1996)'. No processing needed.
        pass
    elif re.match(r"\d{1,6}[-]?[a-zA-Z]{1,6}", page):
        # Some places, like Connecticut, have pages like "13301-M".
        # Other places, like Illinois have "pages" like "110311-B".
        pass
    else:
        # Not Roman, and not a weird connecticut page number.
        return None

    reporter = words[reporter_index]
    return Citation(reporter, page, volume, reporter_found=reporter,
                    reporter_index=reporter_index)
# Example #2
# 0
def parse_page(page):
    """Normalize a raw page token from a citation.

    Punctuation is removed with ``strip_punct`` first. A token made up only
    of digits is returned as an ``int``. Any other token is tried against a
    series of known page formats, in order; the matched text of the first
    format that fits is returned, or ``None`` if none fits.
    """
    cleaned = strip_punct(page)

    # The common case: a plain number.
    if cleaned.isdigit():
        return int(cleaned)

    # Fallback formats, most likely first. Each entry is evaluated lazily so
    # later checks only run when the earlier ones fail.
    # NOTE(review): isroman is used as if it returns a match object (or a
    # falsy value); confirm against its definition.
    attempts = (
        # 1) A numerical page range. E.g., "123-124"
        lambda: re.match(r"\d{1,6}-\d{1,6}", cleaned),
        # 2) A roman numeral. E.g., "250 Neb. xxiv (1996)"
        lambda: isroman(cleaned),
        # 3) A special Connecticut or Illinois number. E.g., "13301-M"
        lambda: re.match(r"\d{1,6}[-]?[a-zA-Z]{1,6}", cleaned),
        # 4) A page with a weird suffix. E.g., "559 N.W.2d 826|N.D."
        lambda: re.match(r"\d{1,6}", cleaned),
        # 5) A page with a ¶ symbol, star, and/or colon. E.g., "¶ 119:12-14"
        lambda: re.match(r"[*\u00b6\ ]*[0-9:\-]+", cleaned),
    )
    for attempt in attempts:
        found = attempt()
        if found:
            return found.group(0)
    return None
def extract_base_citation(words, reporter_index):
    """Construct and return a citation object from a list of "words".

    Given a list of words and the index of a federal reporter, look at the
    word before the reporter for a volume and the word after it for a page.
    If both are found, construct and return a Citation object.

    Args:
        words: List of word tokens.
        reporter_index: Index in ``words`` of the reporter token.

    Returns:
        A Citation built from the reporter, page and volume, or None when
        the reporter has no neighbouring word on both sides, the volume is
        not numeric, or the page is not in a recognised format.
    """
    # A reporter at index 0 has no volume before it (index -1 would wrap
    # around to the last word), and a reporter in last position has no page
    # after it (indexing would raise IndexError). Neither is a citation.
    if reporter_index < 1 or reporter_index + 1 >= len(words):
        return None

    volume = strip_punct(words[reporter_index - 1])
    if not volume.isdigit():
        # No volume, therefore not a valid citation
        return None
    volume = int(volume)

    page = strip_punct(words[reporter_index + 1])
    if page.isdigit():
        # Most page numbers will be digits.
        page = int(page)
    elif isroman(page):
        # Some places like Nebraska have Roman numerals, e.g. in
        # '250 Neb. xxiv (1996)'. No processing needed.
        pass
    elif re.match(r"\d{1,6}[-]?[a-zA-Z]{1,6}", page):
        # Some places, like Connecticut, have pages like "13301-M".
        # Up to six leading digits are allowed so that six-digit "pages"
        # such as "110311-B" are accepted as well.
        pass
    else:
        # Not Roman, and not a weird connecticut page number.
        return None

    reporter = words[reporter_index]
    return Citation(reporter, page, volume, reporter_found=reporter,
                    reporter_index=reporter_index)
# Example #4
# 0
def extract_base_citation(words, reporter_index):
    """Construct and return a citation object from a list of "words".

    Given a list of words and the index of a federal reporter, look before and
    after for volume and page.  If found, construct and return a
    Citation object.

    If we are given neutral, tax court opinions we treat them differently.
    The formats often follow {REPORTER} {YEAR}-{ITERATIVE_NUMBER}
    ex. T.C. Memo. 2019-13

    Args:
        words: List of word tokens.
        reporter_index: Index in ``words`` of the reporter token.

    Returns:
        A Citation built from the reporter, page and volume, or None when
        the surrounding words are missing or do not form a valid volume and
        page.
    """
    # Both formats read the word after the reporter, so bail out early when
    # there is none instead of raising IndexError.
    if reporter_index < 0 or reporter_index + 1 >= len(words):
        return None

    reporter = words[reporter_index]
    if is_neutral_tc_reporter(reporter):
        # Neutral citation: volume and page share the following word, e.g.
        # "2019-13". Normalize an en dash to a hyphen on the text itself;
        # encoding to bytes first would make the str-based replace() raise
        # TypeError on Python 3.
        parts = words[reporter_index + 1].replace("–", "-").split("-")
        if len(parts) != 2:
            # Not exactly {YEAR}-{NUMBER}; unpacking would raise ValueError.
            return None
        volume, page = parts
    else:
        # "Normal" reporter: XX F.2d YY
        if reporter_index == 0:
            # Nothing precedes the reporter, so there is no volume; index -1
            # would wrap around to the last word.
            return None
        volume = strip_punct(words[reporter_index - 1])
        page = strip_punct(words[reporter_index + 1])

    # Normalize volume and page
    if not volume.isdigit():
        # No volume, therefore not a valid citation
        return None
    volume = int(volume)

    if page.isdigit():
        # Most page numbers will be digits.
        page = int(page)
    elif isroman(page):
        # Some places like Nebraska have Roman numerals, e.g. in
        # '250 Neb. xxiv (1996)'. No processing needed.
        pass
    elif re.match(r"\d{1,6}[-]?[a-zA-Z]{1,6}", page):
        # Some places, like Connecticut, have pages like "13301-M".
        # Other places, like Illinois have "pages" like "110311-B".
        pass
    else:
        # Not Roman, and not a weird connecticut page number. Thus a bad
        # value. Abort.
        return None

    return Citation(
        reporter,
        page,
        volume,
        reporter_found=reporter,
        reporter_index=reporter_index,
    )