def get_citations(text, html=True, do_post_citation=True, do_defendant=True):
    """Find the citations in ``text`` and return them as a list.

    :param text: The text to search. When ``html`` is true it is first passed
        through ``get_visible_text``.
    :param html: Whether ``text`` should be run through ``get_visible_text``
        before tokenizing.
    :param do_post_citation: When true, ``add_post_citation`` is called on
        each base citation that is found.
    :param do_defendant: When true, ``add_defendant`` is called on each base
        citation that is found.
    :return: The citations kept by ``disambiguate_reporters``. Any of them
        with no court set, and for which ``is_scotus_reporter`` is true, gets
        ``court = 'scotus'``.
    """
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Exclude first and last tokens when looking for reporters, because valid
    # citations must have a volume before and a page number after the reporter.
    for i in xrange(1, len(words) - 1):
        # Find reporter. Test membership directly against the two tables
        # (hash lookups) instead of building a combined key list per token.
        if words[i] in EDITIONS or words[i] in VARIATIONS_ONLY:
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            if do_post_citation:
                add_post_citation(citation, words, i)
            if do_defendant:
                add_defendant(citation, words, i)
            citations.append(citation)

    # Disambiguate or drop all the reporters
    citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = 'scotus'

    return citations
Exemple #2
0
def get_citations(text, html=True, do_post_citation=True, do_defendant=True):
    """Find the citations in ``text`` and return them as a list.

    :param text: The text to search. When ``html`` is true it is first passed
        through ``get_visible_text``.
    :param html: Whether ``text`` should be run through ``get_visible_text``
        before tokenizing.
    :param do_post_citation: When true, ``add_post_citation`` is called on
        each base citation that is found.
    :param do_defendant: When true, ``add_defendant`` is called on each base
        citation that is found.
    :return: The citations kept by ``disambiguate_reporters``. Any of them
        with no court set, and for which ``is_scotus_reporter`` is true, gets
        ``court = 'scotus'``.
    """
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Exclude first and last tokens when looking for reporters, because valid
    # citations must have a volume before and a page number after the reporter.
    for i in xrange(1, len(words) - 1):
        # Find reporter. Test membership directly against the two tables
        # (hash lookups) instead of building a combined key list per token.
        if words[i] in EDITIONS or words[i] in VARIATIONS_ONLY:
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            if do_post_citation:
                add_post_citation(citation, words, i)
            if do_defendant:
                add_defendant(citation, words, i)
            citations.append(citation)

    # Disambiguate or drop all the reporters
    citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = 'scotus'

    return citations
Exemple #3
0
    def test_for_variations_mapping_to_bad_keys(self):
        """Check that every variation target resolves to an existing reporter.

        Each entry listed in VARIATIONS_ONLY is looked up in EDITIONS, and the
        result must be one of the keys of REPORTERS.
        """
        for targets in VARIATIONS_ONLY.values():
            for target in targets:
                # Look the target up first so a missing EDITIONS entry
                # surfaces as a KeyError before anything else is evaluated.
                canonical = EDITIONS[target]
                failure = "Could not map variation to a valid reporter: %s" % target
                self.assertIn(canonical, REPORTERS.keys(), msg=failure)
Exemple #4
0
    def test_for_variations_mapping_to_bad_keys(self):
        """Check that every variation target resolves to an existing reporter.

        Each entry listed in VARIATIONS_ONLY is looked up in EDITIONS, and the
        result must be one of the keys of REPORTERS.
        """
        for targets in VARIATIONS_ONLY.values():
            for target in targets:
                # Look the target up first so a missing EDITIONS entry
                # surfaces as a KeyError before anything else is evaluated.
                canonical = EDITIONS[target]
                failure = "Could not map variation to a valid reporter: %s" % target
                self.assertIn(canonical, REPORTERS.keys(), msg=failure)
def normalize_variation(string):
    """Return a canonical reporter string for a variant spelling.

    Variations map to lists of one or more candidates. At present the first
    candidate is always returned; choosing between several candidates (for
    example by the year of the item) is not implemented -- see the TODO below.

    :param string: The reporter string to normalize.
    :return: The first entry of ``VARIATIONS_ONLY[string]`` when ``string`` is
        a key of VARIATIONS_ONLY, otherwise ``string`` unchanged.
    """
    if string not in VARIATIONS_ONLY:
        # Not a variant
        return string

    # Both the simple case (one candidate) and the hard case (several
    # candidates) currently resolve to the first candidate.
    # TODO: This must be fixed or else all resolutions are resolved the same way --> BAD!
    #       Once fixed, it will probably need to be removed from the tokenizer, and moved
    #       down the pipeline.
    return VARIATIONS_ONLY[string][0]
Exemple #6
0
def normalize_variation(string):
    """Return a canonical reporter string for a variant spelling.

    Variations map to lists of one or more candidates. At present the first
    candidate is always returned; choosing between several candidates (for
    example by the year of the item) is not implemented -- see the TODO below.

    :param string: The reporter string to normalize.
    :return: The first entry of ``VARIATIONS_ONLY[string]`` when ``string`` is
        a key of VARIATIONS_ONLY, otherwise ``string`` unchanged.
    """
    if string not in VARIATIONS_ONLY:
        # Not a variant
        return string

    # Both the simple case (one candidate) and the hard case (several
    # candidates) currently resolve to the first candidate.
    # TODO: This must be fixed or else all resolutions are resolved the same way --> BAD!
    #       Once fixed, it will probably need to be removed from the tokenizer, and moved
    #       down the pipeline.
    return VARIATIONS_ONLY[string][0]
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
         -Split the text by the occurrences of patterns which match a
          reporter (REPORTER_RE), including the reporter strings as part of
          the resulting list.
         -Append each piece that is a key of EDITIONS or VARIATIONS_ONLY
          unchanged, as a single token.
         -Pass every other piece to _tokenize (simple whitespace
          tokenization, per the original description) and add its tokens.

       Reporter strings are not normalized here; they appear in the output
       exactly as they were matched.

       Example (illustrative; assumes 'U. S.' is a known reporter or
       variation and that _tokenize splits on whitespace):
           tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
           -> ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.', '113', '(1973)']
    """
    strings = REPORTER_RE.split(text)
    words = []
    for string in strings:
        # Hash lookups against the two tables; avoids building a combined
        # key list for every piece of the split.
        if string in EDITIONS or string in VARIATIONS_ONLY:
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
Exemple #8
0
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
         -Split the text by the occurrences of patterns which match a
          reporter (REPORTER_RE), including the reporter strings as part of
          the resulting list.
         -Append each piece that is a key of EDITIONS or VARIATIONS_ONLY
          unchanged, as a single token.
         -Pass every other piece to _tokenize (simple whitespace
          tokenization, per the original description) and add its tokens.

       Reporter strings are not normalized here; they appear in the output
       exactly as they were matched.

       Example (illustrative; assumes 'U. S.' is a known reporter or
       variation and that _tokenize splits on whitespace):
           tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
           -> ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.', '113', '(1973)']
    """
    strings = REPORTER_RE.split(text)
    words = []
    for string in strings:
        # Hash lookups against the two tables; avoids building a combined
        # key list for every piece of the split.
        if string in EDITIONS or string in VARIATIONS_ONLY:
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
#!/usr/bin/env python
# encoding: utf-8

# Loosely adapted from the Natural Language Toolkit: Tokenizers
# URL: <http://nltk.sourceforge.net>

import re
from alert.citations.constants import EDITIONS, VARIATIONS_ONLY

# Build one alternation containing every reporter and every variation, ordered
# from longest to shortest so that a longer reporter string is tried before any
# shorter alternative.
REGEX_LIST = sorted(EDITIONS.keys() + VARIATIONS_ONLY.keys(),
                    key=len, reverse=True)
REGEX_STR = '|'.join(re.escape(reporter) for reporter in REGEX_LIST)
# The whole alternation is one capturing group.
REPORTER_RE = re.compile("(%s)" % REGEX_STR)


def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a reporter.

    Variations map to lists of one or more result, and we need to figure out which is best. Usually, this can be
    accomplished using the year of the item.
    """
    if string in VARIATIONS_ONLY.keys():
        if len(VARIATIONS_ONLY[string]) == 1:
            # Simple case
            return VARIATIONS_ONLY[string][0]
        else:
            # Hard case, resolve the variation or return as is.
            # TODO: This must be fixed or else all resolutions are resolved the same way --> BAD!
            #       Once fixed, it will probably need to be removed from the tokenizer, and moved
            #       down the pipeline.
Exemple #10
0
def disambiguate_reporters(citations):
    """Reduce a list of citations to those whose reporter is unambiguous.

    For each citation that can be resolved, these attributes are filled in:
     - citation.canonical_reporter
     - citation.lookup_index

    Sources of ambiguity:
     - The cited string is a variation that maps to more than one edition.
     - More than one reporter is listed under the key in REPORTERS.
     - Any combination of the above.

    Resolution is attempted by year (through is_date_in_reporter) and, for a
    citation with a single variation, by checking which reporter lists the
    cited spelling among its variations.

    A citation that cannot be narrowed down to exactly one reporter is dropped
    from the returned list.
    """
    resolved = []
    for citation in citations:
        # Remember the string as cited; citation.reporter may be rewritten below.
        cited_as = citation.reporter

        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(cited_as)) is not None:
            canonical = EDITIONS[cited_as]
            citation.canonical_reporter = canonical
            books = REPORTERS[canonical]
            if len(books) == 1:
                # Only one reporter under this key -- nothing to decide.
                citation.lookup_index = 0
                resolved.append(citation)
            elif citation.year:
                # Several books share this key; keep those matching the year.
                by_year = [
                    idx for idx, book in enumerate(books)
                    if is_date_in_reporter(book['editions'], citation.year)
                ]
                if len(by_year) == 1:
                    # Exactly one book survived the year filter.
                    citation.reporter = cited_as
                    citation.lookup_index = by_year[0]
                    resolved.append(citation)
            continue

        # Try doing a variation of an edition.
        if VARIATIONS_ONLY.get(cited_as) is None:
            continue
        targets = VARIATIONS_ONLY[cited_as]

        if len(targets) != 1:
            # Multiple variations: collect every (edition key, index) pair
            # accepted by is_date_in_reporter, whatever the number of
            # reporters under each key.
            # NOTE(review): unlike the other year-based paths, citation.year
            # is not checked for truthiness before use here -- confirm that
            # is_date_in_reporter copes with a missing year.
            by_year = [
                (reporter_key, idx)
                for reporter_key in targets
                for idx, book in enumerate(REPORTERS[EDITIONS[reporter_key]])
                if is_date_in_reporter(book['editions'], citation.year)
            ]
            if len(by_year) == 1:
                # Exactly one candidate survived the year filter.
                reporter_key, idx = by_year[0]
                citation.canonical_reporter = EDITIONS[reporter_key]
                citation.reporter = reporter_key
                citation.lookup_index = idx
                resolved.append(citation)
            continue

        # Only one variation -- great, use it.
        citation.canonical_reporter = EDITIONS[targets[0]]
        citation.reporter = targets[0]
        books = REPORTERS[citation.canonical_reporter]
        if len(books) == 1:
            # It's a single reporter under a misspelled key.
            citation.lookup_index = 0
            resolved.append(citation)
            continue

        # Multiple reporters under a single misspelled key (e.g. Wn.2d --> Wash --> Va Reports, Wash or
        #                                                   Washington Reports).
        if citation.year:
            # First attempt: filter the candidate books by year.
            by_year = [
                idx for idx, book in enumerate(books)
                if is_date_in_reporter(book['editions'], citation.year)
            ]
            if len(by_year) == 1:
                citation.lookup_index = by_year[0]
                resolved.append(citation)
                continue

        # Second attempt: resolution by unique variation (e.g. Cr. can only be Cranch[0]).
        by_spelling = [
            idx
            for idx, book in enumerate(books)
            for variant, _target in book['variations'].items()
            if variant == cited_as
        ]
        if len(by_spelling) == 1:
            # A single book lists the cited spelling among its variations.
            citation.lookup_index = by_spelling[0]
            resolved.append(citation)

    return resolved
Exemple #11
0
#!/usr/bin/env python
# encoding: utf-8

# Loosely adapted from the Natural Language Toolkit: Tokenizers
# URL: <http://nltk.sourceforge.net>

import re
from alert.citations.constants import EDITIONS, VARIATIONS_ONLY

# Build one alternation containing every reporter and every variation, ordered
# from longest to shortest so that a longer reporter string is tried before any
# shorter alternative.
REGEX_LIST = sorted(EDITIONS.keys() + VARIATIONS_ONLY.keys(),
                    key=len, reverse=True)
REGEX_STR = '|'.join(re.escape(reporter) for reporter in REGEX_LIST)
# The whole alternation is one capturing group.
REPORTER_RE = re.compile("(%s)" % REGEX_STR)


def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a reporter.

    Variations map to lists of one or more result, and we need to figure out which is best. Usually, this can be
    accomplished using the year of the item.
    """
    if string in VARIATIONS_ONLY.keys():
        if len(VARIATIONS_ONLY[string]) == 1:
            # Simple case
            return VARIATIONS_ONLY[string][0]
        else:
            # Hard case, resolve the variation or return as is.
            # TODO: This must be fixed or else all resolutions are resolved the same way --> BAD!
            #       Once fixed, it will probably need to be removed from the tokenizer, and moved
            #       down the pipeline.
def disambiguate_reporters(citations):
    """Reduce a list of citations to those whose reporter is unambiguous.

    For each citation that can be resolved, these attributes are filled in:
     - citation.canonical_reporter
     - citation.lookup_index

    Sources of ambiguity:
     - The cited string is a variation that maps to more than one edition.
     - More than one reporter is listed under the key in REPORTERS.
     - Any combination of the above.

    Resolution is attempted by year (through is_date_in_reporter) and, for a
    citation with a single variation, by checking which reporter lists the
    cited spelling among its variations.

    A citation that cannot be narrowed down to exactly one reporter is dropped
    from the returned list.
    """
    resolved = []
    for citation in citations:
        # Remember the string as cited; citation.reporter may be rewritten below.
        cited_as = citation.reporter

        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(cited_as)) is not None:
            canonical = EDITIONS[cited_as]
            citation.canonical_reporter = canonical
            books = REPORTERS[canonical]
            if len(books) == 1:
                # Only one reporter under this key -- nothing to decide.
                citation.lookup_index = 0
                resolved.append(citation)
            elif citation.year:
                # Several books share this key; keep those matching the year.
                by_year = [
                    idx for idx, book in enumerate(books)
                    if is_date_in_reporter(book['editions'], citation.year)
                ]
                if len(by_year) == 1:
                    # Exactly one book survived the year filter.
                    citation.reporter = cited_as
                    citation.lookup_index = by_year[0]
                    resolved.append(citation)
            continue

        # Try doing a variation of an edition.
        if VARIATIONS_ONLY.get(cited_as) is None:
            continue
        targets = VARIATIONS_ONLY[cited_as]

        if len(targets) != 1:
            # Multiple variations: collect every (edition key, index) pair
            # accepted by is_date_in_reporter, whatever the number of
            # reporters under each key.
            # NOTE(review): unlike the other year-based paths, citation.year
            # is not checked for truthiness before use here -- confirm that
            # is_date_in_reporter copes with a missing year.
            by_year = [
                (reporter_key, idx)
                for reporter_key in targets
                for idx, book in enumerate(REPORTERS[EDITIONS[reporter_key]])
                if is_date_in_reporter(book['editions'], citation.year)
            ]
            if len(by_year) == 1:
                # Exactly one candidate survived the year filter.
                reporter_key, idx = by_year[0]
                citation.canonical_reporter = EDITIONS[reporter_key]
                citation.reporter = reporter_key
                citation.lookup_index = idx
                resolved.append(citation)
            continue

        # Only one variation -- great, use it.
        citation.canonical_reporter = EDITIONS[targets[0]]
        citation.reporter = targets[0]
        books = REPORTERS[citation.canonical_reporter]
        if len(books) == 1:
            # It's a single reporter under a misspelled key.
            citation.lookup_index = 0
            resolved.append(citation)
            continue

        # Multiple reporters under a single misspelled key (e.g. Wn.2d --> Wash --> Va Reports, Wash or
        #                                                   Washington Reports).
        if citation.year:
            # First attempt: filter the candidate books by year.
            by_year = [
                idx for idx, book in enumerate(books)
                if is_date_in_reporter(book['editions'], citation.year)
            ]
            if len(by_year) == 1:
                citation.lookup_index = by_year[0]
                resolved.append(citation)
                continue

        # Second attempt: resolution by unique variation (e.g. Cr. can only be Cranch[0]).
        by_spelling = [
            idx
            for idx, book in enumerate(books)
            for variant, _target in book['variations'].items()
            if variant == cited_as
        ]
        if len(by_spelling) == 1:
            # A single book lists the cited spelling among its variations.
            citation.lookup_index = by_spelling[0]
            resolved.append(citation)

    return resolved