Example #1
0
class FinancialDisclosure(models.Model):
    """A simple table to hold references to financial disclosure forms"""
    # Lifecycle states for generating a thumbnail of the form's first page.
    THUMBNAIL_NEEDED = 0
    THUMBNAIL_COMPLETE = 1
    THUMBNAIL_FAILED = 2
    THUMBNAIL_STATUSES = (
        (THUMBNAIL_NEEDED, "Thumbnail needed"),
        (THUMBNAIL_COMPLETE, "Thumbnail completed successfully"),
        (THUMBNAIL_FAILED, "Unable to generate thumbnail"),
    )
    person = models.ForeignKey(
        Person,
        help_text="The person that the document is associated with.",
        related_name='financial_disclosures',
    )
    year = models.SmallIntegerField(
        help_text="The year that the disclosure corresponds with",
        db_index=True,
    )
    filepath = models.FileField(
        help_text="The disclosure report itself",
        upload_to='financial-disclosures/',
        storage=IncrementingFileSystemStorage(),
        db_index=True,
    )
    thumbnail = models.FileField(
        help_text="A thumbnail of the first page of the disclosure form",
        upload_to="financial-disclosures/thumbnails/",
        storage=IncrementingFileSystemStorage(),
        null=True,
        blank=True,
    )
    thumbnail_status = models.SmallIntegerField(
        help_text="The status of the thumbnail generation",
        choices=THUMBNAIL_STATUSES,
        default=THUMBNAIL_NEEDED,  # named constant instead of a magic 0
    )
    page_count = models.SmallIntegerField(
        help_text="The number of pages in the disclosure report",
    )

    class Meta:
        ordering = ('-year',)

    def save(self, *args, **kwargs):
        """Save the disclosure and queue thumbnail generation for new rows.

        Bug fix: the previous code tested ``self.pk`` *after* calling
        ``super().save()``, at which point an auto primary key is always
        set, so the thumbnail task was never dispatched. We now record
        whether the row is new before saving, and dispatch afterwards,
        once the pk exists and can be handed to the task.
        """
        is_new = self.pk is None
        super(FinancialDisclosure, self).save(*args, **kwargs)
        if is_new:
            # Imported lazily to avoid a circular import at module load.
            from cl.people_db.tasks import make_png_thumbnail_from_pdf
            make_png_thumbnail_from_pdf.delay(self.pk)
Example #2
0
class AbstractPDF(models.Model):
    """An abstract model to hold PDF-related information"""

    # OCR lifecycle states for the extracted text of the document.
    OCR_COMPLETE = 1
    OCR_UNNECESSARY = 2
    OCR_FAILED = 3
    OCR_NEEDED = 4
    OCR_STATUSES = (
        (OCR_COMPLETE, "OCR Complete"),
        (OCR_UNNECESSARY, "OCR Not Necessary"),
        (OCR_FAILED, "OCR Failed"),
        (OCR_NEEDED, "OCR Needed"),
    )
    date_created = models.DateTimeField(
        help_text="The date the file was imported to Local Storage.",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="Timestamp of last update.",
        auto_now=True,
        db_index=True,
    )
    sha1 = models.CharField(
        help_text="The ID used for a document in RECAP",
        max_length=40,  # As in RECAP
        blank=True,
    )
    page_count = models.IntegerField(
        help_text="The number of pages in the document, if known",
        blank=True,
        null=True,
    )
    file_size = models.IntegerField(
        help_text="The size of the file in bytes, if known",
        blank=True,
        null=True,
    )
    filepath_local = models.FileField(
        help_text="The path of the file in the local storage area.",
        upload_to=make_pdf_path,
        storage=IncrementingFileSystemStorage(),
        max_length=1000,
        db_index=True,
        blank=True,
    )
    # "IA" here and below refers to the Internet Archive (see the
    # ia_upload_failure_count help text).
    filepath_ia = models.CharField(
        help_text="The URL of the file in IA",
        max_length=1000,
        blank=True,
    )
    ia_upload_failure_count = models.SmallIntegerField(
        help_text="Number of times the upload to the Internet Archive failed.",
        null=True,
        blank=True,
    )
    thumbnail = models.FileField(
        help_text="A thumbnail of the first page of the document",
        upload_to=make_pdf_thumb_path,
        storage=IncrementingFileSystemStorage(),
        null=True,
        blank=True,
    )
    thumbnail_status = models.SmallIntegerField(
        help_text="The status of the thumbnail generation",
        # NOTE(review): THUMBNAIL_STATUSES is not defined on this class, so it
        # must come from module scope — presumably an enum-like container with
        # NAMES (the choices tuple) and NEEDED (the default value); confirm.
        choices=THUMBNAIL_STATUSES.NAMES,
        default=THUMBNAIL_STATUSES.NEEDED,
    )
    plain_text = models.TextField(
        help_text="Plain text of the document after extraction using "
        "pdftotext, wpd2txt, etc.",
        blank=True,
    )
    ocr_status = models.SmallIntegerField(
        help_text="The status of OCR processing on this item.",
        choices=OCR_STATUSES,
        null=True,
        blank=True,
    )

    class Meta:
        # Abstract base: concrete models inherit these fields; no table is
        # created for this model itself.
        abstract = True
Example #3
0
class Opinion(models.Model):
    """A single opinion document that belongs to an OpinionCluster."""
    # Numeric prefixes make combined/lead opinions sort before concurrences
    # and dissents when ordered by this value.
    OPINION_TYPES = (
        ('010combined', 'Combined Opinion'),
        ('020lead', 'Lead Opinion'),
        ('030concurrence', 'Concurrence'),
        ('040dissent', 'Dissent'),
    )
    cluster = models.ForeignKey(
        OpinionCluster,
        help_text="The cluster that the opinion is a part of",
        related_name="sub_opinions",
    )
    # Directed self-referential citation graph: symmetrical=False because
    # A citing B does not imply B citing A.
    opinions_cited = models.ManyToManyField(
        'self',
        help_text="Opinions cited by this opinion",
        through='OpinionsCited',
        through_fields=('citing_opinion', 'cited_opinion'),
        symmetrical=False,
        related_name="opinions_citing",
        blank=True,
    )
    author = models.ForeignKey(
        'people_db.Person',
        help_text="The primary author of this opinion",
        related_name='opinions_written',
        blank=True,
        null=True,
    )
    per_curiam = models.BooleanField(
        help_text="Is this opinion per curiam, without a single author?",
        default=False,
    )
    joined_by = models.ManyToManyField(
        'people_db.Person',
        related_name='opinions_joined',
        help_text="Other judges that joined the primary author in this opinion",
        blank=True,
    )
    date_created = models.DateTimeField(
        help_text="The original creation date for the item",
        auto_now_add=True,
        db_index=True)
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified. A value in "
        "year 1750 indicates the value is unknown",
        auto_now=True,
        db_index=True,
    )
    type = models.CharField(
        max_length=20,
        choices=OPINION_TYPES,
    )
    sha1 = models.CharField(
        help_text="unique ID for the document, as generated via SHA1 of the "
        "binary file or text data",
        max_length=40,
        db_index=True,
    )
    download_url = models.URLField(
        help_text="The URL on the court website where the document was "
        "originally scraped",
        max_length=500,
        db_index=True,
        null=True,
        blank=True,
    )
    local_path = models.FileField(
        help_text="The location, relative to MEDIA_ROOT on the CourtListener "
        "server, where files are stored",
        upload_to=make_upload_path,
        storage=IncrementingFileSystemStorage(),
        blank=True,
        db_index=True)
    plain_text = models.TextField(
        help_text="Plain text of the document after extraction using "
        "pdftotext, wpd2txt, etc.",
        blank=True)
    html = models.TextField(
        help_text="HTML of the document, if available in the original",
        blank=True,
        null=True,
    )
    html_lawbox = models.TextField(
        help_text='HTML of Lawbox documents',
        blank=True,
        null=True,
    )
    html_columbia = models.TextField(
        help_text='HTML of Columbia archive',
        blank=True,
        null=True,
    )
    html_with_citations = models.TextField(
        help_text="HTML of the document with citation links and other "
        "post-processed markup added",
        blank=True)
    extracted_by_ocr = models.BooleanField(
        help_text='Whether OCR was used to get this document content',
        default=False,
        db_index=True,
    )

    @property
    def siblings(self):
        """Return the cluster's sub_opinions manager (includes this
        opinion itself, since it belongs to the same cluster)."""
        return self.cluster.sub_opinions

    def __unicode__(self):
        # The AttributeError fallback covers opinions without a cluster.
        try:
            return u"{pk} - {cn}".format(
                pk=getattr(self, 'pk', None),
                cn=self.cluster.case_name,
            )
        except AttributeError:
            return u'Orphan opinion with ID: %s' % self.pk

    def get_absolute_url(self):
        """Opinions are viewed through their cluster's page."""
        return reverse('view_case', args=[self.cluster.pk, self.cluster.slug])

    def clean(self):
        # A CharField is never null, but it can be empty; treat empty as
        # invalid so every opinion carries an explicit type.
        if self.type == '':
            raise ValidationError("'type' is a required field.")

    def save(self, index=True, force_commit=False, *args, **kwargs):
        """Save the opinion, optionally queuing an async re-index.

        :param index: Should the item be added to the search index?
        :param force_commit: Passed through to the indexing task.
        """
        super(Opinion, self).save(*args, **kwargs)
        if index:
            # Imported lazily to avoid a circular import at module load.
            from cl.search.tasks import add_or_update_opinions

            add_or_update_opinions.delay([self.pk], force_commit)
Example #4
0
class Audio(models.Model):
    """A class representing oral arguments and their associated metadata"""

    # Speech-to-text processing states.
    STT_NEEDED = 0
    STT_COMPLETE = 1
    STT_FAILED = 2
    STT_STATUSES = (
        (STT_NEEDED, "Speech to Text Needed"),
        (STT_COMPLETE, "Speech to Text Complete"),
        (STT_FAILED, "Speech to Text Failed"),
    )
    docket = models.ForeignKey(
        Docket,
        help_text="The docket that the oral argument is a part of",
        related_name="audio_files",
        on_delete=models.CASCADE,
        blank=True,
        null=True,
    )
    source = models.CharField(
        help_text="the source of the audio file, one of: %s" %
        ", ".join(["%s (%s)" % (t[0], t[1]) for t in SOURCES]),
        max_length=10,
        choices=SOURCES,
        blank=True,
    )
    case_name_short = models.TextField(
        help_text="The abridged name of the case, often a single word, e.g. "
        "'Marsh'",
        blank=True,
    )
    case_name = models.TextField(
        help_text="The full name of the case",
        blank=True,
    )
    case_name_full = models.TextField(help_text="The full name of the case",
                                      blank=True)
    panel = models.ManyToManyField(
        Person,
        help_text="The judges that heard the oral arguments",
        related_name="oral_argument_panel_members",
        blank=True,
    )
    judges = models.TextField(
        help_text="The judges that heard the oral arguments as a simple text "
        "string. This field is used when normalized judges cannot "
        "be placed into the panel field.",
        blank=True,
        null=True,
    )
    date_created = models.DateTimeField(
        help_text="The original creation date for the item",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified. A value in year"
        " 1750 indicates the value is unknown",
        auto_now=True,
        db_index=True,
    )
    sha1 = models.CharField(
        help_text="unique ID for the document, as generated via SHA1 of the "
        "binary file or text data",
        max_length=40,
        db_index=True,
    )
    download_url = models.URLField(
        help_text="The URL on the court website where the document was "
        "originally scraped",
        max_length=500,
        db_index=True,
        null=True,
        blank=True,
    )
    local_path_mp3 = models.FileField(
        help_text="The location, relative to MEDIA_ROOT, on the CourtListener "
        "server, where encoded file is stored",
        upload_to=make_upload_path,
        storage=IncrementingFileSystemStorage(),
        blank=True,
        db_index=True,
    )
    local_path_original_file = models.FileField(
        help_text="The location, relative to MEDIA_ROOT, on the CourtListener "
        "server, where the original file is stored",
        upload_to=make_upload_path,
        storage=IncrementingFileSystemStorage(),
        db_index=True,
    )
    filepath_ia = models.CharField(
        help_text="The URL of the file in IA",
        max_length=1000,
        blank=True,
    )
    ia_upload_failure_count = models.SmallIntegerField(
        help_text="Number of times the upload to the Internet Archive failed.",
        null=True,
        blank=True,
    )
    duration = models.SmallIntegerField(
        help_text="the length of the item, in seconds",
        null=True,
    )
    processing_complete = models.BooleanField(
        help_text="Is audio for this item done processing?",
        default=False,
    )
    date_blocked = models.DateField(
        help_text="The date that this opinion was blocked from indexing by "
        "search engines",
        blank=True,
        null=True,
        db_index=True,
    )
    blocked = models.BooleanField(
        help_text="Should this item be blocked from indexing by "
        "search engines?",
        db_index=True,
        default=False,
    )
    stt_status = models.SmallIntegerField(
        u"Speech to text status",
        help_text="The status of the Speech to Text for this item?",
        choices=STT_STATUSES,
        default=STT_NEEDED,
    )
    stt_google_response = models.TextField(
        u"Speech to text Google response",
        help_text="The JSON response object returned by Google Speech.",
        blank=True,
    )

    @property
    def transcript(self):
        """Join the highest-confidence alternative of every utterance.

        Bug fix: ``best_transcript`` was previously assigned only inside
        the ``current_confidence > best_confidence`` branch, so an
        utterance whose alternatives all had zero/missing confidence
        either raised UnboundLocalError (on the first utterance) or
        silently reused the *previous* utterance's text. It is now reset
        for every utterance; such utterances contribute an empty string.
        """
        j = json.loads(self.stt_google_response)
        best_utterances = []
        for utterance in j["response"]["results"]:
            best_transcript = ""
            best_confidence = 0
            for alt in utterance["alternatives"]:
                current_confidence = alt.get("confidence", 0)
                if current_confidence > best_confidence:
                    best_transcript = alt["transcript"]
                    best_confidence = current_confidence
            best_utterances.append(best_transcript)
        return " ".join(best_utterances)

    class Meta:
        ordering = ["-date_created"]
        verbose_name_plural = "Audio Files"

    def __unicode__(self):
        return u"%s: %s" % (self.pk, self.case_name)

    def get_absolute_url(self):
        return reverse("view_audio_file", args=[self.pk, self.docket.slug])

    def save(self, index=True, force_commit=False, *args, **kwargs):
        """
        Overrides the normal save method, but provides integration with the
        bulk files and with Solr indexing.

        :param index: Should the item be added to the Solr index?
        :param force_commit: Should a commit be performed in solr after
        indexing it?
        """
        super(Audio, self).save(*args, **kwargs)
        if index:
            # Imported lazily to avoid a circular import at module load.
            from cl.search.tasks import add_items_to_solr

            add_items_to_solr([self.pk], "audio.Audio", force_commit)

    def delete(self, *args, **kwargs):
        """
        Update the index as items are deleted.
        """
        # Capture the pk before deleting — it is cleared by the delete.
        id_cache = self.pk
        super(Audio, self).delete(*args, **kwargs)
        from cl.search.tasks import delete_items

        delete_items.delay([id_cache], "audio.Audio")

    def as_search_dict(self):
        """Create a dict that can be ingested by Solr"""
        # IDs
        out = {
            "id": self.pk,
            "docket_id": self.docket_id,
            "court_id": self.docket.court_id,
        }

        # Docket dates, normalized to midnight PST when present.
        docket = {"docketNumber": self.docket.docket_number}
        if self.docket.date_argued is not None:
            docket["dateArgued"] = midnight_pst(self.docket.date_argued)
        if self.docket.date_reargued is not None:
            docket["dateReargued"] = midnight_pst(self.docket.date_reargued)
        if self.docket.date_reargument_denied is not None:
            docket["dateReargumentDenied"] = midnight_pst(
                self.docket.date_reargument_denied)
        out.update(docket)

        # Court
        out.update({
            "court": self.docket.court.full_name,
            "court_citation_string": self.docket.court.citation_string,
            "court_exact": self.docket.court_id,  # For faceting
        })

        # Audio File
        out.update({
            "caseName": best_case_name(self),
            "panel_ids": [judge.pk for judge in self.panel.all()],
            "judge": self.judges,
            "file_size_mp3": deepgetattr(self, "local_path_mp3.size", None),
            "duration": self.duration,
            "source": self.source,
            "download_url": self.download_url,
            "local_path": unicode(getattr(self, "local_path_mp3", None)),
        })
        try:
            out["absolute_url"] = self.get_absolute_url()
        except NoReverseMatch:
            raise InvalidDocumentError(
                "Unable to save to index due to missing absolute_url: %s" %
                self.pk)

        # Render the full-text field and strip characters Solr rejects.
        text_template = loader.get_template("indexes/audio_text.txt")
        out["text"] = text_template.render({"item": self}).translate(null_map)

        return normalize_search_dicts(out)
Example #5
0
class Docket(models.Model):
    """A class to sit above OpinionClusters, Audio files, and Docket Entries,
    and link them together.
    """
    # The source values are additive. That is, if you get content from a new
    # source, you can add it to the previous one, and have a combined value.
    # For example, if you start with a RECAP docket (1), then add scraped
    # content (2), you can arrive at a combined docket (3) because 1 + 2 = 3.
    DEFAULT = 0
    RECAP = 1
    SCRAPER = 2
    RECAP_AND_SCRAPER = 3
    SOURCE_CHOICES = (
        (DEFAULT, "Default"),
        (RECAP, "RECAP"),
        (SCRAPER, "Scraper"),
        (RECAP_AND_SCRAPER, "RECAP and Scraper"),
    )

    source = models.SmallIntegerField(
        help_text="contains the source of the Docket.",
        choices=SOURCE_CHOICES,
    )
    court = models.ForeignKey(
        'Court',
        help_text="The court where the docket was filed",
        db_index=True,
        related_name='dockets',
    )
    assigned_to = models.ForeignKey(
        'people_db.Person',
        related_name='assigning',
        help_text="The judge the case was assigned to.",
        null=True,
        blank=True,
    )
    assigned_to_str = models.TextField(
        help_text="The judge that the case was assigned to, as a string.",
        blank=True,
    )
    referred_to = models.ForeignKey(
        'people_db.Person',
        related_name='referring',
        help_text="The judge to whom the 'assigned_to' judge is delegated.",
        null=True,
        blank=True,
    )
    referred_to_str = models.TextField(
        help_text="The judge that the case was referred to, as a string.",
        blank=True,
    )
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified. A value in "
        "year 1750 indicates the value is unknown",
        auto_now=True,
        db_index=True,
    )
    date_cert_granted = models.DateField(
        help_text="date cert was granted for this case, if applicable",
        blank=True,
        null=True,
        db_index=True,
    )
    date_cert_denied = models.DateField(
        help_text="the date cert was denied for this case, if applicable",
        blank=True,
        null=True,
        db_index=True,
    )
    date_argued = models.DateField(
        help_text="the date the case was argued",
        blank=True,
        null=True,
        db_index=True,
    )
    date_reargued = models.DateField(
        help_text="the date the case was reargued",
        blank=True,
        null=True,
        db_index=True,
    )
    date_reargument_denied = models.DateField(
        help_text="the date the reargument was denied",
        blank=True,
        null=True,
        db_index=True,
    )
    date_filed = models.DateField(
        help_text="The date the case was filed.",
        blank=True,
        null=True,
    )
    date_terminated = models.DateField(
        help_text="The date the case was terminated.",
        blank=True,
        null=True,
    )
    date_last_filing = models.DateField(
        help_text="The date the case was last updated in the docket. ",
        blank=True,
        null=True,
    )
    case_name_short = models.TextField(
        help_text="The abridged name of the case, often a single word, e.g. "
        "'Marsh'",
        blank=True,
    )
    case_name = models.TextField(
        help_text="The standard name of the case",
        blank=True,
    )
    case_name_full = models.TextField(
        help_text="The full name of the case",
        blank=True,
    )
    slug = models.SlugField(
        help_text="URL that the document should map to (the slug)",
        max_length=75,
        db_index=False,
        blank=True,
    )
    docket_number = models.CharField(
        help_text="The docket numbers of a case, can be consolidated and "
        "quite long",
        max_length=5000,  # was 50, 100, 300, 1000
        blank=True,
        db_index=True,
    )
    pacer_case_id = models.CharField(
        help_text="The cased ID provided by PACER.",
        max_length=100,
        blank=True,
        db_index=True,
    )
    cause = models.CharField(
        help_text="The cause for the case.",
        max_length=200,
        blank=True,
    )
    nature_of_suit = models.CharField(
        help_text="The nature of suit code from PACER.",
        max_length=100,
        blank=True,
    )
    jury_demand = models.CharField(
        help_text="The compensation demand.",
        max_length=500,
        blank=True,
    )
    jurisdiction_type = models.CharField(
        help_text="Stands for jurisdiction in RECAP XML docket. For example, "
        "'Diversity', 'U.S. Government Defendant'.",
        max_length=100,
        blank=True,
    )
    filepath_local = models.FileField(
        help_text="Path to RECAP's Docket XML page.",
        upload_to=make_recap_path,
        storage=IncrementingFileSystemStorage(),
        max_length=1000,
        blank=True,
    )
    filepath_ia = models.CharField(
        help_text="Path to the Docket XML page in The Internet Archive",
        max_length=1000,
        blank=True,
    )
    date_blocked = models.DateField(
        help_text="The date that this opinion was blocked from indexing by "
        "search engines",
        blank=True,
        null=True,
        db_index=True,
    )
    blocked = models.BooleanField(
        help_text="Whether a document should be blocked from indexing by "
        "search engines",
        db_index=True,
        default=False,
    )

    def __unicode__(self):
        """Show pk plus case name when available, else just the pk."""
        if self.case_name:
            return smart_unicode('%s: %s' % (self.pk, self.case_name))
        else:
            return u'{pk}'.format(pk=self.pk)

    def save(self, *args, **kwargs):
        """Regenerate the slug and validate RECAP rows, then save.

        Bug fix: the RECAP check previously compared against the magic
        number ``1``; it now uses the class's named ``RECAP`` constant.
        """
        self.slug = slugify(trunc(best_case_name(self), 75))
        # NOTE(review): sources are additive, so a RECAP_AND_SCRAPER (3)
        # docket also contains RECAP content — confirm whether it should be
        # subject to this validation as well.
        if self.source == self.RECAP and not self.pacer_case_id:
            raise ValidationError("pacer_case_id cannot be Null or empty in "
                                  "RECAP documents.")

        super(Docket, self).save(*args, **kwargs)

    def get_absolute_url(self):
        return reverse('view_docket', args=[self.pk, self.slug])
Example #6
0
class RECAPDocument(models.Model):
    """
        The model for Docket Documents and Attachments.
    """

    # A row is either the main document of a docket entry or one of its
    # attachments.
    PACER_DOCUMENT = 1
    ATTACHMENT = 2
    DOCUMENT_TYPES = (
        (PACER_DOCUMENT, "PACER Document"),
        (ATTACHMENT, "Attachment"),
    )
    docket_entry = models.ForeignKey(
        DocketEntry,
        help_text="Foreign Key to the DocketEntry object to which it belongs. "
        "Multiple documents can belong to a DocketEntry. "
        "(Attachments and Documents together)",
        related_name="recap_documents",
    )
    date_created = models.DateTimeField(
        help_text="The date the file was imported to Local Storage.",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="Timestamp of last update.",
        auto_now=True,
        db_index=True,
    )
    date_upload = models.DateTimeField(
        help_text="upload_date in RECAP. The date the file was uploaded to "
        "RECAP. This information is provided by RECAP.",
        blank=True,
        null=True,
    )
    document_type = models.IntegerField(
        help_text="Whether this is a regular document or an attachment.",
        db_index=True,
        choices=DOCUMENT_TYPES,
    )
    document_number = models.PositiveIntegerField(
        help_text="If the file is a document, the number is the "
        "document_number in RECAP docket.", )
    attachment_number = models.SmallIntegerField(
        help_text="If the file is an attachment, the number is the attachment "
        "number in RECAP docket.",
        blank=True,
        null=True,
    )
    pacer_doc_id = models.CharField(
        help_text="The ID of the document in PACER. This information is "
        "provided by RECAP.",
        max_length=32,  # Same as in RECAP
        # NOTE(review): unique=True without null=True means two rows with a
        # blank ('') pacer_doc_id would collide — confirm every row is given
        # a real PACER id before it is saved.
        unique=True,
    )
    is_available = models.NullBooleanField(
        help_text="True if the item is available in RECAP",
        blank=True,
        null=True,
        default=False,
    )
    sha1 = models.CharField(
        help_text="The ID used for a document in RECAP",
        max_length=40,  # As in RECAP
        blank=True,
    )
    filepath_local = models.FileField(
        help_text="The path of the file in the local storage area.",
        upload_to=make_recap_path,
        storage=IncrementingFileSystemStorage(),
        max_length=1000,
        blank=True,
    )
    filepath_ia = models.CharField(
        help_text="The URL of the file in IA",
        max_length=1000,
        blank=True,
    )
    description = models.TextField(
        help_text="The short description of the docket entry that appears on "
        "the attachments page.",
        blank=True,
    )

    class Meta:
        # One row per (entry, document, attachment) triple; ordered so that
        # attachments list after their parent document number.
        unique_together = ('docket_entry', 'document_number',
                           'attachment_number')
        ordering = ('document_number', 'attachment_number')

    def __unicode__(self):
        return "Docket_%s , document_number_%s , attachment_number_%s" % (
            self.docket_entry.docket.docket_number, self.document_number,
            self.attachment_number)

    def save(self, *args, **kwargs):
        """Validate that attachments carry an attachment_number, then save."""
        if self.document_type == self.ATTACHMENT:
            if self.attachment_number is None:
                raise ValidationError(
                    'attachment_number cannot be null for an '
                    'attachment.')

        super(RECAPDocument, self).save(*args, **kwargs)
Example #7
0
class RECAPDocument(models.Model):
    """
        The model for Docket Documents and Attachments.
    """
    PACER_DOCUMENT = 1
    ATTACHMENT = 2
    DOCUMENT_TYPES = (
        (PACER_DOCUMENT, "PACER Document"),
        (ATTACHMENT, "Attachment"),
    )
    OCR_COMPLETE = 1
    OCR_UNNECESSARY = 2
    OCR_FAILED = 3
    OCR_NEEDED = 4
    OCR_STATUSES = (
        (OCR_COMPLETE, "OCR Complete"),
        (OCR_UNNECESSARY, "OCR Not Necessary"),
        (OCR_FAILED, "OCR Failed"),
        (OCR_NEEDED, "OCR Needed"),
    )
    docket_entry = models.ForeignKey(
        DocketEntry,
        help_text="Foreign Key to the DocketEntry object to which it belongs. "
        "Multiple documents can belong to a DocketEntry. "
        "(Attachments and Documents together)",
        related_name="recap_documents",
    )
    date_created = models.DateTimeField(
        help_text="The date the file was imported to Local Storage.",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="Timestamp of last update.",
        auto_now=True,
        db_index=True,
    )
    date_upload = models.DateTimeField(
        help_text="upload_date in RECAP. The date the file was uploaded to "
        "RECAP. This information is provided by RECAP.",
        blank=True,
        null=True,
    )
    document_type = models.IntegerField(
        help_text="Whether this is a regular document or an attachment.",
        db_index=True,
        choices=DOCUMENT_TYPES,
    )
    document_number = models.BigIntegerField(
        help_text="If the file is a document, the number is the "
        "document_number in RECAP docket.", )
    attachment_number = models.SmallIntegerField(
        help_text="If the file is an attachment, the number is the attachment "
        "number in RECAP docket.",
        blank=True,
        null=True,
    )
    pacer_doc_id = models.CharField(
        help_text="The ID of the document in PACER. This information is "
        "provided by RECAP.",
        max_length=32,  # Same as in RECAP
        unique=True,
        null=True,
    )
    is_available = models.NullBooleanField(
        help_text="True if the item is available in RECAP",
        blank=True,
        null=True,
        default=False,
    )
    sha1 = models.CharField(
        help_text="The ID used for a document in RECAP",
        max_length=40,  # As in RECAP
        blank=True,
    )
    page_count = models.IntegerField(
        help_text="The number of pages in the document, if known",
        blank=True,
        null=True,
    )
    filepath_local = models.FileField(
        help_text="The path of the file in the local storage area.",
        upload_to=make_recap_path,
        storage=IncrementingFileSystemStorage(),
        max_length=1000,
        blank=True,
    )
    filepath_ia = models.CharField(
        help_text="The URL of the file in IA",
        max_length=1000,
        blank=True,
    )
    description = models.TextField(
        help_text="The short description of the docket entry that appears on "
        "the attachments page.",
        blank=True,
    )
    plain_text = models.TextField(
        help_text="Plain text of the document after extraction using "
        "pdftotext, wpd2txt, etc.",
        blank=True,
    )
    ocr_status = models.SmallIntegerField(
        help_text="The status of OCR processing on this item.",
        choices=OCR_STATUSES,
        null=True,
        blank=True,
    )

    class Meta:
        unique_together = ('docket_entry', 'document_number',
                           'attachment_number')
        ordering = ('document_number', 'attachment_number')

    def __unicode__(self):
        return "%s: Docket_%s , document_number_%s , attachment_number_%s" % (
            self.pk, self.docket_entry.docket.docket_number,
            self.document_number, self.attachment_number)

    def get_absolute_url(self):
        if self.document_type == self.PACER_DOCUMENT:
            return reverse('view_recap_document',
                           kwargs={
                               'docket_id': self.docket_entry.docket.pk,
                               'doc_num': self.document_number,
                               'slug': self.docket_entry.docket.slug,
                           })
        elif self.document_type == self.ATTACHMENT:
            return reverse('view_recap_attachment',
                           kwargs={
                               'docket_id': self.docket_entry.docket.pk,
                               'doc_num': self.document_number,
                               'att_num': self.attachment_number,
                               'slug': self.docket_entry.docket.slug,
                           })

    @property
    def pacer_url(self):
        """Construct a doc1 URL for any item, if we can. Else, return None."""
        from cl.lib.pacer import cl_to_pacer_ids
        if self.pacer_doc_id:
            court_id = self.docket_entry.docket.court.pk
            if court_id in cl_to_pacer_ids:
                court_id = cl_to_pacer_ids[court_id]
            return "https://ecf.%s.uscourts.gov/doc1/%s" % (court_id,
                                                            self.pacer_doc_id)
        else:
            return self.docket_entry.docket.pacer_url

    @property
    def needs_extraction(self):
        """Does the item need extraction and does it have all the right
        fields?

        True only when extraction/OCR has never been attempted
        (ocr_status is None), the document is marked available, and a
        local file name is actually recorded to extract from.
        """
        # `all()` already returns a bool, so the original bool(all([...]))
        # wrapper was redundant; a short-circuit chain reads better and
        # avoids building a throwaway list.
        return (
            self.ocr_status is None and
            self.is_available is True and
            bool(self.filepath_local.name)  # Just in case
        )

    def save(self, do_extraction=True, index=True, *args, **kwargs):
        """Validate, persist, then queue any follow-up Celery work.

        :param do_extraction: Queue text extraction if the item still
            needs it.
        :param index: Queue a Solr (re)index of this document.
        :raises ValidationError: if an attachment lacks an
            attachment_number.
        """
        if (self.document_type == self.ATTACHMENT and
                self.attachment_number is None):
            raise ValidationError('attachment_number cannot be null for an '
                                  'attachment.')
        if self.pacer_doc_id == '':
            # Blank CharField values are not considered unique by the DB
            # while NULLs are, so reset blanks to NULL to keep the
            # uniqueness constraint meaningful.
            # http://stackoverflow.com/a/3124586/64911
            self.pacer_doc_id = None

        super(RECAPDocument, self).save(*args, **kwargs)

        pending = []
        if do_extraction and self.needs_extraction:
            # Content extraction not yet done and requested by caller.
            from cl.scrapers.tasks import extract_recap_pdf
            pending.append(extract_recap_pdf.si(self.pk))
        if index:
            from cl.search.tasks import add_or_update_recap_document
            pending.append(add_or_update_recap_document.si([self.pk], False))
        if pending:
            chain(*pending)()

    def delete(self, *args, **kwargs):
        """Delete the row, then purge it from the Solr index.

        Note that this doesn't get called when an entire queryset
        is deleted, but that should be OK.
        """
        doomed_pk = self.pk  # pk is gone after the delete; capture it now.
        super(RECAPDocument, self).delete(*args, **kwargs)
        from cl.search.tasks import delete_items
        delete_items.delay([doomed_pk], settings.SOLR_RECAP_URL)

    def as_search_dict(self):
        """Create a dict that can be ingested by Solr.

        Search results are presented as Dockets, but they're indexed as
        RECAPDocument's, which are then grouped back together in search results
        to form Dockets.

        :raises InvalidDocumentError: if the absolute URLs cannot be
            reversed, since such a result could not be linked to.
        """
        # IDs
        out = {
            'id':
            self.pk,
            'docket_entry_id':
            self.docket_entry.pk,
            'docket_id':
            self.docket_entry.docket.pk,
            'court_id':
            self.docket_entry.docket.court.pk,
            'assigned_to_id':
            getattr(self.docket_entry.docket.assigned_to, 'pk', None),
            'referred_to_id':
            getattr(self.docket_entry.docket.referred_to, 'pk', None)
        }

        # RECAPDocument
        out.update({
            'short_description': self.description,
            'document_type': self.get_document_type_display(),
            'document_number': self.document_number,
            'attachment_number': self.attachment_number,
            'is_available': self.is_available,
            'page_count': self.page_count,
        })
        # A FileField with no file stored has no .path attribute, so this
        # hasattr check doubles as a missing-file guard.
        if hasattr(self.filepath_local, 'path'):
            out['filepath_local'] = self.filepath_local.path

        # Docket Entry
        out['description'] = self.docket_entry.description
        if self.docket_entry.entry_number is not None:
            out['entry_number'] = self.docket_entry.entry_number
        if self.docket_entry.date_filed is not None:
            # Solr stores datetimes; a bare date is promoted to midnight.
            out['entry_date_filed'] = datetime.combine(
                self.docket_entry.date_filed, time())

        # Docket
        out.update({
            'docketNumber':
            self.docket_entry.docket.docket_number,
            'caseName':
            best_case_name(self.docket_entry.docket),
            'suitNature':
            self.docket_entry.docket.nature_of_suit,
            'cause':
            self.docket_entry.docket.cause,
            'juryDemand':
            self.docket_entry.docket.jury_demand,
            'jurisdictionType':
            self.docket_entry.docket.jurisdiction_type,
        })
        if self.docket_entry.docket.date_argued is not None:
            out['dateArgued'] = datetime.combine(
                self.docket_entry.docket.date_argued, time())
        if self.docket_entry.docket.date_filed is not None:
            out['dateFiled'] = datetime.combine(
                self.docket_entry.docket.date_filed, time())
        if self.docket_entry.docket.date_terminated is not None:
            out['dateTerminated'] = datetime.combine(
                self.docket_entry.docket.date_terminated, time())
        try:
            out['absolute_url'] = self.get_absolute_url()
            out['docket_absolute_url'] = self.docket_entry.docket.get_absolute_url(
            )
        except NoReverseMatch:
            raise InvalidDocumentError(
                "Unable to save to index due to missing absolute_url: %s" %
                self.pk)

        # Judges
        # Prefer the normalized Person record's full name; fall back to
        # the raw string scraped from the docket.
        if self.docket_entry.docket.assigned_to is not None:
            out['assignedTo'] = self.docket_entry.docket.assigned_to.name_full
        elif self.docket_entry.docket.assigned_to_str is not None:
            out['assignedTo'] = self.docket_entry.docket.assigned_to_str
        if self.docket_entry.docket.referred_to is not None:
            out['referredTo'] = self.docket_entry.docket.referred_to.name_full
        elif self.docket_entry.docket.referred_to_str is not None:
            out['referredTo'] = self.docket_entry.docket.referred_to_str

        # Court
        out.update({
            'court':
            self.docket_entry.docket.court.full_name,
            'court_exact':
            self.docket_entry.docket.court_id,  # For faceting
            'court_citation_string':
            self.docket_entry.docket.court.citation_string
        })

        # Render the searchable full text, then run it through null_map
        # (defined elsewhere — presumably strips characters Solr rejects).
        text_template = loader.get_template('indexes/dockets_text.txt')
        out['text'] = text_template.render({'item': self}).translate(null_map)

        # nuke_nones (defined elsewhere) presumably drops None-valued
        # keys before the doc is handed to Solr — TODO confirm.
        return nuke_nones(out)
Example #8
0
class Opinion(models.Model):
    """A single court opinion, always grouped into an OpinionCluster."""
    # Closed set of valid values for the ``type`` field. The numeric
    # prefixes presumably provide a stable sort order — verify before
    # relying on that.
    OPINION_TYPES = (
        ('010combined', 'Combined Opinion'),
        ('020lead', 'Lead Opinion'),
        ('030concurrence', 'Concurrence'),
        ('040dissent', 'Dissent'),
        ('050addendum', 'Addendum'),
    )
    cluster = models.ForeignKey(
        OpinionCluster,
        help_text="The cluster that the opinion is a part of",
        related_name="sub_opinions",
    )
    # Self-referential citation graph, stored through the OpinionsCited
    # model; asymmetric: citing and cited sides are distinct.
    opinions_cited = models.ManyToManyField(
        'self',
        help_text="Opinions cited by this opinion",
        through='OpinionsCited',
        through_fields=('citing_opinion', 'cited_opinion'),
        symmetrical=False,
        related_name="opinions_citing",
        blank=True,
    )
    # Normalized author. author_str below is the free-text fallback used
    # when the judge can't be matched to a Person record.
    author = models.ForeignKey(
        'people_db.Person',
        help_text="The primary author of this opinion as a normalized field",
        related_name='opinions_written',
        blank=True,
        null=True,
    )
    author_str = models.TextField(
        help_text="The primary author of this opinion, as a simple text "
        "string. This field is used when normalized judges cannot "
        "be placed into the author field.",
        blank=True,
    )
    per_curiam = models.BooleanField(
        help_text="Is this opinion per curiam, without a single author?",
        default=False,
    )
    joined_by = models.ManyToManyField(
        'people_db.Person',
        related_name='opinions_joined',
        help_text="Other judges that joined the primary author in this opinion",
        blank=True,
    )
    date_created = models.DateTimeField(
        help_text="The original creation date for the item",
        auto_now_add=True,
        db_index=True)
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified. A value in "
        "year 1750 indicates the value is unknown",
        auto_now=True,
        db_index=True,
    )
    type = models.CharField(
        max_length=20,
        choices=OPINION_TYPES,
    )
    # SHA1 fingerprint of the source document content.
    sha1 = models.CharField(
        help_text="unique ID for the document, as generated via SHA1 of the "
        "binary file or text data",
        max_length=40,
        db_index=True,
    )
    page_count = models.IntegerField(
        help_text="The number of pages in the document, if known",
        blank=True,
        null=True,
    )
    download_url = models.URLField(
        help_text="The URL on the court website where the document was "
        "originally scraped",
        max_length=500,
        db_index=True,
        null=True,
        blank=True,
    )
    local_path = models.FileField(
        help_text="The location, relative to MEDIA_ROOT on the CourtListener "
        "server, where files are stored",
        upload_to=make_upload_path,
        storage=IncrementingFileSystemStorage(),
        blank=True,
        db_index=True)
    # The various representations of the document body, populated as
    # each source/processing step becomes available.
    plain_text = models.TextField(
        help_text="Plain text of the document after extraction using "
        "pdftotext, wpd2txt, etc.",
        blank=True)
    html = models.TextField(
        help_text="HTML of the document, if available in the original",
        blank=True,
        null=True,
    )
    html_lawbox = models.TextField(
        help_text='HTML of Lawbox documents',
        blank=True,
        null=True,
    )
    html_columbia = models.TextField(
        help_text='HTML of Columbia archive',
        blank=True,
        null=True,
    )
    html_with_citations = models.TextField(
        help_text="HTML of the document with citation links and other "
        "post-processed markup added",
        blank=True)
    extracted_by_ocr = models.BooleanField(
        help_text='Whether OCR was used to get this document content',
        default=False,
        db_index=True,
    )

    @property
    def siblings(self):
        # These are other sub-opinions of the current cluster. Being the
        # reverse manager of the cluster FK, it includes this opinion too.
        return self.cluster.sub_opinions

    def __unicode__(self):
        """Return "{pk} - {case name}", or an orphan marker when the
        cluster lookup fails with AttributeError."""
        try:
            return u"{pk} - {cn}".format(
                pk=getattr(self, 'pk', None),
                cn=self.cluster.case_name,
            )
        except AttributeError:
            return u'Orphan opinion with ID: %s' % self.pk

    def get_absolute_url(self):
        # Opinions are viewed via their cluster's page.
        return reverse('view_case', args=[self.cluster.pk, self.cluster.slug])

    def clean(self):
        """Require ``type``, which has no default, to be set."""
        if self.type == '':
            raise ValidationError("'type' is a required field.")

    def save(self, index=True, force_commit=False, *args, **kwargs):
        """Save the opinion, optionally queuing a Solr (re)index.

        :param index: Should the item be added to the Solr index?
        :param force_commit: Should a Solr commit follow the add?
        """
        super(Opinion, self).save(*args, **kwargs)
        if index:
            from cl.search.tasks import add_or_update_opinions
            add_or_update_opinions.delay([self.pk], force_commit)

    def as_search_dict(self):
        """Create a dict that can be ingested by Solr.

        :raises InvalidDocumentError: if the cluster's absolute URL cannot
            be reversed, since such a result could not be linked to.
        """
        # IDs
        out = {
            'id': self.pk,
            'docket_id': self.cluster.docket.pk,
            'cluster_id': self.cluster.pk,
            'court_id': self.cluster.docket.court.pk
        }

        # Opinion
        out.update({
            'cites': [opinion.pk for opinion in self.opinions_cited.all()],
            'author_id':
            getattr(self.author, 'pk', None),
            # 'per_curiam': self.per_curiam,
            'joined_by_ids': [judge.pk for judge in self.joined_by.all()],
            'type':
            self.type,
            'download_url':
            self.download_url or None,
            'local_path':
            # Python 2 unicode() of a FieldFile yields its storage name.
            unicode(self.local_path),
        })

        # Cluster
        out.update({
            'caseName':
            best_case_name(self.cluster),
            'caseNameShort':
            self.cluster.case_name_short,
            'sibling_ids': [sibling.pk for sibling in self.siblings.all()],
            'panel_ids': [judge.pk for judge in self.cluster.panel.all()],
            'non_participating_judge_ids': [
                judge.pk
                for judge in self.cluster.non_participating_judges.all()
            ],
            'judge':
            self.cluster.judges,
            'lexisCite':
            self.cluster.lexis_cite,
            'citation': [cite for cite in self.cluster.citation_list
                         if cite],  # Nuke '' and None
            'neutralCite':
            self.cluster.neutral_cite,
            'scdb_id':
            self.cluster.scdb_id,
            'source':
            self.cluster.source,
            'attorney':
            self.cluster.attorneys,
            'suitNature':
            self.cluster.nature_of_suit,
            'citeCount':
            self.cluster.citation_count,
            'status':
            self.cluster.get_precedential_status_display(),
            'status_exact':
            self.cluster.get_precedential_status_display(),
        })
        if self.cluster.date_filed is not None:
            out['dateFiled'] = datetime.combine(self.cluster.date_filed,
                                                time())  # Midnight, PST
        try:
            out['absolute_url'] = self.cluster.get_absolute_url()
        except NoReverseMatch:
            raise InvalidDocumentError(
                "Unable to save to index due to missing absolute_url "
                "(court_id: %s, item.pk: %s). Might the court have in_use set "
                "to False?" % (self.cluster.docket.court_id, self.pk))

        # Docket
        docket = {'docketNumber': self.cluster.docket.docket_number}
        if self.cluster.docket.date_argued is not None:
            docket['dateArgued'] = datetime.combine(
                self.cluster.docket.date_argued,
                time(),
            )
        if self.cluster.docket.date_reargued is not None:
            docket['dateReargued'] = datetime.combine(
                self.cluster.docket.date_reargued,
                time(),
            )
        if self.cluster.docket.date_reargument_denied is not None:
            docket['dateReargumentDenied'] = datetime.combine(
                self.cluster.docket.date_reargument_denied,
                time(),
            )
        out.update(docket)

        court = {
            'court': self.cluster.docket.court.full_name,
            'court_citation_string': self.cluster.docket.court.citation_string,
            'court_exact': self.cluster.docket.court_id,  # For faceting
        }
        out.update(court)

        # Load the document text using a template for cleanup and concatenation
        text_template = loader.get_template('indexes/opinion_text.txt')
        out['text'] = text_template.render({
            'item':
            self,
            'citation_string':
            self.cluster.citation_string
        }).translate(null_map)

        return nuke_nones(out)
Example #9
0
class Audio(models.Model):
    """A class representing oral arguments and their associated metadata."""
    docket = models.ForeignKey(
        Docket,
        help_text="The docket that the oral argument is a part of",
        related_name="audio_files",
        blank=True,
        null=True,
    )
    source = models.CharField(
        help_text="the source of the audio file, one of: %s" %
        ', '.join(['%s (%s)' % (t[0], t[1]) for t in SOURCES]),
        max_length=10,
        choices=SOURCES,
        blank=True,
    )
    # Three granularities of the case name, from most abridged to fullest.
    case_name_short = models.TextField(
        help_text="The abridged name of the case, often a single word, e.g. "
        "'Marsh'",
        blank=True,
    )
    case_name = models.TextField(
        help_text="The full name of the case",
        blank=True,
    )
    case_name_full = models.TextField(help_text="The full name of the case",
                                      blank=True)
    # Normalized judge records; ``judges`` below is the free-text
    # fallback for when normalization fails.
    panel = models.ManyToManyField(
        Person,
        help_text="The judges that heard the oral arguments",
        related_name="oral_argument_panel_members",
        blank=True,
    )
    judges = models.TextField(
        help_text="The judges that heard the oral arguments as a simple text "
        "string. This field is used when normalized judges cannot "
        "be placed into the panel field.",
        blank=True,
        null=True,
    )
    date_created = models.DateTimeField(
        help_text="The original creation date for the item",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified. A value in year"
        " 1750 indicates the value is unknown",
        auto_now=True,
        db_index=True,
    )
    # SHA1 fingerprint of the source file content.
    sha1 = models.CharField(
        help_text="unique ID for the document, as generated via SHA1 of the "
        "binary file or text data",
        max_length=40,
        db_index=True,
    )
    download_url = models.URLField(
        help_text="The URL on the court website where the document was "
        "originally scraped",
        max_length=500,
        db_index=True,
        null=True,
        blank=True,
    )
    # Both the re-encoded MP3 and the original upload are kept.
    local_path_mp3 = models.FileField(
        help_text="The location, relative to MEDIA_ROOT, on the CourtListener "
        "server, where encoded file is stored",
        upload_to=make_upload_path,
        storage=IncrementingFileSystemStorage(),
        blank=True,
        db_index=True,
    )
    local_path_original_file = models.FileField(
        help_text="The location, relative to MEDIA_ROOT, on the CourtListener "
        "server, where the original file is stored",
        upload_to=make_upload_path,
        storage=IncrementingFileSystemStorage(),
        db_index=True,
    )
    # NOTE(review): nullable — presumably unset until audio processing
    # completes; confirm against the processing pipeline.
    duration = models.SmallIntegerField(
        help_text="the length of the item, in seconds",
        null=True,
    )
    processing_complete = models.BooleanField(
        help_text="Is audio for this item done processing?",
        default=False,
    )
    date_blocked = models.DateField(
        help_text="The date that this opinion was blocked from indexing by "
        "search engines",
        blank=True,
        null=True,
        db_index=True,
    )
    blocked = models.BooleanField(
        help_text="Should this item be blocked from indexing by "
        "search engines?",
        db_index=True,
        default=False,
    )

    class Meta:
        ordering = ["-date_created"]  # Newest first by default.
        verbose_name_plural = 'Audio Files'

    def __unicode__(self):
        """ "<pk>: <case name>" — used in the admin and in logs."""
        return u'{0}: {1}'.format(self.pk, self.case_name)

    def get_absolute_url(self):
        """URL of the oral argument page for this item.

        NOTE(review): assumes self.docket is set; the field is nullable,
        so a null docket would raise AttributeError here — confirm
        callers guarantee a docket.
        """
        route_args = [self.pk, self.docket.slug]
        return reverse('view_audio_file', args=route_args)

    def save(self, index=True, force_commit=False, *args, **kwargs):
        """
        Overrides the normal save method, but provides integration with the
        bulk files and with Solr indexing.

        :param index: Should the item be added to the Solr index?
        :param force_commit: Should a Solr commit be performed after adding
            it?
        """
        # Fix: the docstring previously documented a nonexistent ``commit``
        # parameter; the actual keyword is ``force_commit``.
        super(Audio, self).save(*args, **kwargs)
        if index:
            from cl.search.tasks import add_or_update_audio_files
            add_or_update_audio_files.delay([self.pk], force_commit)

    def delete(self, *args, **kwargs):
        """
        Update the index as items are deleted.
        """
        doomed_pk = self.pk  # pk is gone after the delete; capture it now.
        super(Audio, self).delete(*args, **kwargs)
        from cl.search.tasks import delete_items
        delete_items.delay([doomed_pk], settings.SOLR_AUDIO_URL)