Beispiel #1
0
class LASCPDF(AbstractPDF, AbstractDateTimeModel):
    """Use the content framework to associate PDFs with our dockets.

    The ``content_type``/``object_id`` pair is exposed through
    ``content_object``, so a PDF can be attached to any model instance.
    """

    # Generic relation. GenericForeignKey() is called without arguments, so
    # it relies on Django's default field names (content_type / object_id).
    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    object_id = models.PositiveIntegerField()
    content_object = GenericForeignKey()

    # Open question: do we want to have a different file path generator
    # for this field?
    filepath_s3 = models.FileField(
        storage=IncrementingAWSMediaStorage(),
        upload_to=make_pdf_path,
        help_text="The path of the file in the s3 bucket.",
        max_length=150,
        blank=True,
    )
    docket_number = models.CharField(
        max_length=300,
        db_index=True,
        help_text=(
            "Docket number for the case. E.g. 19LBCV00507, 19STCV28994, "
            "or even 30-2017-00900866-CU-AS-CJC."
        ),
    )
    document_id = models.CharField(
        max_length=40,
        db_index=True,
        help_text="Internal Document Id",
    )

    class Meta:
        verbose_name = "LASC PDF"
Beispiel #2
0
class EmailProcessingQueue(AbstractDateTimeModel):
    """A queue item for an email that a user sent in for processing.

    Each item records who sent the email, the court it relates to, where the
    raw email/receipt is stored, the processing status, and the documents
    that were created from it.
    """

    uploader = models.ForeignKey(
        User,
        on_delete=models.CASCADE,
        related_name="recap_email_processing_queue",
        help_text="The user that sent in the email for processing.",
    )
    court = models.ForeignKey(
        Court,
        on_delete=models.CASCADE,
        related_name="recap_email_processing_queue",
        help_text="The court where the upload was from",
    )
    filepath = models.FileField(
        storage=IncrementingAWSMediaStorage(),
        upload_to=make_recap_email_processing_queue_aws_path,
        max_length=300,
        null=True,
        help_text=(
            "The S3 filepath to the email and receipt stored as JSON text."
        ),
    )
    status = models.SmallIntegerField(
        choices=PROCESSING_STATUS.NAMES,
        default=PROCESSING_STATUS.ENQUEUED,
        db_index=True,
        # The help text enumerates every "(value): label" pair so the
        # possible statuses are visible wherever the help text is shown.
        help_text=(
            "The current status of this upload. Possible values are: "
            + ", ".join(
                f"({name[0]!s}): {name[1]!s}"
                for name in PROCESSING_STATUS.NAMES
            )
        ),
    )
    status_message = models.TextField(
        blank=True,
        help_text="Any errors that occurred while processing an item",
    )
    recap_documents = models.ManyToManyField(
        RECAPDocument,
        related_name="recap_email_processing_queue",
        help_text=(
            "Document(s) created from the PACER email, processed as a "
            "function of this queue."
        ),
    )

    def __str__(self) -> str:
        """Return the queue item's pk together with its court id."""
        return f"EmailProcessingQueue: {self.pk} in court {self.court_id}"
Beispiel #3
0
class AbstractPDF(models.Model):
    """An abstract model to hold PDF-related information.

    Concrete models inherit the hash, size, storage location, thumbnail,
    extracted text and OCR bookkeeping fields declared here.
    """

    # Allowed values for ``ocr_status``.
    OCR_COMPLETE = 1
    OCR_UNNECESSARY = 2
    OCR_FAILED = 3
    OCR_NEEDED = 4
    OCR_STATUSES = (
        (OCR_COMPLETE, "OCR Complete"),
        (OCR_UNNECESSARY, "OCR Not Necessary"),
        (OCR_FAILED, "OCR Failed"),
        (OCR_NEEDED, "OCR Needed"),
    )

    sha1 = models.CharField(
        max_length=40,  # As in RECAP
        blank=True,
        help_text="The ID used for a document in RECAP",
    )
    page_count = models.IntegerField(
        null=True,
        blank=True,
        help_text="The number of pages in the document, if known",
    )
    file_size = models.IntegerField(
        null=True,
        blank=True,
        help_text="The size of the file in bytes, if known",
    )
    filepath_local = models.FileField(
        storage=IncrementingAWSMediaStorage(),
        upload_to=make_pdf_path,
        max_length=1000,
        db_index=True,
        blank=True,
        help_text=(
            f"The path is AWS S3 where the file is saved. {s3_warning_note}"
        ),
    )
    filepath_ia = models.CharField(
        max_length=1000,
        blank=True,
        help_text="The URL of the file in IA",
    )
    ia_upload_failure_count = models.SmallIntegerField(
        null=True,
        blank=True,
        help_text=(
            "Number of times the upload to the Internet Archive failed."
        ),
    )
    thumbnail = models.FileField(
        storage=IncrementingAWSMediaStorage(),
        upload_to=make_pdf_thumb_path,
        null=True,
        blank=True,
        help_text=(
            "The path to a thumbnail in S3 of the first page of the "
            "document."
        ),
    )
    thumbnail_status = models.SmallIntegerField(
        choices=THUMBNAIL_STATUSES.NAMES,
        default=THUMBNAIL_STATUSES.NEEDED,
        help_text="The status of the thumbnail generation",
    )
    plain_text = models.TextField(
        blank=True,
        help_text=(
            "Plain text of the document after extraction using "
            "pdftotext, wpd2txt, etc."
        ),
    )
    ocr_status = models.SmallIntegerField(
        choices=OCR_STATUSES,
        null=True,
        blank=True,
        help_text="The status of OCR processing on this item.",
    )

    class Meta:
        abstract = True
Beispiel #4
0
class Audio(AbstractDateTimeModel):
    """A class representing oral arguments and their associated metadata"""

    # Allowed values for ``stt_status``.
    STT_NEEDED = 0
    STT_COMPLETE = 1
    STT_FAILED = 2
    STT_STATUSES = (
        (STT_NEEDED, "Speech to Text Needed"),
        (STT_COMPLETE, "Speech to Text Complete"),
        (STT_FAILED, "Speech to Text Failed"),
    )
    # Annotation required b/c this FK is nullable, which breaks absolute_url
    docket: Docket = models.ForeignKey(
        Docket,
        help_text="The docket that the oral argument is a part of",
        related_name="audio_files",
        on_delete=models.CASCADE,
        blank=True,
        null=True,
    )
    source = models.CharField(
        help_text="the source of the audio file, one of: %s"
        % ", ".join(["%s (%s)" % (t[0], t[1]) for t in SOURCES]),
        max_length=10,
        choices=SOURCES,
        blank=True,
    )
    case_name_short = models.TextField(
        help_text="The abridged name of the case, often a single word, e.g. "
        "'Marsh'",
        blank=True,
    )
    case_name = models.TextField(
        help_text="The full name of the case",
        blank=True,
    )
    # NOTE(review): this help_text is identical to case_name's; confirm
    # whether a more specific description was intended.
    case_name_full = models.TextField(
        help_text="The full name of the case",
        blank=True,
    )
    panel = models.ManyToManyField(
        Person,
        help_text="The judges that heard the oral arguments",
        related_name="oral_argument_panel_members",
        blank=True,
    )
    judges = models.TextField(
        help_text="The judges that heard the oral arguments as a simple text "
        "string. This field is used when normalized judges cannot "
        "be placed into the panel field.",
        blank=True,
        null=True,
    )
    sha1 = models.CharField(
        help_text="unique ID for the document, as generated via SHA1 of the "
        "binary file or text data",
        max_length=40,
        db_index=True,
    )
    download_url = models.URLField(
        help_text="The URL on the court website where the document was "
        "originally scraped",
        max_length=500,
        db_index=True,
        null=True,
        blank=True,
    )
    local_path_mp3 = models.FileField(
        help_text=f"The location in AWS S3 where our enhanced copy of the "
        f"original audio file is stored. {s3_warning_note}",
        upload_to=make_upload_path,
        storage=IncrementingAWSMediaStorage(),
        blank=True,
        db_index=True,
    )
    local_path_original_file = models.FileField(
        help_text=f"The location in AWS S3 where the original audio file "
        f"downloaded from the court is stored. {s3_warning_note}",
        upload_to=make_upload_path,
        storage=IncrementingAWSMediaStorage(),
        db_index=True,
    )
    filepath_ia = models.CharField(
        help_text="The URL of the file in IA",
        max_length=1000,
        blank=True,
    )
    ia_upload_failure_count = models.SmallIntegerField(
        help_text="Number of times the upload to the Internet Archive failed.",
        null=True,
        blank=True,
    )
    duration = models.SmallIntegerField(
        help_text="the length of the item, in seconds",
        null=True,
    )
    processing_complete = models.BooleanField(
        help_text="Is audio for this item done processing?",
        default=False,
    )
    date_blocked = models.DateField(
        help_text="The date that this opinion was blocked from indexing by "
        "search engines",
        blank=True,
        null=True,
        db_index=True,
    )
    blocked = models.BooleanField(
        help_text="Should this item be blocked from indexing by "
        "search engines?",
        db_index=True,
        default=False,
    )
    stt_status = models.SmallIntegerField(
        "Speech to text status",
        help_text="The status of the Speech to Text for this item?",
        choices=STT_STATUSES,
        default=STT_NEEDED,
    )
    stt_google_response = models.TextField(
        "Speech to text Google response",
        help_text="The JSON response object returned by Google Speech.",
        blank=True,
    )

    @property
    def transcript(self) -> str:
        """Build a plain-text transcript from the stored Google response.

        For every utterance in ``response.results`` the alternative with the
        highest ``confidence`` is selected (a missing confidence counts as
        0; on ties the first alternative wins). Utterances that have no
        alternative carrying a ``transcript`` are skipped rather than
        re-using the text chosen for an earlier utterance.

        :return: The selected utterance texts joined by single spaces.
        :raises json.JSONDecodeError: If ``stt_google_response`` is not valid
        JSON (this includes the empty string).
        :raises KeyError: If the decoded JSON lacks ``response`` or
        ``results``.
        """
        payload = json.loads(self.stt_google_response)
        best_utterances = []
        for utterance in payload["response"]["results"]:
            # Only alternatives that actually carry text are eligible.
            candidates = [
                alt
                for alt in utterance.get("alternatives", [])
                if "transcript" in alt
            ]
            if not candidates:
                continue
            # max() returns the first maximal element, so earlier
            # alternatives win ties.
            best = max(candidates, key=lambda alt: alt.get("confidence", 0))
            best_utterances.append(best["transcript"])
        return " ".join(best_utterances)

    class Meta:
        ordering = ["-date_created"]
        verbose_name_plural = "Audio Files"

    def __str__(self) -> str:
        """Return the pk followed by the case name."""
        return f"{self.pk}: {self.case_name}"

    def get_absolute_url(self) -> str:
        """Reverse the ``view_audio_file`` URL from the pk and docket slug.

        Note that ``self.docket`` is dereferenced without a None check.
        """
        return reverse("view_audio_file", args=[self.pk, self.docket.slug])

    def save(  # type: ignore[override]
        self,
        index: bool = True,
        force_commit: bool = False,
        *args: List,
        **kwargs: Dict,
    ) -> None:
        """
        Overrides the normal save method and, after saving, optionally
        sends the item to the Solr index.

        :param index: Should the item be added to the Solr index?
        :param force_commit: Should a commit be performed in solr after
        indexing it?
        """
        super().save(*args, **kwargs)  # type: ignore
        if index:
            # Imported here rather than at module level, as in the original.
            from cl.search.tasks import add_items_to_solr

            add_items_to_solr([self.pk], "audio.Audio", force_commit)

    def delete(  # type: ignore[override]
        self,
        *args: List,
        **kwargs: Dict,
    ) -> None:
        """
        Update the index as items are deleted.
        """
        # Remember the pk before deleting; it is needed for the index
        # removal that is queued afterwards.
        id_cache = self.pk
        super().delete(*args, **kwargs)  # type: ignore
        from cl.search.tasks import delete_items

        delete_items.delay([id_cache], "audio.Audio")

    def as_search_dict(self) -> Dict[str, Union[int, List[int], str]]:
        """Create a dict that can be ingested by Solr

        The dict combines IDs, docket data, court data, audio file metadata,
        the absolute URL and the rendered ``indexes/audio_text.txt``
        template. ``self.docket`` is dereferenced without a None check.

        :return: The result of passing the dict to normalize_search_dicts.
        :raises InvalidDocumentError: If the absolute URL cannot be reversed.
        """
        # IDs
        out = {
            "id": self.pk,
            "docket_id": self.docket_id,
            "court_id": self.docket.court_id,
        }

        # Docket; date keys are only added when the date is set.
        docket = {"docketNumber": self.docket.docket_number}
        if self.docket.date_argued is not None:
            docket["dateArgued"] = midnight_pst(self.docket.date_argued)
        if self.docket.date_reargued is not None:
            docket["dateReargued"] = midnight_pst(self.docket.date_reargued)
        if self.docket.date_reargument_denied is not None:
            docket["dateReargumentDenied"] = midnight_pst(
                self.docket.date_reargument_denied
            )
        out.update(docket)

        # Court
        out.update(
            {
                "court": self.docket.court.full_name,
                "court_citation_string": self.docket.court.citation_string,
                "court_exact": self.docket.court_id,  # For faceting
            }
        )

        # Audio File
        out.update(
            {
                "caseName": best_case_name(self),
                "panel_ids": [judge.pk for judge in self.panel.all()],
                "judge": self.judges,
                "file_size_mp3": deepgetattr(
                    self, "local_path_mp3.size", None
                ),
                "duration": self.duration,
                "source": self.source,
                "download_url": self.download_url,
                "local_path": deepgetattr(self, "local_path_mp3.name", None),
            }
        )
        try:
            out["absolute_url"] = self.get_absolute_url()
        except NoReverseMatch:
            raise InvalidDocumentError(
                f"Unable to save to index due to missing absolute_url: {self.pk}"
            )

        text_template = loader.get_template("indexes/audio_text.txt")
        out["text"] = text_template.render({"item": self}).translate(null_map)

        return normalize_search_dicts(out)
Beispiel #5
0
class FinancialDisclosure(AbstractDateTimeModel):
    """A simple table to hold references to financial disclosure forms"""

    person = models.ForeignKey(
        Person,
        on_delete=models.CASCADE,
        related_name="financial_disclosures",
        help_text="The person that the document is associated with.",
    )
    year = models.SmallIntegerField(
        db_index=True,
        help_text="The year that the disclosure corresponds with",
    )
    download_filepath = models.TextField(
        help_text=(
            "The path to the original file collected on aws. If "
            "split tiff, return url for page one of the disclosures"
        ),
    )
    filepath = models.FileField(
        storage=IncrementingAWSMediaStorage(),
        upload_to=pdf_path,
        max_length=300,
        db_index=True,
        help_text="The filepath to the disclosure normalized to a PDF.",
    )
    thumbnail = models.FileField(
        storage=IncrementingAWSMediaStorage(),
        upload_to=thumbnail_path,
        max_length=300,
        null=True,
        blank=True,
        help_text="A thumbnail of the first page of the disclosure form.",
    )
    thumbnail_status = models.SmallIntegerField(
        choices=THUMBNAIL_STATUSES.NAMES,
        default=THUMBNAIL_STATUSES.NEEDED,
        help_text="The status of the thumbnail generation",
    )
    page_count = models.SmallIntegerField(
        help_text="The number of pages in the disclosure report",
    )
    sha1 = models.CharField(
        max_length=40,
        db_index=True,
        blank=True,
        unique=True,
        help_text="SHA1 hash of the generated PDF",
    )
    report_type = models.SmallIntegerField(
        choices=REPORT_TYPES.NAMES,
        default=REPORT_TYPES.UNKNOWN,
        help_text="Financial Disclosure report type",
    )
    is_amended = models.BooleanField(
        default=False,
        null=True,
        help_text="Is disclosure amended?",
    )
    addendum_content_raw = models.TextField(
        blank=True,
        help_text="Raw content of addendum with whitespace preserved.",
    )
    addendum_redacted = models.BooleanField(
        default=False,
        help_text="Is the addendum partially or completely redacted?",
    )
    has_been_extracted = models.BooleanField(
        default=False,
        help_text="Have we successfully extracted the data from PDF?",
    )

    def __str__(self) -> str:
        """Return the pk, the person id and the year of the disclosure."""
        return f"{self.pk}, person: {self.person_id}, year: {self.year}"

    def calculate_wealth(self, field_name: str) -> Dict[str, Union[str, int]]:
        """Calculate gross value of all investments in disclosure

        We can calculate the total investment for four fields

        ** gross_value_code - Gross Value total for the investments
        ** income_during_reporting_period_code - Gross Income
        ** transaction_gain_code  - Total Income gain
        ** transaction_value_code - Total Transaction values

        Investments whose field is CODES.X or empty are left out of the
        totals; those with CODES.X are counted separately as misses.

        :param field_name: The field to process for the disclosure
        :return: A dict with the summed "min" and "max" values for the
        supplied field plus the "miss_count".
        """
        countable = self.investments.exclude(
            **{field_name: CODES.X}
        ).exclude(**{field_name: ""})

        totals = {"min": 0, "max": 0}
        for holding in countable:
            # Each code maps to a range; accumulate both ends of it.
            bounds = CODES.VALUES[getattr(holding, field_name)]
            totals["min"] += bounds["min"]
            totals["max"] += bounds["max"]

        totals["miss_count"] = self.investments.filter(
            **{field_name: CODES.X}
        ).count()
        return totals

    def save(self, *args, **kwargs):
        """Save the disclosure and queue a thumbnail task when one is needed.

        The thumbnail task is queued whenever ``thumbnail_status`` is
        THUMBNAIL_STATUSES.NEEDED at the time of saving.
        """
        super().save(*args, **kwargs)
        if self.thumbnail_status != THUMBNAIL_STATUSES.NEEDED:
            return

        # Imported here rather than at module level, as in the original.
        from cl.disclosures.tasks import (
            make_financial_disclosure_thumbnail_from_pdf,
        )

        make_financial_disclosure_thumbnail_from_pdf.delay(self.pk)

    class Meta:
        permissions = disclosure_permissions