Example #1
0
class PacerHtmlFiles(models.Model):
    """This is a simple object for holding original HTML content from PACER

    We use this object to make sure that for every item we receive from users,
    we can go back and re-parse it one day if we have to. This becomes essential
    as we do more and more data work where we're purchasing content. If we don't
    keep an original copy, a bug could be devastating.
    """
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    filepath = models.FileField(
        help_text="The path of the original data from PACER.",
        upload_to=make_recap_data_path,
        storage=UUIDFileSystemStorage(),
        max_length=150,
    )
    content_type = models.ForeignKey(ContentType)
    object_id = models.PositiveIntegerField()
    content_object = GenericForeignKey()
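The content_type, object_id, and content_object fields together form Django's generic foreign key, so a single archive table can point at any model instance. A minimal usage sketch (the archive_pacer_html helper and the Docket instance are hypothetical, not part of the code above):

from django.core.files.base import ContentFile

def archive_pacer_html(docket, html_bytes):
    # Setting content_object populates content_type and object_id for us.
    pacer_file = PacerHtmlFiles(content_object=docket)
    # FieldFile.save() writes the file through UUIDFileSystemStorage and
    # saves the model row in one step.
    pacer_file.filepath.save("docket.html", ContentFile(html_bytes))
    return pacer_file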
Example #2
0
class RssFeedData(models.Model):
    """Store all old RSS data to disk for future analysis."""

    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    court = models.ForeignKey(
        Court,
        help_text="The court where the RSS feed was found",
        on_delete=models.CASCADE,
        related_name="rss_feed_data",
    )
    filepath = models.FileField(
        help_text="The path of the file in the local storage area.",
        upload_to=make_rss_feed_path,
        storage=UUIDFileSystemStorage(),
        max_length=150,
    )

    @property
    def file_contents(self):
        with open(self.filepath.path, "rb") as f:
            return bz2.decompress(f.read()).decode("utf-8")

    def print_file_contents(self):
        print(self.file_contents)
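The file_contents property above assumes the stored file is bz2-compressed. A complementary write path might look like the following sketch (save_rss_feed is a hypothetical helper, not part of the model):

import bz2

from django.core.files.base import ContentFile

def save_rss_feed(court, feed_text):
    # Compress the raw feed before handing it to the FileField, mirroring
    # the bz2.decompress() call in file_contents above.
    feed = RssFeedData(court=court)
    feed.filepath.save(
        "rss_feed.xml.bz2",
        ContentFile(bz2.compress(feed_text.encode("utf-8"))),
    )
    return feed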
Example #3
0
class UUIDFileSystemStorageTest(SimpleTestCase):
    # Borrows from https://github.com/django/django/blob/9cbf48693dcd8df6cb22c183dcc94e7ce62b2921/tests/file_storage/tests.py#L89

    def setUp(self):
        self.temp_dir = tempfile.mkdtemp()
        self.storage = UUIDFileSystemStorage(location=self.temp_dir,
                                             base_url='test_uuid_storage')

    def test_file_save_with_path(self):
        """Does saving a pathname create directories and filenames correctly?"""
        self.assertFalse(self.storage.exists('path/to'))
        file_name = 'filename'
        extension = 'ext'
        f = self.storage.save('path/to/%s.%s' % (file_name, extension),
                              ContentFile('file with path'))
        self.assertTrue(self.storage.exists('path/to'))
        dir_name_created, file_name_created = os.path.split(f)
        file_root_created, extension_created = file_name_created.split('.', 1)
        self.assertEqual(extension_created, extension)
        self.assertTrue(re.match('[a-f0-9]{32}', file_root_created))
Example #4
0
class UUIDFileSystemStorageTest(SimpleTestCase):
    # Borrows from https://github.com/django/django/blob/9cbf48693dcd8df6cb22c183dcc94e7ce62b2921/tests/file_storage/tests.py#L89

    def setUp(self):
        self.temp_dir = tempfile.mkdtemp()
        self.storage = UUIDFileSystemStorage(location=self.temp_dir,
                                             base_url='test_uuid_storage')

    def test_file_save_with_path(self):
        """Does saving a pathname create directories and filenames correctly?"""
        self.assertFalse(self.storage.exists('path/to'))
        file_name = 'filename'
        extension = 'ext'
        f = self.storage.save('path/to/%s.%s' % (file_name, extension),
                              ContentFile('file with path'))
        self.assertTrue(self.storage.exists('path/to'))
        dir_name_created, file_name_created = os.path.split(f)
        file_root_created, extension_created = file_name_created.split('.', 1)
        self.assertEqual(extension_created, extension)
        self.assertTrue(re.match('[a-f0-9]{32}', file_root_created))
Example #5
0
class RssFeedData(models.Model):
    """Store all old RSS data to disk for future analysis."""

    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    court = models.ForeignKey(
        Court,
        help_text="The court where the RSS feed was found",
        on_delete=models.CASCADE,
        related_name="rss_feed_data",
    )
    filepath = models.FileField(
        help_text="The path of the file in the local storage area.",
        upload_to=make_rss_feed_path,
        storage=UUIDFileSystemStorage(),
        max_length=150,
    )

    @property
    def file_contents(self):
        with open(self.filepath.path, "rb") as f:
            return bz2.decompress(f.read()).decode("utf-8")

    def print_file_contents(self):
        print(self.file_contents)

    def reprocess_item(self, metadata_only=False, index=True):
        """Reprocess the RSS feed

        :param metadata_only: If True, only do the metadata, not the docket
        entries.
        :param index: Whether to save to Solr (note that none will be sent
        when doing metadata only since no entries are modified).
        """
        from cl.recap_rss.tasks import merge_rss_feed_contents
        from cl.search.tasks import add_items_to_solr

        rss_feed = PacerRssFeed(map_cl_to_pacer_id(self.court_id))
        rss_feed._parse_text(self.file_contents)
        response = merge_rss_feed_contents(
            rss_feed.data, self.court_id, metadata_only
        )
        if index:
            add_items_to_solr(
                response.get("rds_for_solr", []), "search.RECAPDocument"
            )
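A brief usage sketch for reprocess_item (the court ID and queryset filter are illustrative only):

# Re-parse every stored feed for one court, updating docket metadata only
# and skipping the Solr index.
for feed in RssFeedData.objects.filter(court_id="cand"):
    feed.reprocess_item(metadata_only=True, index=False)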
Example #6
0
class UUIDFileSystemStorageTest(SimpleTestCase):
    # Borrows from https://github.com/django/django/blob/9cbf48693dcd8df6cb22c183dcc94e7ce62b2921/tests/file_storage/tests.py#L89
    allow_database_queries = True

    def setUp(self):
        self.temp_dir = tempfile.mkdtemp()
        self.storage = UUIDFileSystemStorage(location=self.temp_dir,
                                             base_url="test_uuid_storage")

    def test_file_save_with_path(self):
        """Does saving a pathname create directories and filenames correctly?"""
        self.assertFalse(self.storage.exists("path/to"))
        file_name = "filename"
        extension = "ext"
        f = self.storage.save(
            "path/to/%s.%s" % (file_name, extension),
            ContentFile("file with path"),
        )
        self.assertTrue(self.storage.exists("path/to"))
        dir_name_created, file_name_created = os.path.split(f)
        file_root_created, extension_created = file_name_created.split(".", 1)
        self.assertEqual(extension_created, extension)
        self.assertTrue(re.match("[a-f0-9]{32}", file_root_created))
Example #7
0
class PacerHtmlFiles(AbstractFile):
    """This is a simple object for holding original HTML content from PACER

    We use this object to make sure that for every item we receive from users,
    we can go back and re-parse it one day if we have to. This becomes
    essential as we do more and more data work where we're purchasing content.
    If we don't keep an original copy, a bug could be devastating.
    """
    filepath = models.FileField(
        help_text="The path of the original data from PACER.",
        upload_to=make_recap_data_path,
        storage=UUIDFileSystemStorage(),
        max_length=150,
    )
    upload_type = models.SmallIntegerField(
        help_text="The type of object that is uploaded",
        choices=UPLOAD_TYPE.NAMES,
    )
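Example #7 assumes an AbstractFile base class defined elsewhere. Judging from the fields Example #1 declared inline, it plausibly carries the shared timestamp fields; this is a sketch only, and the real base class may hold more:

class AbstractFile(models.Model):
    """Sketch of a shared abstract base; fields copied from Example #1."""
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )

    class Meta:
        abstract = True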
Example #8
0
    def setUp(self):
        self.temp_dir = tempfile.mkdtemp()
        self.storage = UUIDFileSystemStorage(location=self.temp_dir,
                                             base_url='test_uuid_storage')
Example #9
0
class ProcessingQueue(models.Model):
    AWAITING_PROCESSING = 1
    PROCESSING_SUCCESSFUL = 2
    PROCESSING_FAILED = 3
    PROCESSING_IN_PROGRESS = 4
    QUEUED_FOR_RETRY = 5
    PROCESSING_STATUSES = (
        (AWAITING_PROCESSING, 'Awaiting processing in queue.'),
        (PROCESSING_SUCCESSFUL, 'Item processed successfully.'),
        (PROCESSING_FAILED, 'Item encountered an error while processing.'),
        (PROCESSING_IN_PROGRESS, 'Item is currently being processed.'),
        (QUEUED_FOR_RETRY, 'Item failed processing, but will be retried.'),
    )
    DOCKET = 1
    ATTACHMENT_PAGE = 2
    PDF = 3
    UPLOAD_TYPES = (
        (DOCKET, 'HTML Docket'),
        (ATTACHMENT_PAGE, 'HTML attachment page'),
        (PDF, 'PDF'),
    )
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    court = models.ForeignKey(
        Court,
        help_text="The court where the upload was from",
        related_name='recap_processing_queue',
    )
    uploader = models.ForeignKey(
        User,
        help_text="The user that uploaded the item to RECAP.",
        related_name='recap_processing_queue',
    )
    pacer_case_id = models.CharField(
        help_text="The cased ID provided by PACER.",
        max_length=100,
    )
    pacer_doc_id = models.CharField(
        help_text="The ID of the document in PACER. This information is "
        "provided by RECAP.",
        max_length=32,  # Same as in RECAP
        unique=True,
        blank=True,
    )
    document_number = models.CharField(
        help_text="If the file is a document, the number is the "
        "document_number in RECAP docket.",
        max_length=32,
    )
    attachment_number = models.SmallIntegerField(
        help_text="If the file is an attachment, the number is the attachment "
        "number in RECAP docket.",
        blank=True,
        null=True,
    )
    filepath_local = models.FileField(
        help_text="The path of the uploaded file.",
        upload_to=make_recap_processing_queue_path,
        storage=UUIDFileSystemStorage(),
        max_length=1000,
    )
    status = models.SmallIntegerField(
        help_text="The current status of this upload.",
        choices=PROCESSING_STATUSES,
    )
    upload_type = models.SmallIntegerField(
        help_text="The type of object that is uploaded",
        choices=UPLOAD_TYPES,
    )
    error_message = models.TextField(
        help_text="Any errors that occurred while processing an item",
        blank=True,
    )

    def __unicode__(self):
        if self.upload_type == self.DOCKET:
            return u'ProcessingQueue %s: %s case #%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id,
                self.get_upload_type_display(),
            )
        elif self.upload_type == self.PDF:
            return u'ProcessingQueue %s: %s.%s.%s.%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id or None,
                self.document_number or None,
                self.attachment_number or 0,
                self.get_upload_type_display(),
            )

    class Meta:
        permissions = (("has_recap_upload_access",
                        'Can upload documents to RECAP.'), )
Example #10
0
class ProcessingQueue(models.Model):
    AWAITING_PROCESSING = 1
    PROCESSING_SUCCESSFUL = 2
    PROCESSING_FAILED = 3
    PROCESSING_IN_PROGRESS = 4
    QUEUED_FOR_RETRY = 5
    INVALID_CONTENT = 6
    PROCESSING_STATUSES = (
        (AWAITING_PROCESSING, 'Awaiting processing in queue.'),
        (PROCESSING_SUCCESSFUL, 'Item processed successfully.'),
        (PROCESSING_FAILED, 'Item encountered an error while processing.'),
        (PROCESSING_IN_PROGRESS, 'Item is currently being processed.'),
        (QUEUED_FOR_RETRY, 'Item failed processing, but will be retried.'),
        (INVALID_CONTENT, 'Item failed validity tests.'),
    )
    DOCKET = 1
    ATTACHMENT_PAGE = 2
    PDF = 3
    DOCKET_HISTORY_REPORT = 4
    APPELLATE_DOCKET = 5
    APPELLATE_ATTACHMENT_PAGE = 6
    UPLOAD_TYPES = (
        (DOCKET, 'HTML Docket'),
        (ATTACHMENT_PAGE, 'HTML attachment page'),
        (PDF, 'PDF'),
        (DOCKET_HISTORY_REPORT, 'Docket history report'),
        (APPELLATE_DOCKET, 'Appellate HTML docket'),
        (APPELLATE_ATTACHMENT_PAGE, 'Appellate HTML attachment page'),
    )
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    court = models.ForeignKey(
        Court,
        help_text="The court where the upload was from",
        related_name='recap_processing_queue',
    )
    uploader = models.ForeignKey(
        User,
        help_text="The user that uploaded the item to RECAP.",
        related_name='recap_processing_queue',
    )
    pacer_case_id = models.CharField(
        help_text="The cased ID provided by PACER.",
        max_length=100,
        db_index=True,
        blank=True,
    )
    pacer_doc_id = models.CharField(
        help_text="The ID of the document in PACER.",
        max_length=32,  # Same as in RECAP
        blank=True,
        db_index=True,
    )
    document_number = models.BigIntegerField(
        help_text="The docket entry number for the document.",
        blank=True,
        null=True,
    )
    attachment_number = models.SmallIntegerField(
        help_text="If the file is an attachment, the number is the attachment "
        "number on the docket.",
        blank=True,
        null=True,
    )
    filepath_local = models.FileField(
        help_text="The path of the uploaded file.",
        upload_to=make_recap_processing_queue_path,
        storage=UUIDFileSystemStorage(),
        max_length=1000,
    )
    status = models.SmallIntegerField(
        help_text="The current status of this upload. Possible values are: %s"
        % ', '.join(['(%s): %s' % (t[0], t[1]) for t in PROCESSING_STATUSES]),
        default=AWAITING_PROCESSING,
        choices=PROCESSING_STATUSES,
        db_index=True,
    )
    upload_type = models.SmallIntegerField(
        help_text="The type of object that is uploaded",
        choices=UPLOAD_TYPES,
    )
    error_message = models.TextField(
        help_text="Any errors that occurred while processing an item",
        blank=True,
    )
    debug = models.BooleanField(
        help_text="Are you debugging? Debugging uploads will be validated, but "
        "not saved to the database.",
        default=False,
    )

    # Post process fields
    docket = models.ForeignKey(
        Docket,
        help_text="The docket that was created or updated by this request.",
        null=True,
    )
    docket_entry = models.ForeignKey(
        DocketEntry,
        help_text="The docket entry that was created or updated by this "
        "request, if applicable. Only applies to PDFs uploads.",
        null=True,
    )
    recap_document = models.ForeignKey(
        RECAPDocument,
        help_text="The document that was created or updated by this request, "
        "if applicable. Only applies to PDFs uploads.",
        null=True,
    )

    def __unicode__(self):
        if self.upload_type == self.DOCKET:
            return u'ProcessingQueue %s: %s case #%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id,
                self.get_upload_type_display(),
            )
        elif self.upload_type == self.PDF:
            return u'ProcessingQueue: %s: %s.%s.%s.%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id or None,
                self.document_number or None,
                self.attachment_number or 0,
                self.get_upload_type_display(),
            )
        elif self.upload_type == self.ATTACHMENT_PAGE:
            return u'ProcessingQueue: %s (%s)' % (
                self.pk,
                self.get_upload_type_display(),
            )
        else:
            raise NotImplementedError

    class Meta:
        permissions = (("has_recap_upload_access",
                        'Can upload documents to RECAP.'), )
Example #11
0
class ProcessingQueue(models.Model):
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    court = models.ForeignKey(
        Court,
        help_text="The court where the upload was from",
        related_name='recap_processing_queue',
        on_delete=models.CASCADE,
    )
    uploader = models.ForeignKey(
        User,
        help_text="The user that uploaded the item to RECAP.",
        related_name='recap_processing_queue',
        on_delete=models.CASCADE,
    )
    pacer_case_id = models.CharField(
        help_text="The cased ID provided by PACER.",
        max_length=100,
        db_index=True,
        blank=True,
    )
    pacer_doc_id = models.CharField(
        help_text="The ID of the document in PACER.",
        max_length=32,  # Same as in RECAP
        blank=True,
        db_index=True,
    )
    document_number = models.BigIntegerField(
        help_text="The docket entry number for the document.",
        blank=True,
        null=True,
    )
    attachment_number = models.SmallIntegerField(
        help_text="If the file is an attachment, the number is the attachment "
                  "number on the docket.",
        blank=True,
        null=True,
    )
    filepath_local = models.FileField(
        help_text="The path of the uploaded file.",
        upload_to=make_recap_processing_queue_path,
        storage=UUIDFileSystemStorage(),
        max_length=1000,
    )
    status = models.SmallIntegerField(
        help_text="The current status of this upload. Possible values "
                  "are: %s" % ', '.join(['(%s): %s' % (t[0], t[1]) for t in
                                         PROCESSING_STATUS.NAMES]),
        default=PROCESSING_STATUS.ENQUEUED,
        choices=PROCESSING_STATUS.NAMES,
        db_index=True,
    )
    upload_type = models.SmallIntegerField(
        help_text="The type of object that is uploaded",
        choices=UPLOAD_TYPE.NAMES,
    )
    error_message = models.TextField(
        help_text="Any errors that occurred while processing an item",
        blank=True,
    )
    debug = models.BooleanField(
        help_text="Are you debugging? Debugging uploads will be validated, "
                  "but not saved to the database.",
        default=False,
    )

    # Post process fields
    docket = models.ForeignKey(
        Docket,
        help_text="The docket that was created or updated by this request.",
        null=True,
        on_delete=models.CASCADE,
    )
    docket_entry = models.ForeignKey(
        DocketEntry,
        help_text="The docket entry that was created or updated by this "
                  "request, if applicable. Only applies to PDFs uploads.",
        null=True,
        on_delete=models.CASCADE,
    )
    recap_document = models.ForeignKey(
        RECAPDocument,
        help_text="The document that was created or updated by this request, "
                  "if applicable. Only applies to PDFs uploads.",
        null=True,
        on_delete=models.CASCADE,
    )

    def __unicode__(self):
        if self.upload_type in [
                UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.DOCKET_HISTORY_REPORT,
                UPLOAD_TYPE.APPELLATE_DOCKET]:
            return u'ProcessingQueue %s: %s case #%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id,
                self.get_upload_type_display(),
            )
        elif self.upload_type == UPLOAD_TYPE.PDF:
            return u'ProcessingQueue: %s: %s.%s.%s.%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id or None,
                self.document_number or None,
                self.attachment_number or 0,
                self.get_upload_type_display(),
            )
        elif self.upload_type == UPLOAD_TYPE.ATTACHMENT_PAGE:
            return u'ProcessingQueue: %s (%s)' % (
                self.pk,
                self.get_upload_type_display(),
            )
        else:
            raise NotImplementedError(
                "No __unicode__ method on ProcessingQueue model for upload_"
                "type of %s" % self.upload_type
            )

    class Meta:
        permissions = (
            ("has_recap_upload_access", 'Can upload documents to RECAP.'),
        )

    @property
    def file_contents(self):
        with open(self.filepath_local.path, 'r') as f:
            return f.read().decode('utf-8')

    def print_file_contents(self):
        print(self.file_contents)
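Example #11 references PROCESSING_STATUS and UPLOAD_TYPE namespaces defined elsewhere in the codebase. Based on the inline constants in Examples #9 and #10, they are plausibly simple choice holders along the following lines; the constant names Example #11 uses (e.g. ENQUEUED) are assumptions, and UPLOAD_TYPE presumably follows the same pattern:

class PROCESSING_STATUS(object):
    # Values and labels taken from Example #10; only the constant names are
    # guessed to match what Example #11 references.
    ENQUEUED = 1
    SUCCESSFUL = 2
    FAILED = 3
    IN_PROGRESS = 4
    QUEUED_FOR_RETRY = 5
    INVALID_CONTENT = 6
    NAMES = (
        (ENQUEUED, 'Awaiting processing in queue.'),
        (SUCCESSFUL, 'Item processed successfully.'),
        (FAILED, 'Item encountered an error while processing.'),
        (IN_PROGRESS, 'Item is currently being processed.'),
        (QUEUED_FOR_RETRY, 'Item failed processing, but will be retried.'),
        (INVALID_CONTENT, 'Item failed validity tests.'),
    )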
Example #12
0
    def setUp(self):
        self.temp_dir = tempfile.mkdtemp()
        self.storage = UUIDFileSystemStorage(location=self.temp_dir,
                                             base_url='test_uuid_storage')