Example #1
def get_district_attachment_pages(options):
    """Get the attachment page information for all of the items on the dockets

    :param options: The options returned by argparse.
    :type options: dict
    """
    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list('pk', flat=True)
    for i, rd_pk in enumerate(rd_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 100 == 0:
            logger.info("Doing item %s: %s", i, rd_pk)
        throttle.maybe_wait()
        chain(
            get_attachment_page_by_rd.s(rd_pk, session.cookies).set(queue=q),
            make_attachment_pq_object.s(rd_pk, recap_user.pk).set(queue=q),
            process_recap_attachment.s(tag_names=[TAG]).set(queue=q),
        ).apply_async()
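A note on the i >= options['limit'] > 0 test that recurs throughout these examples: it is a Python chained comparison, equivalent to i >= limit and limit > 0, so a limit of 0 disables the cap. A minimal, self-contained sketch of the idiom (the helper name is illustrative, not from the original code):

def take_with_offset_and_limit(items, offset=0, limit=0):
    # offset skips the first `offset` items; a limit of 0 means "no limit".
    taken = []
    for i, item in enumerate(items):
        if i < offset:
            continue
        if i >= limit > 0:  # chained comparison: i >= limit AND limit > 0
            break
        taken.append(item)
    return taken

assert take_with_offset_and_limit(range(10), offset=2, limit=5) == [2, 3, 4]
assert take_with_offset_and_limit(range(10)) == list(range(10))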
Example #2
def get_pacer_dockets(options, row_pks, tag=None):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        row = FjcIntegratedDatabase.objects.get(pk=row_pk)
        get_docket_by_pacer_case_id.apply_async(
            args=(
                row.pacer_case_id,
                map_cl_to_pacer_id(row.district_id),
                pacer_session,
            ),
            kwargs={
                'tag': tag,
                'show_parties_and_counsel': True,
                'show_terminated_parties': True,
                'show_list_of_member_cases': True,
            },
            queue=q,
        )
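Several of these examples re-create and re-login the PacerSession every 1,000 (or 5,000) iterations rather than relying on auto-login across a long run. A hedged sketch of that pattern pulled out into a helper; the helper name and interval are illustrative, while PacerSession and the credentials are assumed from the surrounding code:

def refreshed_session(i, current_session, every=1000):
    # Return a freshly logged-in PacerSession every `every` iterations.
    if current_session is None or i % every == 0:
        current_session = PacerSession(username=PACER_USERNAME,
                                       password=PACER_PASSWORD)
        current_session.login()
    return current_session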
Example #3
def get_data(options, row_transform, tags):
    """Download dockets from a csv, then download claims register data
    from those dockets.

    :param options: The options provided at the command line.
    :param row_transform: A function that takes the row as an argument and
    returns a cleaned up version of the row that has the needed attributes.
    This parameter allows this function to be able to work with almost any
    CSV.
    :param tags: Tags you wish to apply to the gathered data.
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        # All tests pass. Get the docket.
        row = row_transform(row)
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        get_docket_and_claims(
            row['docket_number'],
            row['court'],
            row['case_name'],
            session.cookies,
            tags,
            q,
        )
Example #4
def get_dockets(options, items, tags, sample_size=0, doc_num_end=''):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by PACER.
    """

    if sample_size > 0:
        items = items.order_by('?')[:sample_size]

    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': False,
                    'doc_num_end': doc_num_end,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #5
def import_financial_disclosures(
    filepath: str,
    skip_until: int,
    queue_name: str,
) -> None:
    """Import financial documents into courtlistener.

    :param filepath: Path to file data to import.
    :param skip_until: Skip any record whose ID is below this value.
    :param queue_name: The celery queue name.
    :return: None
    """
    throttle = CeleryThrottle(queue_name=queue_name)
    with open(filepath) as f:
        disclosures = json.load(f)

    for data in disclosures:
        if data["id"] < skip_until:
            continue

        # Check download_filepath to see if it has been processed before.
        if has_been_extracted(data):
            logger.info(f"Document already extracted and saved: {data['id']}.")
            continue

        throttle.maybe_wait()

        # Add disclosures to celery queue
        import_disclosure.apply_async(args=[data], queue=queue_name)
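A hypothetical invocation of the function above; the file path and queue name are placeholders, and the JSON file is assumed to be a list of objects that each carry at least an "id" key, as the loop requires:

import_financial_disclosures(
    filepath="/tmp/disclosures.json",
    skip_until=0,        # 0 processes every record
    queue_name="celery",
)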
Example #6
def query_and_export(options):
    """Iterate over the query list, place the queries, and then export results

    Our client has provided us with a spreadsheet chock-full of queries. Our
    task is to take those queries, run them, identify the matched dockets, then
    serialize those dockets to disk as the deliverable for the client.

    :param options: The argparse options
    :return: None
    """
    f = options["file"]
    reader = csv.DictReader(f)
    d_pks = set()
    for i, row in enumerate(reader):
        if i < options["query_offset"]:
            continue
        if i >= options["query_limit"] > 0:
            break
        query_params = get_query_from_link(row["Link"])
        logger.info("Doing query: %s", query_params)
        d_pks.update(query_dockets(query_params))

    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    for i, d_pk in enumerate(d_pks):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            logger.info("Doing item %s with pk %s", i, d_pk)
        throttle.maybe_wait()
        save_ia_docket_to_disk.apply_async(
            args=(d_pk, options["output_directory"]), queue=q,
        )
Example #7
def do_bulk_export(options):
    """The final step of this project is to bulk export an outrageous
    amount of bankruptcy data from our system.

    Limit/offset work differently here than in many other functions. Limit is
    a true hard cap on the number of items that should get done: a limit of 10
    means ten items will be done. Offset is a docket PK cutoff: dockets with a
    PK at or below it are skipped. (It does *not* correspond to the number of
    completed items.)
    """
    q = options['queue']
    offset = options['offset']
    throttle = CeleryThrottle(queue_name=q)
    if offset > 0:
        logger.info("Skipping to dockets with PK greater than %s", offset)
    d_pks = Docket.objects.filter(
        court__jurisdiction=Court.FEDERAL_BANKRUPTCY,
        pk__gt=offset,
    ).order_by('pk').values_list('pk', flat=True)
    for i, d_pk in enumerate(d_pks):
        if i >= options['limit'] > 0:
            break
        logger.info("Doing item %s with pk %s", i, d_pk)
        throttle.maybe_wait()
        save_ia_docket_to_disk.apply_async(
            args=(d_pk, options['output_directory']),
            queue=q,
        )
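As the docstring above stresses, offset here is a primary-key cutoff applied in the queryset (pk__gt=offset), while limit is a hard cap on the number of items processed. A plain-Python stand-in for that distinction (names are illustrative, not from the original code):

def select_docket_pks(all_pks, offset=0, limit=0):
    pks = sorted(pk for pk in all_pks if pk > offset)  # offset is a PK cutoff
    picked = []
    for i, pk in enumerate(pks):
        if i >= limit > 0:  # limit is a hard count; 0 means unlimited
            break
        picked.append(pk)
    return picked

assert select_docket_pks([5, 10, 15, 20], offset=7, limit=2) == [10, 15]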
Example #8
def get_pacer_doc_ids(options):
    """Get pacer_doc_ids for any item that needs them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    row_pks = RECAPDocument.objects.filter(
        pacer_doc_id=None,
    ).exclude(
        document_number=None,
    ).exclude(
        docket_entry__docket__pacer_case_id=None,
    ).exclude(
        docket_entry__docket__court__jurisdiction__in=Court.BANKRUPTCY_JURISDICTIONS,
    ).order_by('pk').values_list('pk', flat=True)
    completed = 0
    for row_pk in row_pks:
        if completed >= options['count'] > 0:
            break
        if row_pk < options['start_pk'] > 0:
            continue
        throttle.maybe_wait()
        if completed % 1000 == 0:
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
            logger.info("Sent %s tasks to celery so far. Latest pk: %s" %
                        (completed, row_pk))
        get_pacer_doc_id_with_show_case_doc_url.apply_async(
            args=(row_pk, session),
            queue=q,
        )
        completed += 1
Example #9
def get_pacer_dockets(options, docket_pks, tag):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': d.pacer_case_id},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=[tag],
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #10
def upload_to_internet_archive(options, do_non_free=False):
    """Upload items to the Internet Archive."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        Q(ia_upload_failure_count__lt=3) | Q(ia_upload_failure_count=None),
        is_available=True,
        filepath_ia='',
    ).exclude(
        filepath_local='',
    ).values_list(
        'pk',
        flat=True,
    ).order_by()
    if do_non_free:
        rds = rds.filter(Q(is_free_on_pacer=False) | Q(is_free_on_pacer=None))
    else:
        rds = rds.filter(is_free_on_pacer=True)

    count = rds.count()
    logger.info("Sending %s items to Internet Archive." % count)
    throttle = CeleryThrottle(queue_name=q)
    for i, rd in enumerate(rds):
        throttle.maybe_wait()
        if i > 0 and i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i, count))
        upload_pdf_to_ia.si(rd).set(queue=q).apply_async()
Example #11
def get_district_attachment_pages(options, rd_pks, tag_names, session):
    """Get the attachment page information for all of the items selected

    :param options: The options returned by argparse. Should have the following
    keys:
     - queue: The celery queue to use
     - offset: The offset to skip
     - limit: The limit to stop after
    :param rd_pks: A list or ValuesList of RECAPDocument PKs to get attachment
    pages for.
    :param tag_names: A list of tags to associate with the downloaded items.
    :param session: A PACER logged-in PacerSession object
    :return: None
    """
    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q)
    for i, rd_pk in enumerate(rd_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        chain(
            get_attachment_page_by_rd.s(rd_pk, session.cookies).set(queue=q),
            make_attachment_pq_object.s(rd_pk, recap_user.pk).set(queue=q),
            process_recap_attachment.s(tag_names=tag_names).set(queue=q),
        ).apply_async()
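A hypothetical call to the function above, assuming a logged-in session and an options dict shaped the way the docstring describes; the queue name, tag, and queryset are placeholders:

session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
session.login()
options = {'queue': 'celery', 'offset': 0, 'limit': 0}
rd_pks = RECAPDocument.objects.filter(
    tags__name='some-tag',
).values_list('pk', flat=True)
get_district_attachment_pages(options, rd_pks, ['some-tag'], session)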
Example #12
    def update_documents(self, documents):
        sys.stdout.write('Graph size is {0:d} nodes.\n'.format(self.count))
        sys.stdout.flush()
        processed_count = 0
        if self.index == 'concurrently':
            index_during_subtask = True
        else:
            index_during_subtask = False
        throttle = CeleryThrottle(min_items=500)
        for doc in documents:
            throttle.maybe_wait()
            update_document.delay(doc, index_during_subtask)
            processed_count += 1
            self.log_progress(processed_count, doc.pk)

        if self.index == 'all_at_end':
            call_command(
                'cl_update_index',
                '--type', 'opinions',
                '--solr-url', settings.SOLR_OPINION_URL,
                '--noinput',
                '--update',
                '--everything',
                '--do-commit',
            )
        elif self.index == 'False':
            sys.stdout.write("Solr index not updated after running citation "
                             "finder. You may want to do so manually.")
Example #13
    def update_documents(self, opinion_pks: Iterable, queue_name: str) -> None:
        sys.stdout.write("Graph size is {0:d} nodes.\n".format(self.count))
        sys.stdout.flush()

        index_during_subtask = False
        if self.index == "concurrently":
            index_during_subtask = True

        chunk = []
        chunk_size = 100
        processed_count = 0
        throttle = CeleryThrottle(queue_name=queue_name)
        for opinion_pk in opinion_pks:
            throttle.maybe_wait()
            processed_count += 1
            last_item = self.count == processed_count
            chunk.append(opinion_pk)
            if processed_count % chunk_size == 0 or last_item:
                find_citations_for_opinion_by_pks.apply_async(
                    args=(chunk, index_during_subtask),
                    queue=queue_name,
                )
                chunk = []

            self.log_progress(processed_count, opinion_pk)
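The example above flushes a batch every chunk_size items and once more for the final partial batch. A generic, self-contained sketch of the same chunked-dispatch idea; the helper and the dispatch callable are illustrative, not part of the original code:

def dispatch_in_chunks(items, total, chunk_size, dispatch):
    chunk = []
    for processed, item in enumerate(items, start=1):
        chunk.append(item)
        if processed % chunk_size == 0 or processed == total:
            dispatch(list(chunk))  # stands in for apply_async
            chunk = []

batches = []
dispatch_in_chunks(range(7), total=7, chunk_size=3, dispatch=batches.append)
assert batches == [[0, 1, 2], [3, 4, 5], [6]]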
Example #14
def add_all_nysd_to_cl(options):
    """Alas, there's only one way to get all the cases about a particular
    judge: Get all the cases in the entire jurisdiction. We do that here using
    the iquery.pl endpoint.

    Once added to the DB we'll ensure they're tagged. In the next step, we'll
    download all the tagged items.
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    # IDs obtained by binary search of docket numbers on PACER website.
    earliest_id = 405990
    latest_id = 543051
    for pacer_case_id in range(earliest_id, latest_id):
        if pacer_case_id < options["skip_until"]:
            continue
        if pacer_case_id >= options["limit"] > 0:
            break

        if pacer_case_id % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        throttle.maybe_wait()
        logger.info("Doing pacer_case_id: %s", pacer_case_id)
        make_docket_by_iquery.apply_async(
            args=("nysd", pacer_case_id, session.cookies, [NYSD_TAG]),
            queue=q,
        )
Example #15
def get_pacer_dockets(options, row_pks, tag=None):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        row = FjcIntegratedDatabase.objects.get(pk=row_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                row.pacer_case_id,
                row.district_id,
                pacer_session,
                **{'tag': tag, 'show_parties_and_counsel': True,
                   'show_terminated_parties': True,
                   'show_list_of_member_cases': True}
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #16
    def process_queryset(self, iterable: Iterable, count: int) -> None:
        """Chunks the queryset passed in, and dispatches it to Celery for
        adding to the index.

        :param iterable: An iterable of items to add to Solr.
        :param count: The number of items that will be processed.
        """
        # The count to send in a single Celery task
        chunk_size = 100

        queue = self.options["queue"]
        start_at = self.options["start_at"]
        # Set low throttle. Higher values risk crashing Redis.
        throttle = CeleryThrottle(queue_name=queue)
        processed_count = 0
        chunk = []
        for item in iterable:
            processed_count += 1
            if processed_count < start_at:
                continue
            last_item = count == processed_count
            chunk.append(item)
            if processed_count % chunk_size == 0 or last_item:
                throttle.maybe_wait()
                add_items_to_solr.apply_async(args=(chunk, self.type),
                                              queue=queue)
                chunk = []
                sys.stdout.write("\rProcessed {}/{} ({:.0%})".format(
                    processed_count, count, processed_count * 1.0 / count))
                self.stdout.flush()
        self.stdout.write("\n")
Example #17
def get_cover_sheets_for_docket(options, docket_pks, tag=None):
    """Get civil cover sheets for dockets in our system."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    cover_sheet_re = re.compile(r'cover\s*sheet', re.IGNORECASE)
    for i, docket_pk in enumerate(docket_pks):
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        try:
            rd_pk = RECAPDocument.objects.get(
                document_number=1,
                docket_entry__docket_id=docket_pk,
            ).pk
        except (RECAPDocument.MultipleObjectsReturned,
                RECAPDocument.DoesNotExist):
            logger.warning("Unable to get document 1 for docket_pk: %s",
                           docket_pk)
        else:
            get_pacer_doc_by_rd_and_description.apply_async(
                args=(
                    rd_pk,
                    cover_sheet_re,
                    pacer_session,
                ),
                kwargs={
                    'tag': tag,
                },
                queue=q,
            )
Example #18
    def process_queryset(self, items, count, chunksize=5):
        """Chunks the queryset passed in, and dispatches it to Celery for
        adding to the index.
        """
        queue = self.options['queue']
        start_at = self.options['start_at']
        # Set low throttle. Higher values risk crashing Redis.
        throttle = CeleryThrottle(min_wait=self.options['min_wait'],
                                  queue_name=queue)
        processed_count = 0
        chunk = []
        for item in items:
            processed_count += 1
            if processed_count < start_at:
                continue
            last_item = (count == processed_count)
            chunk.append(item)
            if processed_count % chunksize == 0 or last_item:
                throttle.maybe_wait()
                add_or_update_items.apply_async(args=(chunk, self.type_str),
                                                queue=queue)
                chunk = []
                sys.stdout.write("\rProcessed {}/{} ({:.0%})".format(
                    processed_count,
                    count,
                    processed_count * 1.0 / count,
                ))
                self.stdout.flush()
        self.stdout.write('\n')
Example #19
def get_dockets(options, items, tags, sample_size=0, doc_num_end=""):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by PACER.
    """

    if sample_size > 0:
        items = items.order_by("?")[:sample_size]

    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params,
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": False,
                    "doc_num_end": doc_num_end,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #20
def do_bulk_export(options):
    """Save selected dockets from 2016 to disk

    This will serialize the items to disk using celery tasks and the IA
    serializer.
    """
    q = options["queue"]
    offset = options["offset"]
    throttle = CeleryThrottle(queue_name=q)
    if offset > 0:
        logger.info("Skipping dockets with PK less than than %s", offset)
    d_pks = (Docket.objects.filter(
        court__jurisdiction=Court.FEDERAL_DISTRICT,
        pk__gt=offset,
        source__in=Docket.RECAP_SOURCES,
        date_filed__gte="2016-01-01",
        date_filed__lte="2016-12-31",
    ).order_by("pk").values_list("pk", flat=True))
    for i, d_pk in enumerate(d_pks):
        if i >= options["limit"] > 0:
            break
        logger.info("Doing item %s with pk %s", i, d_pk)
        throttle.maybe_wait()
        save_ia_docket_to_disk.apply_async(
            args=(d_pk, options["output_directory"]),
            queue=q,
        )
Example #21
    def update_documents(self, documents):
        sys.stdout.write('Graph size is {0:d} nodes.\n'.format(self.count))
        sys.stdout.flush()
        processed_count = 0
        if self.index == 'concurrently':
            index_during_subtask = True
        else:
            index_during_subtask = False
        throttle = CeleryThrottle(min_items=500)
        for doc in documents:
            throttle.maybe_wait()
            update_document.delay(doc, index_during_subtask)
            processed_count += 1
            self.log_progress(processed_count, doc.pk)

        if self.index == 'all_at_end':
            call_command(
                'cl_update_index',
                '--type',
                'opinions',
                '--solr-url',
                settings.SOLR_OPINION_URL,
                '--noinput',
                '--update',
                '--everything',
                '--do-commit',
            )
        elif self.index == 'False':
            sys.stdout.write("Solr index not updated after running citation "
                             "finder. You may want to do so manually.")
Example #22
def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {"pacer_case_id": d.pacer_case_id, "docket_pk": d.pk},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": False,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #23
def join_fjc_with_dockets(options):
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source__in=[CV_2017, CV_2020],
    ).values_list("pk", flat=True).order_by("pk")
    if options["court_id"]:
        idb_rows = idb_rows.filter(district_id=options["court_id"])

    logger.info("%s items will be merged or created.", idb_rows.count())
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    chunk_size = 25
    for i, idb_chunk in enumerate(chunks(idb_rows.iterator(), chunk_size)):
        # Iterate over all items in the IDB and find them in the Docket
        # table. If they're not there, create a new item.
        # Consume the chunk so the iterator works properly
        idb_chunk = list(idb_chunk)
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()
        msg = "%s: Merging/creating new dockets for IDB chunk of %s items"
        logger.info(msg, i, chunk_size)
        create_or_merge_from_idb_chunk.apply_async(args=(idb_chunk,), queue=q)
Example #24
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        do_missing_date_filed = options["do_missing_date_filed"]
        if do_missing_date_filed:
            docket_ids = get_docket_ids_missing_info(do_missing_date_filed)
        else:
            docket_ids = get_docket_ids(last_x_days=options["day_count"])
        # docket_ids = get_docket_ids().union(get_docket_ids_missing_info(100000)) #once initial scrape filling in date_filed is done, uncomment this to do these nightly
        logger.info(
            "iQuery crawling starting up. Will crawl %s dockets",
            len(docket_ids),
        )
        queue = options["queue"]
        throttle = CeleryThrottle(queue_name=queue)
        now = datetime.now().date()
        include_old_terminated = options["include_old_terminated"]
        for i, docket_id in enumerate(docket_ids):
            throttle.maybe_wait()

            if i % 500 == 0:
                logger.info("Sent %s items to celery for crawling so far.", i)

            d = Docket.objects.filter(pk=docket_id).select_related("court")[0]
            too_many_days_old = 90
            terminated_too_long_ago = (
                d.date_terminated
                and (now - d.date_terminated).days > too_many_days_old
            )
            last_filing_too_long_ago = (
                d.date_last_filing
                and (now - d.date_last_filing).days > too_many_days_old
            )
            if all(
                [
                    not include_old_terminated,
                    terminated_too_long_ago,
                    last_filing_too_long_ago,
                    d.date_filed,
                    d.case_name,
                ]
            ):
                # Skip old terminated cases, but do them if we're missing date_filed or case_name
                continue

            if not d.pacer_case_id:
                # No case ID, can't crawl it. Skip.
                continue

            if d.court.jurisdiction not in [
                Court.FEDERAL_DISTRICT,
                Court.FEDERAL_BANKRUPTCY,
            ]:
                # Appeals or other kind of court that got swept up. Punt.
                continue

            update_docket_info_iquery.apply_async(
                args=(docket_id,), queue=queue
            )

        logger.info("Done!")
Example #25
def upload_pdfs_to_internet_archive(options, do_non_free=False):
    """Upload items to the Internet Archive."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        Q(ia_upload_failure_count__lt=3) | Q(ia_upload_failure_count=None),
        is_available=True,
        filepath_ia='',
    ).exclude(
        filepath_local='',
    ).values_list(
        'pk',
        flat=True,
    ).order_by()
    if do_non_free:
        rds = rds.filter(Q(is_free_on_pacer=False) | Q(is_free_on_pacer=None))
    else:
        rds = rds.filter(is_free_on_pacer=True)

    count = rds.count()
    logger.info("Sending %s items to Internet Archive.", count)
    throttle = CeleryThrottle(queue_name=q)
    for i, rd in enumerate(rds):
        throttle.maybe_wait()
        if i > 0 and i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far.", i, count)
        upload_pdf_to_ia.si(rd).set(queue=q).apply_async()
Example #26
def get_final_docs(options):
    """Get any documents that contain "final" in their description."""
    des = (DocketEntry.objects.filter(
        tags__name=TAG,
        description__icontains="final").order_by("pk").iterator())
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, de in enumerate(des):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        logger.info("Doing row %s", i)
        rd_pks = (de.recap_documents.filter(
            document_type=RECAPDocument.PACER_DOCUMENT, ).exclude(
                pacer_doc_id="").order_by("pk").values_list("pk", flat=True))
        for rd_pk in rd_pks:
            throttle.maybe_wait()
            chain(
                get_pacer_doc_by_rd.s(rd_pk,
                                      pacer_session.cookies,
                                      tag=TAG_FINALS).set(queue=q),
                extract_recap_pdf.si(rd_pk).set(queue=q),
                add_items_to_solr.si([rd_pk],
                                     "search.RECAPDocument").set(queue=q),
            ).apply_async()
Example #27
def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': d.pacer_case_id,
                 'docket_pk': d.pk},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                **{'show_parties_and_counsel': True,
                   'show_terminated_parties': True,
                   'show_list_of_member_cases': False}
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #28
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']

    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {
            'rows': page_size,
            'fl': ['id', 'docket_id']
        },
        {
            'group': False,
            'facet': False,
            'highlight': False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warn("Unable to find RECAP Document with id %s",
                        result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
Example #29
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break

            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(result["id"], session.cookies).set(
                    queue=q
                ),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(result["id"], recap_user.pk).set(
                    queue=q
                ),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(
                    queue=q
                ),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
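The else clause on the inner for loop above runs only when that loop finishes without a break, which is how the limit check inside the pagination loop also stops the outer loop. A minimal illustration of the for/else idiom, with placeholder data:

pages = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
seen = []
for page in pages:
    for item in page:
        if item == 5:  # stand-in for "hit the limit"
            break
        seen.append(item)
    else:
        continue  # inner loop finished normally; keep paginating
    break         # inner loop broke; stop the outer loop too
assert seen == [1, 2, 3, 4]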
Example #30
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {
            "rows": page_size,
            "fl": ["id", "docket_id"]
        },
        {
            "group": False,
            "facet": False,
            "highlight": False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result["docket_id"])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
        i += 1
Example #31
def get_dockets(options, items, tags, sample_size=0):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    """

    if sample_size > 0:
        items = items.order_by('?')[:sample_size]

    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #32
def add_all_cases_to_cl(
    options: Dict[str, Union[List[str], int, str, float]],
) -> None:
    """Iterate over courts and gather iquery results from them.

    :param options: The options from the handle method
    :return: None
    """
    q = options["queue"]
    r = make_redis_interface("CACHE")
    # This is a simple dictionary that's populated with the maximum
    # pacer_case_id in the CL DB as of 2021-01-18. The idea is to use this to
    # prevent the scraper from going forever. You can reset it by querying the
    # latest item in the DB by date_filed, and then using r.hmset to save it.
    max_ids = r.hgetall("iquery_max_ids")

    courts = Court.federal_courts.district_pacer_courts().exclude(
        pk__in=["uscfc", "arb", "cit"])
    if options["courts"] != ["all"]:
        courts = courts.filter(pk__in=options["courts"])
    court_ids = list(courts.values_list("pk", flat=True))

    # Create a queue that's a bit longer than the number of courts we're doing
    throttle = CeleryThrottle(queue_name=q, min_items=len(court_ids) * 2)

    iterations_completed = 0
    db_key_cycle = itertools.cycle(settings.DATABASES.keys())
    while (options["iterations"] == 0
           or iterations_completed < options["iterations"]):
        if len(court_ids) == 0:
            # No more courts. Done!
            break

        for court_id in court_ids:
            throttle.maybe_wait()
            try:
                pacer_case_id = r.hincrby("iquery_status", court_id, 1)
                if pacer_case_id > int(max_ids[court_id]):
                    # Enough scraping. Stop doing this court.
                    court_ids.remove(court_id)
                    # Adjust the throttle queue to be shorter.
                    throttle.set_min(len(court_ids) * 2)
                    continue
                make_docket_by_iquery.apply_async(
                    args=(court_id, pacer_case_id, next(db_key_cycle)),
                    queue=q,
                )
            except Exception as e:
                # Cleanup
                r.hincrby("iquery_status", court_id, -1)
                raise e

        iterations_completed += 1
        remaining_iterations = options["iterations"] - iterations_completed
        if remaining_iterations > 0:
            time.sleep(options["iteration_delay"])
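The iquery_max_ids hash read at the top of the function above has to be seeded before the loop does anything useful; the comment suggests resetting it from the most recently filed docket in each court. A hedged sketch of that reset step, reusing names from the example; the exact query is an assumption, not the project's actual command:

r = make_redis_interface("CACHE")
court_ids = list(
    Court.federal_courts.district_pacer_courts().values_list("pk", flat=True)
)
max_ids = {}
for court_id in court_ids:
    # Assumption: the latest docket by date_filed approximates the max case ID.
    latest = (
        Docket.objects.filter(court_id=court_id)
        .exclude(pacer_case_id=None)
        .order_by("-date_filed")
        .first()
    )
    if latest and latest.pacer_case_id:
        max_ids[court_id] = latest.pacer_case_id
if max_ids:
    r.hset("iquery_max_ids", mapping=max_ids)  # redis-py >= 3.5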
Example #33
def get_doc_by_re_and_de_nums_for_dockets(options,
                                          docket_pks,
                                          regex,
                                          de_nums,
                                          fallback=False,
                                          tag=None):
    """Get civil cover sheets for dockets in our system.

    :param options: The options sent on the command line as a dict.
    :param docket_pks: A list of docket pks to iterate over.
    :param regex: A regex to match on the document description on the attachment
    page. For example, to get initial complaints, set this to
    r'initial\s*complaints'.
    :param de_nums: The docket entry numbers to use when looking for items, as a
    list.
    :param fallback: After loading the attachment page, if we don't find
    something that matches `regex`, should we just grab the main document?
    :param tag: A tag to add to any modified content.
    """
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, docket_pk in enumerate(docket_pks):
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        try:
            rds = RECAPDocument.objects.filter(
                document_number__in=de_nums,
                document_type=RECAPDocument.PACER_DOCUMENT,
                docket_entry__docket_id=docket_pk,
            )
        except (RECAPDocument.MultipleObjectsReturned,
                RECAPDocument.DoesNotExist):
            logger.warn("Unable to get document 1 for docket_pk: %s" %
                        docket_pk)
        else:
            for rd in rds:
                get_pacer_doc_by_rd_and_description.apply_async(
                    args=(
                        rd.pk,
                        regex,
                        pacer_session,
                    ),
                    kwargs={
                        'fallback_to_main_doc': fallback,
                        'tag': tag,
                    },
                    queue=q,
                )
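A hypothetical call to the function above, matching the docstring's own initial-complaints example; the docket PKs, entry numbers, and tag are placeholders, and re is assumed to be imported as in Example #17:

get_doc_by_re_and_de_nums_for_dockets(
    options={'queue': 'celery', 'count': 0},  # a count of 0 means no cap
    docket_pks=[1, 2, 3],
    regex=re.compile(r'initial\s*complaints', re.IGNORECASE),
    de_nums=[1],
    fallback=True,
    tag='initial-complaints',
)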
Example #34
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options["input_file"], "r") as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options["queue"]
        task = options["task"]
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options["queue_length"])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            if row["idb_docket_number"]:
                if task == "download_student_dockets":
                    continue
                # Zero-pad the docket number up to seven digits because Excel
                # ate the leading zeros that these would normally have.
                docket_number = row["idb_docket_number"].rjust(7, "0")
            elif row["student_docket_number"]:
                # Use the values collected by student
                # researchers, then cleaned up by mlr.
                docket_number = row["student_docket_number"]
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(
                fjc_court_id=row["AO ID"].rjust(2, "0"),
                jurisdiction=Court.FEDERAL_DISTRICT,
            )
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row["Case Name"],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
Example #35
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options['queue']
        task = options['task']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            if row['idb_docket_number']:
                if task == 'download_student_dockets':
                    continue
                # Zero-pad the docket number up to seven digits because Excel
                # ate the leading zeros that these would normally have.
                docket_number = row['idb_docket_number'].rjust(7, '0')
            elif row['student_docket_number']:
                # Use the values collected by student
            # researchers, then cleaned up by mlr.
                docket_number = row['student_docket_number']
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                      jurisdiction=Court.FEDERAL_DISTRICT)
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row['Case Name'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
Example #36
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break

            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
Example #37
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1
Example #38
def upload_oral_arguments_to_internet_archive(options):
    """Upload oral arguments to the Internet Archive"""
    q = options['queue']
    af_pks = Audio.objects.filter(
        Q(ia_upload_failure_count__lt=3) | Q(ia_upload_failure_count=None),
        filepath_ia='',
    ).exclude(
        local_path_mp3='',
    ).values_list('pk', flat=True).order_by()
    count = len(af_pks)
    logger.info("Sending %s oral argument files to Internet Archive", count)
    throttle = CeleryThrottle(queue_name=q)
    for i, af_pk in enumerate(af_pks):
        throttle.maybe_wait()
        if i > 0 and i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far.", i, count)
        upload_audio_to_ia.si(af_pk).set(queue=q).apply_async()
Example #39
def do_ocr(options):
    """Do the OCR for any items that need it, then save to the solr index."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        ocr_status=RECAPDocument.OCR_NEEDED,
    ).values_list('pk', flat=True).order_by()
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options['index']:
            extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_or_update_recap_document.s(coalesce_docket=True).set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i + 1, count))
def get_dockets(options):
    """Download the dockets described in the CSV
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    pacer_session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=make_docket_number(row['filecy'], row['docket']),
                court_id='ilnb',
                cookies=pacer_session.cookies,
                office_number=row['office'],
                docket_number_letters='bk',
            ).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id='ilnb',
                cookies=pacer_session.cookies,
                tag_names=[TAG],
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
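# Hedged sketch of the CSV layout this loader appears to expect, inferred
# only from the row keys used above ('filecy', 'office', 'docket'). The
# values are hypothetical, and make_docket_number() is assumed to combine
# the filing year and sequence number into a PACER-style docket number.
def _sample_csv_sketch():
    import csv
    import io
    sample = io.StringIO(
        "filecy,office,docket\n"
        "2009,1,12345\n"
    )
    for sample_row in csv.DictReader(sample):
        print(sample_row['filecy'], sample_row['office'], sample_row['docket'])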
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s('search.RECAPDocument').set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
def upload_to_internet_archive(options):
    """Upload items to the Internet Archive."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        is_free_on_pacer=True,
        is_available=True,
        filepath_ia='',
    ).exclude(
        filepath_local='',
    ).values_list(
        'pk',
        flat=True,
    ).order_by()
    count = rds.count()
    logger.info("Sending %s items to Internet Archive." % count)
    throttle = CeleryThrottle(queue_name=q)
    for i, rd in enumerate(rds):
        throttle.maybe_wait()
        if i > 0 and i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i, count))
        upload_free_opinion_to_ia.si(rd).set(queue=q).apply_async()
def update_any_missing_pacer_case_ids(options):
    """The network requests were making things far too slow and had to be
    disabled during the first pass. With this method, we update any items
    that are missing their pacer case ID value.
    """
    ds = Docket.objects.filter(
        idb_data__isnull=False,
        pacer_case_id=None,
    )
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, d in enumerate(queryset_generator(ds)):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        throttle.maybe_wait()
        logger.info("Getting pacer_case_id for item %s", d)
        params = make_fjc_idb_lookup_params(d.idb_data)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=d.pk,
                docket_number=d.idb_data.docket_number,
                court_id=d.idb_data.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            update_docket_from_hidden_api.s().set(queue=q),
        ).apply_async()
Exemple #44
0
def extract_recap_documents(docs, skip_ocr=False, order_by=None, queue=None):
    """Loop over RECAPDocuments and extract their contents. Use OCR if requested.

    :param docs: A queryset containing the RECAPDocuments to be processed.
    :type docs: Django Queryset
    :param skip_ocr: Whether OCR should be completed (False) or whether items
    should simply be updated to have status OCR_NEEDED.
    :type skip_ocr: Bool
    :param order_by: An optimization parameter. You may opt to order the
    processing by 'small-first' or 'big-first'.
    :type order_by: str
    :param queue: The celery queue to send the content to.
    :type queue: str
    """
    docs = docs.exclude(filepath_local='')
    if skip_ocr:
        # Focus on the items where we don't yet know whether they need OCR.
        docs = docs.filter(ocr_status=None)
    else:
        # We're doing OCR. Only work with those items that require it.
        docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED)

    if order_by is not None:
        if order_by == 'small-first':
            docs = docs.order_by('page_count')
        elif order_by == 'big-first':
            docs = docs.order_by('-page_count')

    count = docs.count()
    throttle = CeleryThrottle(queue_name=queue)
    for i, pk in enumerate(docs.values_list('pk', flat=True)):
        throttle.maybe_wait()
        extract_recap_pdf.apply_async((pk, skip_ocr), priority=5, queue=queue)
        if i % 1000 == 0:
            msg = "Sent %s/%s tasks to celery so far." % (i + 1, count)
            logger.info(msg)
            sys.stdout.write("\r%s" % msg)
            sys.stdout.flush()
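# Hedged usage sketch for extract_recap_documents(). The queryset and queue
# name are hypothetical; the keyword arguments mirror the docstring above.
def _extract_usage_sketch():
    docs = RECAPDocument.objects.filter(is_available=True)
    extract_recap_documents(
        docs,
        skip_ocr=True,            # just mark items OCR_NEEDED; don't OCR yet
        order_by='small-first',   # process documents with fewer pages first
        queue='celery',           # hypothetical Celery queue name
    )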
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = RECAPDocument.objects.filter(
        tags__name=TAG,
        document_number='1',
        document_type=RECAPDocument.PACER_DOCUMENT,
    ).exclude(
        pacer_doc_id='',
    ).order_by('pk').values_list('pk', flat=True).iterator()
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()

        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
def do_first_pass(options):
    """Iterate over all items in the IDB and find them in the Docket table.
    If they're not there, create a new item.
    """
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source=CV_2017,
    ).order_by('pk')
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, idb_row in enumerate(queryset_generator(idb_rows)):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        throttle.maybe_wait()
        ds = Docket.objects.filter(
            docket_number_core=idb_row.docket_number,
            court=idb_row.district,
        )
        count = ds.count()
        if count == 0:
            logger.info("%s: Creating new docket for IDB row: %s",
                        i, idb_row)
            create_new_docket_from_idb.apply_async(
                args=(idb_row.pk,),
                queue=q,
            )
        elif count == 1:
            d = ds[0]
            logger.info("%s: Merging Docket %s with IDB row: %s",
                        i, d, idb_row)
            merge_docket_with_idb.apply_async(args=(d.pk, idb_row.pk),
                                              queue=q)
        elif count > 1:
            logger.warn("%s: Unable to merge. Got %s dockets for row: %s",
                        i, count, idb_row)
def get_dockets(options):
    """Download the dockets described in the CSV according to the `tasks`
    option.
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    task = options['task']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if row['Too Old'] == 'Yes':
            continue
        if row['Appellate/District'].lower() != task:
            # Only do appellate when appellate, and district when district.
            continue

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        if task == 'appellate':
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_docket_entries': True,
                        'show_orig_docket': True,
                        'show_prior_cases': True,
                        'show_associated_cases': True,
                        'show_panel_info': True,
                        'show_party_atty_info': True,
                        'show_caption': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        elif task == 'district':
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    case_name=row['Title'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_parties_and_counsel': True,
                        'show_terminated_parties': True,
                        'show_list_of_member_cases': True
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
def upload_recap_data(options):
    """Upload RECAP data to Internet Archive."""
    q = options['queue']
    database = options['database']
    r = redis.StrictRedis(host=settings.REDIS_HOST,
                          port=settings.REDIS_PORT,
                          db=settings.REDIS_DATABASES['CACHE'])
    redis_key = 'recap-docket-last-id'
    last_pk = r.getset(redis_key, 0)
    ds = Docket.objects.filter(
        Q(ia_upload_failure_count__lte=3) | Q(ia_upload_failure_count=None),
        ia_needs_upload=True,
        source__in=Docket.RECAP_SOURCES,
        pk__gt=last_pk,
    ).order_by('pk').only('pk')

    chunk_size = 100  # Small to save memory
    i = 0
    previous_i = None
    delay_count = 0
    t1 = now()
    logger.info("Sending recap dockets to Internet Archive")
    throttle = CeleryThrottle(queue_name=q, min_items=5)
    while True:
        # Start of quarter needs to be re-analyzed every time through the loop.
        # This ensures that if the quarter changes while this runs, we get the
        # new value.
        params = {
            'pk__gt': last_pk,
            'ia_date_first_change__lt': get_start_of_quarter(),
        }
        for d in ds.filter(**params)[:chunk_size]:
            throttle.maybe_wait()
            upload_recap_json.apply_async(args=(d.pk, database), queue=q)
            i += 1
            if i % 100 == 0:
                # Print a useful log line with expected finish date.
                t2 = now()
                elapsed_minutes = float((t2 - t1).seconds) / 60
                try:
                    rate = i / float(elapsed_minutes)
                    logger.info("Uploaded %s dockets to IA so far (%.01f/m)",
                                i, rate)
                except ZeroDivisionError:
                    # First lap through can be completed in less than 1s.
                    pass
            last_pk = d.pk
            r.set(redis_key, last_pk)

        # Detect whether we've hit the end of the queryset and reset if so.
        # We do this by remembering the value of i from the previous pass
        # through the for loop. If i hasn't changed after the for loop runs
        # again, we know the loop did no work, so we've reached the end and
        # should reset.
        empty_loop = i == previous_i
        if empty_loop:
            # i is the same as the last time the
            # for loop finished. Reset things.
            if last_pk == 0:
                # We went through the for loop a second time and still didn't
                # do anything. Stall with a capped backoff.
                delay_count += 1
                max_delay = 60 * 30  # Thirty minutes
                delay = min(delay_count * 60, max_delay)
                time.sleep(delay)
            else:
                delay_count = 0
                last_pk = 0
                r.set(redis_key, 0)
        else:
            previous_i = i
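# get_start_of_quarter() is not defined in this snippet. A hedged sketch of
# what the resume logic above appears to rely on: the first day of the
# current calendar quarter, so dockets already refreshed this quarter are
# skipped.
from datetime import date

def get_start_of_quarter_sketch(d=None):
    d = d or date.today()
    first_month_of_quarter = 3 * ((d.month - 1) // 3) + 1
    return date(d.year, first_month_of_quarter, 1)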
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    f = open(options['input_file'], 'r')
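    # csv.Sniffer guesses the delimiter and quoting style from a sample of
    # the file; we then rewind so DictReader starts from the first row.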
    dialect = csv.Sniffer().sniff(f.read(2048))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        throttle.maybe_wait()
        logger.info("Doing row %s: %s", i, row)

        row_tag = '%s-%s' % (PROJECT_TAG_NAME, row['id'])
        if not row['district_ct']:
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['docket_no1'],
                    court_id=row['cl_court'],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    # Do not get the docket entries for now. We're only
                    # interested in the date terminated. If it's an open case,
                    # we'll handle that later.
                    **{
                        'show_docket_entries': False,
                        'show_orig_docket': False,
                        'show_prior_cases': False,
                        'show_associated_cases': False,
                        'show_panel_info': True,
                        'show_party_atty_info': True,
                        'show_caption': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        else:
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['docket_no1'],
                    court_id=row['cl_court'],
                    cookies=session.cookies,
                    case_name=row['name'],
                ).set(queue=q),
                do_case_query_by_pacer_case_id.s(
                    court_id=row['cl_court'],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row['cl_court'],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    **{
                        # No docket entries
                        'doc_num_start': 10000,
                        'doc_num_end': 10000,
                        'show_parties_and_counsel': True,
                        'show_terminated_parties': True,
                        'show_list_of_member_cases': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()

    f.close()
def download_documents(options):
    """We've got good values in the new columns, so just need to look those up,
    and get the documents from PACER.
    """
    f = open(options['input_file'], 'r')
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = row['cl_d_docket_number'] or \
            row['cl_d_docket_number (student)'] or \
            None

        if not docket_number:
            logger.warn("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                  jurisdiction=Court.FEDERAL_DISTRICT)

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warn("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warn("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warn("No docket entries found for row: %s", i)
            continue
        elif count == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warn("Unable to get pacer_doc_id for item with "
                                "rd_pk: %s. Restricted document?", rd.pk)
                    continue
                if options['task'] == 'add_extra_tags':
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(
                            rd.pk, session.cookies, tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], 'search.RECAPDocument').set(queue=q),
                    ).apply_async()
    f.close()
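# filter_des() is not defined in this snippet. A purely hypothetical sketch
# of the kind of filtering it might do when several docket entries share the
# same filing date: keep the entries whose description looks like the
# document we're after. The keyword list is an assumption, not the real rules.
def filter_des_sketch(des):
    keywords = ('opinion', 'order', 'judgment')
    return [de for de in des
            if any(k in (de.description or '').lower() for k in keywords)]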