Example 1
def get_district_attachment_pages(options, rd_pks, tag_names, session):
    """Get the attachment page information for all of the items selected

    :param options: The options returned by argparse. Should have the following
    keys:
     - queue: The Celery queue to use
     - offset: The number of items to skip before processing begins
     - limit: Stop after this many items (a value of 0 disables the limit)
    :param rd_pks: A list or ValuesList of RECAPDocument PKs to get attachment
    pages for.
    :param tag_names: A list of tags to associate with the downloaded items.
    :param session: A PACER logged-in PacerSession object
    :return: None
    """
    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q)
    for i, rd_pk in enumerate(rd_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        chain(
            # Query the attachment page and process it
            get_attachment_page_by_rd.s(rd_pk, session.cookies).set(queue=q),
            # Take that in a new task and make a PQ object
            make_attachment_pq_object.s(rd_pk, recap_user.pk).set(queue=q),
            # And then process that using the normal machinery.
            process_recap_attachment.s(tag_names=tag_names).set(queue=q),
        ).apply_async()
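
For orientation, here is a minimal sketch of how this helper might be driven. The queue name and tag are hypothetical placeholders; the session setup and the RECAPDocument query mirror Example 2 below.

# Hypothetical driver (queue name and tag are placeholders; the
# credentials and queryset follow the pattern in Example 2).
options = {'queue': 'pacerdoc1', 'offset': 0, 'limit': 0}  # limit=0: no cap
session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
session.login()
rd_pks = RECAPDocument.objects.filter(
    tags__name='my-tag',
).values_list('pk', flat=True)
get_district_attachment_pages(options, rd_pks, ['my-tag'], session)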
Example 2
def get_district_attachment_pages(options):
    """Get the attachment page information for all of the items on the dockets

    :param options: The options returned by argparse.
    :type options: dict
    """
    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list('pk', flat=True)
    for i, rd_pk in enumerate(rd_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 100 == 0:
            logger.info("Doing item %s: %s", i, rd_pk)
        throttle.maybe_wait()
        chain(
            # Query the attachment page and process it
            get_attachment_page_by_rd.s(rd_pk, session.cookies).set(queue=q),
            # Take that in a new task and make a PQ object
            make_attachment_pq_object.s(rd_pk, recap_user.pk).set(queue=q),
            # And then process that using the normal machinery.
            process_recap_attachment.s(tag_names=[TAG]).set(queue=q),
        ).apply_async()
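
Note that the guard "if i >= options['limit'] > 0" used in these examples relies on Python's chained comparisons: it is equivalent to (i >= limit) and (limit > 0), so a non-positive limit disables the cap entirely. A standalone illustration:

# Chained comparison: i >= limit > 0 means (i >= limit) and (limit > 0).
limit = 0
assert not any(i >= limit > 0 for i in range(10))  # limit of 0 never breaks
limit = 3
assert 5 >= limit > 0  # with a positive limit, item 5 would stop the loop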
Example 3
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break

            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(result["id"], session.cookies).set(
                    queue=q
                ),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(result["id"], recap_user.pk).set(
                    queue=q
                ),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(
                    queue=q
                ),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
Example 4
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break

            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
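
The trailing else/break pair in both versions of this function is the standard Python idiom for propagating a break out of nested loops: the else clause of a for loop runs only when the loop completes without breaking. A generic sketch with illustrative names:

# Generic nested-loop break propagation (names are illustrative only).
for page in range(3):
    for item in range(5):
        if page == 1 and item == 2:
            break      # leave the inner loop early
    else:
        continue       # inner loop finished normally; go to the next page
    break              # inner loop broke, so stop the outer loop as well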