# Imports and module-level setup these snippets assume. The Django and
# Celery imports are standard; the cl.* paths follow CourtListener's layout
# but are assumptions here. PACER_USERNAME, PACER_PASSWORD, TAG,
# TAG_PHASE_1, and Q_DOCS_ONLY are module constants defined elsewhere in
# the original.
import logging

from celery.canvas import chain
from django.conf import settings
from django.contrib.auth.models import User
from django.core.paginator import Paginator
from juriscraper.pacer import PacerSession

from cl.corpus_importer.tasks import (
    get_attachment_page_by_rd,
    make_attachment_pq_object,
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.scorched_utils import ExtraSolrInterface
from cl.lib.search_utils import build_main_query_from_query_string
from cl.recap.tasks import process_recap_attachment
from cl.search.models import Court, RECAPDocument

logger = logging.getLogger(__name__)


def get_district_attachment_pages(options, rd_pks, tag_names, session):
    """Get the attachment page information for all of the items selected.

    :param options: The options returned by argparse. Should have the
    following keys:
     - queue: The celery queue to use
     - offset: The offset to skip
     - limit: The limit to stop after
    :param rd_pks: A list or ValuesList of RECAPDocument PKs to get
    attachment pages for.
    :param tag_names: A list of tags to associate with the downloaded items.
    :param session: A logged-in PacerSession object.
    :return: None
    """
    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q)
    for i, rd_pk in enumerate(rd_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        chain(
            get_attachment_page_by_rd.s(rd_pk, session.cookies).set(queue=q),
            make_attachment_pq_object.s(rd_pk, recap_user.pk).set(queue=q),
            process_recap_attachment.s(tag_names=tag_names).set(queue=q),
        ).apply_async()

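# A minimal sketch of the ``options`` dict the docstring above describes,
# for calling the function outside a management command. The values are
# illustrative assumptions. Note the chained comparison in the loop:
# ``i >= limit > 0`` only breaks when limit is positive, so 0 disables it.
example_options = {
    'queue': 'celery',  # hypothetical queue name
    'offset': 0,        # skip nothing
    'limit': 0,         # 0 means no limit
}
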
def get_district_attachment_pages(options):
    """Get the attachment page information for all of the items on the
    dockets

    :param options: The options returned by argparse.
    :type options: dict
    """
    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list('pk', flat=True)
    for i, rd_pk in enumerate(rd_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 100 == 0:
            logger.info("Doing item %s: %s", i, rd_pk)
        throttle.maybe_wait()
        chain(
            get_attachment_page_by_rd.s(rd_pk, session.cookies).set(queue=q),
            make_attachment_pq_object.s(rd_pk, recap_user.pk).set(queue=q),
            process_recap_attachment.s(tag_names=[TAG]).set(queue=q),
        ).apply_async()

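# How the chains above pass data, as a self-contained Celery sketch: a
# chain prepends each task's return value to the next signature's
# arguments, so make_attachment_pq_object receives the attachment page
# returned by get_attachment_page_by_rd ahead of its own rd_pk/user_pk
# arguments. The two tasks below are hypothetical demos, not part of the
# original module.
from celery import Celery

demo_app = Celery('demo', broker='memory://')


@demo_app.task
def fetch(pk):
    return 'page-for-%s' % pk


@demo_app.task
def build(page_text, pk, user_pk):
    return page_text, pk, user_pk


# .apply() runs the chain eagerly in-process; build() gets the fetch()
# result prepended, yielding ('page-for-1', 1, 42).
demo_result = chain(fetch.s(1), build.s(1, 42)).apply().get()
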
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break
            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result["id"], session.cookies
                ).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result["id"], recap_user.pk
                ).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(
                    queue=q
                ),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break

def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)
    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break
            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(
                    tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break

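# A sketch of the management-command wiring these functions expect. The
# option names are inferred from the dict keys used above ('queue',
# 'offset', 'limit', 'queue_length'); the defaults are illustrative
# assumptions, not the original command's.
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Get attachment pages for the matched RECAP documents."

    def add_arguments(self, parser):
        parser.add_argument('--queue', default='celery',
                            help='The celery queue to use.')
        parser.add_argument('--offset', type=int, default=0,
                            help='Skip this many items before starting.')
        parser.add_argument('--limit', type=int, default=0,
                            help='Stop after this many items (0 = no limit).')
        parser.add_argument('--queue-length', type=int, default=100,
                            help='Minimum enqueued items for the throttle.')

    def handle(self, *args, **options):
        get_attachment_pages(options)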