def query_dockets(query_string):
    """Identify the d_pks for all the dockets that we need to export

    :param query_string: The query to run as a URL-encoded string (typically
    starts with 'q='). E.g. 'q=foo&type=r&order_by=dateFiled+asc&court=dcd'
    :return: a set of docket PKs to export
    """
    main_query = build_main_query_from_query_string(
        query_string,
        {"fl": ["docket_id"]},
        {"group": True, "facet": False, "highlight": False},
    )
    main_query["group.limit"] = 0
    main_query["sort"] = "dateFiled asc"
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    search = si.query().add_extra(**main_query)
    page_size = 1000
    paginator = Paginator(search, page_size)
    d_pks = set()
    for page_number in paginator.page_range:
        page = paginator.page(page_number)
        for item in page:
            d_pks.add(item["groupValue"])
    logger.info(
        "After %s pages, got back %s results.",
        len(paginator.page_range),
        len(d_pks),
    )
    return d_pks
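# Usage sketch (illustrative, not from the original module): query_dockets
# takes the same URL-encoded query string used by the search front end and
# materializes every matching docket PK in memory, which suits one-shot
# exports. The query value and the export_example name are hypothetical.
def export_example():
    d_pks = query_dockets("q=foo&type=r&order_by=dateFiled+asc&court=dcd")
    logger.info("Will export %s dockets.", len(d_pks))
    return d_pks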
def docket_pks_for_query(query_string):
    """Yield docket PKs for a query by iterating over the full result set

    :param query_string: The query to run as a URL-encoded string (typically
    starts with 'q='). E.g. 'q=foo&type=r&order_by=dateFiled+asc&court=dcd'
    :return: The next docket PK in the results
    """
    main_query = build_main_query_from_query_string(
        query_string,
        {"fl": ["docket_id"]},
        {"group": True, "facet": False, "highlight": False},
    )
    main_query["group.limit"] = 0
    main_query["sort"] = "dateFiled asc"
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    search = si.query().add_extra(**main_query)
    si.conn.http_connection.close()
    page_size = 100
    paginator = Paginator(search, page_size)
    for page_number in paginator.page_range:
        page = paginator.page(page_number)
        for item in page:
            yield item["groupValue"]
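# Usage sketch (illustrative, not from the original module): unlike
# query_dockets above, docket_pks_for_query is a generator, so a caller can
# start working on each docket as soon as its PK arrives instead of holding
# the whole result set in memory. process_docket is a hypothetical callback.
def process_query_lazily(query_string, process_docket):
    for i, d_pk in enumerate(docket_pks_for_query(query_string)):
        logger.info("Processing docket %s (result %s)", d_pk, i)
        process_docket(d_pk)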
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False, 'highlight': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        # A limit of 0 means no limit; the chained comparison only trips when
        # a positive limit was given.
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s",
                    i, result['id'], result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warning("Unable to find RECAP Document with id %s",
                           result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break
            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(result["id"], session.cookies).set(
                    queue=q
                ),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(result["id"], recap_user.pk).set(
                    queue=q
                ),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(
                    queue=q
                ),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
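# Illustrative call (not from the original source): get_attachment_pages reads
# "queue", "offset", and "limit" from the options dict, which normally comes
# from the management command's argument parser. The values below are
# hypothetical; a limit of 0 means "no limit".
def run_attachment_page_scrape():
    example_options = {"queue": "celery", "offset": 0, "limit": 0}
    get_attachment_pages(example_options)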
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    logger.info(f"Using PACER username: {PACER_USERNAME}")
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {"rows": 10000, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    docket_ids = get_docket_ids(main_query)
    get_pacer_dockets(options, docket_ids, [BAL_TAG, BAL_TAG_2019])
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    logger.info("Using PACER username: %s" % PACER_USERNAME)
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {'rows': 10000, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    docket_ids = get_docket_ids(main_query)
    get_pacer_dockets(options, docket_ids, [BAL_TAG, BAL_TAG_2019])
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info(
            "Doing item %s w/rd: %s, d: %s", i, rd.pk, result["docket_id"]
        )

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_PHASE_2).set(
                queue=q
            ),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
        i += 1
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break
            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    logger.info("Using PACER username: %s" % PACER_USERNAME)
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {'rows': 10000, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    docket_ids = get_docket_ids(main_query)
    get_pacer_dockets(options, docket_ids, BAL_TAG)