Example #1
def query_dockets(query_string):
    """Identify the d_pks for all the dockets that we need to export

    :param query_string: The query to run as a URL-encoded string (typically starts
     with 'q='). E.g. 'q=foo&type=r&order_by=dateFiled+asc&court=dcd'
    :return: a set of docket PKs to export
    """
    main_query = build_main_query_from_query_string(
        query_string,
        {"fl": ["docket_id"]},
        {"group": True, "facet": False, "highlight": False},
    )
    main_query["group.limit"] = 0
    main_query["sort"] = "dateFiled asc"
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    search = si.query().add_extra(**main_query)
    page_size = 1000
    paginator = Paginator(search, page_size)
    d_pks = set()
    for page_number in paginator.page_range:
        page = paginator.page(page_number)
        for item in page:
            d_pks.add(item["groupValue"])
    logger.info(
        "After %s pages, got back %s results.",
        len(paginator.page_range),
        len(d_pks),
    )
    return d_pks
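
The pattern above pages through grouped Solr results and collects the distinct docket ID of each group. A minimal, runnable sketch of the same loop, with a plain list of dicts standing in for the Solr search object (only the `groupValue` key matters to this code):

# Sketch only: fake_results stands in for the grouped Solr response.
from django.core.paginator import Paginator

fake_results = [{"groupValue": pk} for pk in (1, 2, 2, 3)]  # note the duplicate
paginator = Paginator(fake_results, 2)  # page_size=2 for the demo

d_pks = set()
for page_number in paginator.page_range:
    for item in paginator.page(page_number):
        d_pks.add(item["groupValue"])  # the set deduplicates docket PKs

assert d_pks == {1, 2, 3}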
Example #2
def docket_pks_for_query(query_string):
    """Yield docket PKs for a query by iterating over the full result set

    :param query_string: The query to run as a URL-encoded string (typically
    starts with 'q='). E.g. 'q=foo&type=r&order_by=dateFiled+asc&court=dcd'
    :return: The next docket PK in the results
    """
    main_query = build_main_query_from_query_string(
        query_string,
        {"fl": ["docket_id"]},
        {
            "group": True,
            "facet": False,
            "highlight": False
        },
    )
    main_query["group.limit"] = 0
    main_query["sort"] = "dateFiled asc"
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    search = si.query().add_extra(**main_query)
    si.conn.http_connection.close()
    page_size = 100
    paginator = Paginator(search, page_size)
    for page_number in paginator.page_range:
        page = paginator.page(page_number)
        for item in page:
            yield item["groupValue"]
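
Because this variant is a generator, callers can stream docket PKs one page at a time instead of materializing the whole result set, which is the difference from Example #1. A hypothetical caller (the loop body is a placeholder, not part of the original code):

# Hypothetical usage: consume docket PKs lazily as they are yielded.
for d_pk in docket_pks_for_query("q=foo&type=r&order_by=dateFiled+asc&court=dcd"):
    print(d_pk)  # stand-in for real per-docket work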
Example #3
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']

    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {
            'rows': page_size,
            'fl': ['id', 'docket_id']
        },
        {
            'group': False,
            'facet': False,
            'highlight': False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warning("Unable to find RECAP Document with id %s",
                           result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
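
The `chain` links the three tasks so each runs only after the previous one finishes: `.s()` builds a signature that receives the prior task's result, while `.si()` builds an immutable signature that ignores it, which is why `rd.pk` is passed explicitly to the later tasks. A toy sketch of the same idiom, assuming a local Celery app (the broker URL and task names are illustrative, not from the original code):

from celery import Celery, chain

app = Celery("demo", broker="memory://")  # illustrative in-memory broker

@app.task
def fetch(pk):
    return pk

@app.task
def extract(pk):  # ignores fetch's result; pk arrives explicitly via .si()
    return pk

# fetch.s(42) forwards its return value down the chain; extract.si(42)
# ignores it, mirroring get_pacer_doc_by_rd.s(...) / extract_recap_pdf.si(...).
chain(fetch.s(42), extract.si(42)).apply_async()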
Example #4
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break

            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(result["id"], session.cookies).set(
                    queue=q
                ),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(result["id"], recap_user.pk).set(
                    queue=q
                ),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(
                    queue=q
                ),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
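
The trailing `for ... else` is the standard idiom for breaking out of nested loops: the `else` clause runs only when the inner loop finishes without a `break`, so an inner `break` falls through to the outer one. A minimal demonstration:

# The else clause runs only when the inner loop completes without break.
for page in [[1, 2], [3, 4], [5, 6]]:
    for item in page:
        if item == 3:  # stand-in for the offset/limit check above
            break
    else:
        continue  # inner loop finished normally; fetch the next page
    break  # inner loop broke; stop paging entirely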
Example #5
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     logger.info(f"Using PACER username: {PACER_USERNAME}")
     main_query = build_main_query_from_query_string(
         QUERY_STRING,
         {"rows": 10000, "fl": ["id", "docket_id"]},
         {"group": False, "facet": False, "highlight": False},
     )
     docket_ids = get_docket_ids(main_query)
     get_pacer_dockets(options, docket_ids, [BAL_TAG, BAL_TAG_2019])
Example #6
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     logger.info("Using PACER username: %s" % PACER_USERNAME)
     main_query = build_main_query_from_query_string(
         QUERY_STRING,
         {'rows': 10000, 'fl': ['id', 'docket_id']},
         {'group': False, 'facet': False},
     )
     docket_ids = get_docket_ids(main_query)
     get_pacer_dockets(options, docket_ids, [BAL_TAG, BAL_TAG_2019])
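
Both variants format the log message eagerly: the f-string and the `%` operator run before `logger.info` is even called. The lazy form used elsewhere in these examples defers formatting to the logging framework, which skips the work when the level is disabled:

# Lazy %-style logging: formatting happens only if the record is emitted.
logger.info("Using PACER username: %s", PACER_USERNAME)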
Example #7
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {
            "rows": page_size,
            "fl": ["id", "docket_id"]
        },
        {
            "group": False,
            "facet": False,
            "highlight": False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result["docket_id"])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
        i += 1
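
The guard `if i >= options["limit"] > 0` used throughout these examples is a chained comparison, equivalent to `i >= limit and limit > 0`, so passing a limit of 0 disables the cap. A quick check of that behavior:

# Chained comparison: i >= limit > 0 means (i >= limit) and (limit > 0).
def hit_limit(i, limit):
    return i >= limit > 0

assert hit_limit(5, 3) is True   # past a positive limit: stop
assert hit_limit(5, 0) is False  # a limit of 0 means "no limit"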
Example #8
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1
Example #9
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break

            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
Example #10
 def handle(self, *args, **options):
     super(Command, self).handle(*args, **options)
     logger.info("Using PACER username: %s" % PACER_USERNAME)
     main_query = build_main_query_from_query_string(
         QUERY_STRING,
         {
             'rows': 10000,
             'fl': ['id', 'docket_id']
         },
         {
             'group': False,
             'facet': False
         },
     )
     docket_ids = get_docket_ids(main_query)
     get_pacer_dockets(options, docket_ids, BAL_TAG)
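
Across all of these examples the call shape is the same: `build_main_query_from_query_string(query_string, extra_params, feature_flags)`, where the second argument supplies plain Solr parameters (`rows`, `fl`) and the third toggles grouping, faceting, and highlighting. A hypothetical illustration of the kind of dict the callers then mutate; this is an assumption about the shape, not the helper's actual output:

# Assumed shape only: callers treat main_query as a plain dict of Solr
# parameters that can be mutated before the query runs.
main_query = {
    "q": "foo",
    "fl": ["id", "docket_id"],
    "rows": 10000,
    "group": False,
    "facet": False,
    "highlight": False,
}
main_query["sort"] = "dateFiled asc"  # as in Examples #1 and #2
main_query["group.limit"] = 0         # grouped variants cap group members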