Example #1
0
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options["queue"]
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                "Sent %s/%s tasks to celery for %s so "
                "far." % (completed, count, task_name)
            )
Example #2
0
def get_pdfs(options: OptionsType) -> None:
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = cast(str, options["queue"])
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info(f"{task_name} {count} items from PACER.")
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in rows.iterator():
        throttle.maybe_wait()
        c = chain(
            process_free_opinion_result.si(
                row.pk,
                row.court_id,
                cnt,
            ).set(queue=q),
            get_and_process_free_pdf.s(row.pk, row.court_id).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                f"Sent {completed}/{count} tasks to celery for {task_name} so far."
            )
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s('search.RECAPDocument').set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))