def get_attachment_pages(options, tag):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=tag,
        docket_entry__description__icontains='attachment',
    ).values_list('pk', flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(options=options, rd_pks=rd_pks,
                                  tag_names=[tag], session=session)
Example #2
def get_attachment_pages(options, tag):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=tag,
        docket_entry__description__icontains="attachment").values_list(
            "pk", flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(options=options,
                                  rd_pks=rd_pks,
                                  tag_names=[tag],
                                  session=session)
def get_att_pages(options):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list('pk', flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(options=options, rd_pks=rd_pks,
                                  tag_names=[TAG], session=session)
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {
            'rows': page_size,
            'fl': ['id', 'docket_id']
        },
        {
            'group': False,
            'facet': False
        },
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_or_update_recap_document.si([rd.pk]).set(queue=q),
        ).apply_async()
        i += 1
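Nearly every loop in these commands guards with options['offset'] and options['limit'] using the chained comparison i >= limit > 0, which only breaks once a positive limit has been reached (a limit of 0 means "no limit"). A minimal standalone sketch of that pattern, with purely illustrative names:

def iterate_window(items, offset=0, limit=0):
    """Yield (i, item) pairs, skipping the first `offset` items and stopping
    once a positive `limit` is reached. A limit of 0 means no limit."""
    for i, item in enumerate(items):
        if i < offset:
            continue
        if i >= limit > 0:  # chained comparison: i >= limit AND limit > 0
            break
        yield i, item

# Example: only indices 2, 3 and 4 are yielded.
print(list(iterate_window("abcdefg", offset=2, limit=5)))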
Example #5
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options['queue']
        task = options['task']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            if row['idb_docket_number']:
                if task == 'download_student_dockets':
                    continue
                # Zero-pad the docket number up to seven digits because Excel
                # ate the leading zeros that these would normally have.
                docket_number = row['idb_docket_number'].rjust(7, '0')
            elif row['student_docket_number']:
                # Use the values collected by student
                # researchers, then cleaned up by mlr.
                docket_number = row['student_docket_number']
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                      jurisdiction=Court.FEDERAL_DISTRICT)
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row['Case Name'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
Example #7
def get_att_pages(options):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list("pk", flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(
        options=options, rd_pks=rd_pks, tag_names=[TAG], session=session
    )
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info(
            "Doing item %s w/rd: %s, d: %s", i, rd.pk, result["docket_id"]
        )

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_PHASE_2).set(
                queue=q
            ),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
        i += 1
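The chains above lean on a few parts of the Celery canvas API: .s() builds a signature whose first argument can be filled by the previous task's result, .si() builds an immutable signature that ignores the parent result, .set(queue=q) routes each link to a queue, and .apply_async() submits the whole chain. A minimal self-contained sketch with toy tasks (the app name, broker, and task names are illustrative, not from this project):

from celery import Celery, chain

app = Celery("sketch", broker="memory://")  # illustrative broker only

@app.task
def fetch(pk):
    # Pretend to download something and hand back its pk.
    return pk

@app.task
def extract(pk):
    print(f"extracting {pk}")

@app.task
def index(pks):
    print(f"indexing {pks}")

# Mirrors the chains above: the later links use immutable signatures (.si)
# with their own arguments, so fetch's return value is simply discarded.
workflow = chain(
    fetch.s(42).set(queue="pacer"),
    extract.si(42).set(queue="pacer"),
    index.si([42]).set(queue="pacer"),
)
# workflow.apply_async()  # needs a running broker and worker to execute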
Example #9
def get_dockets(options):
    """Download a sample of dockets from PACER matching the 7xx series of NOS
    codes.
    """
    nos_codes = [
        LABOR_LITIGATION_OTHER, LABOR_MANAGEMENT_RELATIONS_ACT,
        LABOR_MANAGEMENT_REPORT_DISCLOSURE, FAIR_LABOR_STANDARDS_ACT_CV,
        RAILWAY_LABOR_ACT, FAMILY_AND_MEDICAL_LEAVE_ACT,
        EMPLOYEE_RETIREMENT_INCOME_SECURITY_ACT
    ]
    sample_size = 300
    items = FjcIntegratedDatabase.objects.filter(
        nature_of_suit__in=nos_codes,
        date_terminated__gt='2009-01-01',
        date_terminated__lt='2018-10-15',
        date_filed__gt='2009-01-01').order_by('?')[:sample_size]

    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        logger.info("This case is from year: %s", row.date_filed.year)

        throttle.maybe_wait()
        case_name = '%s v. %s' % (row.plaintiff, row.defendant)
        chain(
            get_pacer_case_id_and_title.s(
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                case_name=case_name,
            ).set(queue=q),
            get_docket_by_pacer_case_id.s(court_id=row.district_id,
                                          cookies=session.cookies,
                                          tag_names=[TAG],
                                          **{
                                              'show_parties_and_counsel': True,
                                              'show_terminated_parties': True,
                                              'show_list_of_member_cases': True
                                          }).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def get_dockets(options, items, tags, sample_size=0, doc_num_end=""):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by PACER.
    """

    if sample_size > 0:
        items = items.order_by("?")[:sample_size]

    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params,
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": False,
                    "doc_num_end": doc_num_end,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
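A hypothetical call site for the parameterized get_dockets above, in the same style as the 7xx-series example earlier in this file. Imports and constants are omitted here just as in the surrounding examples, and the queryset filter, tag name, and option values are placeholders:

items = FjcIntegratedDatabase.objects.filter(
    nature_of_suit__in=[FAIR_LABOR_STANDARDS_ACT_CV],  # placeholder NOS codes
    date_filed__gt="2009-01-01",
)
get_dockets(
    options={"queue": "pacer", "offset": 0, "limit": 0},
    items=items,
    tags=["example-project-tag"],  # placeholder tag name
    sample_size=100,               # buy a random sample of 100 dockets
    doc_num_end="",                # no cap on docket entry numbers
)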
def add_all_nysd_to_cl(options):
    """Alas, there's only one way to get all the cases about a particular
    judge: Get all the cases in the entire jurisdiction. We do that here using
    the iquery.pl endpoint.

    Once added to the DB we'll ensure they're tagged. In the next step, we'll
    download all the tagged items.
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    # IDs obtained by binary search of docket numbers on PACER website.
    earliest_id = 405990
    latest_id = 543051
    for pacer_case_id in range(earliest_id, latest_id):
        if pacer_case_id < options["skip_until"]:
            continue
        if pacer_case_id >= options["limit"] > 0:
            break

        if pacer_case_id % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        throttle.maybe_wait()
        logger.info("Doing pacer_case_id: %s", pacer_case_id)
        make_docket_by_iquery.apply_async(
            args=("nysd", pacer_case_id, session.cookies, [NYSD_TAG]),
            queue=q,
        )
Example #12
def get_final_docs(options):
    """Get any documents that contain "final" in their description."""
    des = (DocketEntry.objects.filter(
        tags__name=TAG,
        description__icontains="final").order_by("pk").iterator())
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, de in enumerate(des):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        logger.info("Doing row %s", i)
        rd_pks = (de.recap_documents.filter(
            document_type=RECAPDocument.PACER_DOCUMENT, ).exclude(
                pacer_doc_id="").order_by("pk").values_list("pk", flat=True))
        for rd_pk in rd_pks:
            throttle.maybe_wait()
            chain(
                get_pacer_doc_by_rd.s(rd_pk,
                                      pacer_session.cookies,
                                      tag=TAG_FINALS).set(queue=q),
                extract_recap_pdf.si(rd_pk).set(queue=q),
                add_items_to_solr.si([rd_pk],
                                     "search.RECAPDocument").set(queue=q),
            ).apply_async()
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1
Example #14
def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break

            logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'],
                        result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break
Example #15
def get_attachment_page_by_rd(self, rd_pk, cookies):
    """Get the attachment page for the item in PACER.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-on PACER user.
    :return: The attachment report populated with the results
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    s = PacerSession(cookies=cookies)
    pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id)
    att_report = AttachmentPage(pacer_court_id, s)
    try:
        att_report.query(rd.pacer_doc_id)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying.",
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting."
            logger.error(msg, exc.response.status_code)
            self.request.callbacks = None
            return
    except requests.RequestException as exc:
        logger.warning("Unable to get attachment page for %s", rd)
        raise self.retry(exc=exc)
    return att_report
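The self.retry() and self.request.retries calls in this task (and in several tasks below) depend on Celery's bound-task retry machinery. A minimal sketch of that pattern on its own, with an illustrative task name and retry budget:

from celery import Celery
import requests

app = Celery("retry_sketch", broker="memory://")  # illustrative broker only

@app.task(bind=True, max_retries=3)
def fetch_url(self, url):
    """Retry on transient network errors; once max_retries is exhausted,
    give up quietly, mirroring the abort style of the tasks above."""
    try:
        return requests.get(url, timeout=10).text
    except requests.RequestException as exc:
        if self.request.retries == self.max_retries:
            return None
        raise self.retry(exc=exc, countdown=5)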
Example #16
def update_docket_info_iquery(self, d_pk):
    cookies = get_or_cache_pacer_cookies(
        "pacer_scraper",
        settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    s = PacerSession(
        cookies=cookies,
        username=settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    d = Docket.objects.get(pk=d_pk)
    report = CaseQuery(map_cl_to_pacer_id(d.court_id), s)
    try:
        report.query(d.pacer_case_id)
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning(
            "Timeout or unknown RequestException on iquery crawl. "
            "Trying again if retries not exceeded."
        )
        if self.request.retries == self.max_retries:
            return
        raise self.retry(exc=exc)
    d = update_docket_metadata(d, report.data)
    d.save()
    add_bankruptcy_data_to_docket(d, report.data)
    add_items_to_solr([d.pk], "search.Docket")
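get_or_cache_pacer_cookies is a project helper; a hypothetical sketch of the idea behind it (not the project's actual implementation) is to log in once and keep the cookie jar in Django's cache until it expires:

from django.core.cache import cache
from juriscraper.pacer import PacerSession  # assumed import path

def get_or_cache_cookies_sketch(cache_key, username, password,
                                timeout=60 * 60):
    """Hypothetical helper: reuse cached PACER cookies, or log in and cache
    a fresh cookie jar for `timeout` seconds."""
    cookies = cache.get(cache_key)
    if cookies is not None:
        return cookies
    session = PacerSession(username=username, password=password)
    session.login()
    cache.set(cache_key, session.cookies, timeout)
    return session.cookies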
Example #17
def get_dockets(options, items, tags, sample_size=0):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    """

    if sample_size > 0:
        items = items.order_by('?')[:sample_size]

    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        recipients = options['recipients'].split(',')
        print("Recipients list is: %s" % recipients)

        s = PacerSession(username=settings.PACER_USERNAME,
                         password=settings.PACER_PASSWORD)
        s.login()
        report = CaseQueryAdvancedBankruptcy('canb', s)
        t1 = now()
        while True:
            query = 'Pacific'
            report.query(
                name_last=query,
                filed_from=datetime.date(2019, 1, 28),
                filed_to=datetime.date(2019, 1, 30),
            )
            num_results = len(report.data)
            print("Checked '%s' and got %s results" % (query, num_results))
            if num_results > 0:
                print("Sending emails and exiting!")
                send_emails(report, recipients)
                exit(0)

            query = 'PG&E'
            report.query(
                name_last=query,
                filed_from=datetime.date(2019, 1, 28),
                filed_to=datetime.date(2019, 1, 30),
            )
            num_results = len(report.data)
            print("Checked '%s' and got %s results" % (query, num_results))
            if num_results > 0:
                print("Sending emails and exiting!")
                send_emails(report, recipients)
                exit(0)

            time.sleep(options['sleep'])
            t2 = now()
            min_login_frequency = 60 * 30  # thirty minutes
            if (t2 - t1).seconds > min_login_frequency:
                print("Logging in again.")
                s.login()
                t1 = now()
Example #19
def get_and_save_free_document_report(self, court_id, start, end, cookies):
    """Download the Free document report and save it to the DB.

    :param self: The Celery task.
    :param court_id: A pacer court id.
    :param start: a date object representing the first day to get results.
    :param end: a date object representing the last day to get results.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :return: None
    """
    s = PacerSession(cookies=cookies,
                     username=settings.PACER_USERNAME,
                     password=settings.PACER_PASSWORD)
    report = FreeOpinionReport(court_id, s)
    try:
        report.query(start, end, sort='case_number')
    except (ConnectionError, ChunkedEncodingError, ReadTimeoutError,
            ReadTimeout, ConnectTimeout) as exc:
        logger.warning("Unable to get free document report results from %s "
                       "(%s to %s). Trying again." % (court_id, start, end))
        if self.request.retries == self.max_retries:
            return PACERFreeDocumentLog.SCRAPE_FAILED
        raise self.retry(exc=exc, countdown=5)
    except SoftTimeLimitExceeded as exc:
        logger.warning("Soft time limit exceeded at %s. %s retries remain.",
                       court_id, (self.max_retries - self.request.retries))
        if self.request.retries == self.max_retries:
            return PACERFreeDocumentLog.SCRAPE_FAILED
        raise self.retry(exc=exc, countdown=5)

    try:
        results = report.data
    except (IndexError, HTTPError) as exc:
        # IndexError: When the page isn't downloaded properly.
        # HTTPError: raise_for_status in parse hit bad status.
        if self.request.retries == self.max_retries:
            return PACERFreeDocumentLog.SCRAPE_FAILED
        raise self.retry(exc=exc, countdown=5)

    for row in results:
        PACERFreeDocumentRow.objects.create(
            court_id=row.court_id,
            pacer_case_id=row.pacer_case_id,
            docket_number=row.docket_number,
            case_name=row.case_name,
            date_filed=row.date_filed,
            pacer_doc_id=row.pacer_doc_id,
            document_number=row.document_number,
            description=row.description,
            nature_of_suit=row.nature_of_suit,
            cause=row.cause,
        )

    return PACERFreeDocumentLog.SCRAPE_SUCCESSFUL
Example #20
def get_pacer_case_id_and_title(self, docket_number, court_id, cookies,
                                case_name=None, office_number=None,
                                docket_number_letters=None, ):
    """Get the pacer_case_id and title values for a district court docket. Use
    heuristics to disambiguate the results.

    office_number and docket_number_letters are only needed when they are not
    already part of the docket_number passed in. Multiple parameters are needed
    here to allow flexibility when using this API. Some sources, like the IDB,
    have this data all separated out, so it helps not to try to recreate docket
    numbers from data that comes all pulled apart.

    :param docket_number: The docket number to look up. This is a flexible
    field that accepts a variety of docket number styles.
    :param court_id: The CourtListener court ID for the docket number
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param case_name: The case name to use for disambiguation
    :param office_number: The number (or letter) where the case took place.
    Typically, this is in the beginning of the docket number before the colon.
    This will be used for disambiguation. If you passed it as part of the
    docket number, it is not needed here.
    :param docket_number_letters: These are the letters, (cv, cr, md, etc.)
    that may appear in a docket number. This is used for disambiguation. If
    you passed these letters in the docket number, you do not need to pass
    these letters again here.
    :return: The dict formed by the PossibleCaseNumberApi lookup if a good
    value is identified, else None. The dict takes the form of:
        {
            u'docket_number': force_unicode(node.xpath('./@number')[0]),
            u'pacer_case_id': force_unicode(node.xpath('./@id')[0]),
            u'title': force_unicode(node.xpath('./@title')[0]),
        }
    """
    logger.info("Getting pacer_case_id for docket_number %s in court %s",
                docket_number, court_id)
    s = PacerSession(cookies=cookies)
    report = PossibleCaseNumberApi(map_cl_to_pacer_id(court_id), s)
    try:
        report.query(docket_number)
    except requests.RequestException as exc:
        logger.warning("RequestException while running possible case number "
                       "query. Trying again if retries not exceeded: %s.%s",
                       court_id, docket_number)
        if self.request.retries == self.max_retries:
            self.request.callbacks = None
            return None
        raise self.retry(exc=exc)

    try:
        return report.data(case_name=case_name, office_number=office_number,
                           docket_number_letters=docket_number_letters)
    except ParsingException:
        return None
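The docstring above documents the dict this task hands to the next link in the chain. A small illustrative sketch of consuming that shape, remembering that the task can also return None for sealed or unmatched cases (the values below are made-up placeholders):

def extract_pacer_case_id(lookup_result):
    """Illustrative only: pull pacer_case_id out of the lookup dict,
    tolerating the None returned for sealed or unmatched cases."""
    if not lookup_result:
        return None
    return lookup_result.get("pacer_case_id")

# Example with the documented dict shape (placeholder values):
print(extract_pacer_case_id({
    "docket_number": "1:16-cv-12345",
    "pacer_case_id": "987654",
    "title": "Placeholder v. Placeholder",
}))  # -> 987654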
def get_dockets(options, items, tags, sample_size=0, doc_num_end=''):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by PACER.
    """

    if sample_size > 0:
        items = items.order_by('?')[:sample_size]

    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)

        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': False,
                    'doc_num_end': doc_num_end,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #22
def get_pacer_doc_id_with_show_case_doc_url(self, rd_pk, cookies):
    """use the show_case_doc URL to get pacer_doc_id values.

    :param rd_pk: The pk of the RECAPDocument you want to get.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    s = PacerSession(cookies=cookies)
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    report = ShowCaseDocApi(pacer_court_id, s)
    last_try = (self.request.retries == self.max_retries)
    try:
        if rd.document_type == rd.ATTACHMENT:
            report.query(d.pacer_case_id, rd.document_number,
                         rd.attachment_number)
        else:
            report.query(d.pacer_case_id, rd.document_number)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % rd)
        if last_try:
            return
        else:
            raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            if last_try:
                logger.error("Ran into repeated HTTPErrors. No more retries. "
                             "Aborting.")
                return
            else:
                logger.warning("Ran into HTTPError: %s. Retrying." %
                               exc.response.status_code)
                raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            return
    try:
        pacer_doc_id = report.data
    except ParsingException:
        logger.error("Unable to get redirect for %s" % rd)
        return
    else:
        rd.pacer_doc_id = pacer_doc_id
        rd.save()
        logger.info("Successfully saved pacer_doc_id to rd %s" % rd_pk)
def get_dockets(options):
    """Get the dockets by the particular judge now that we have run iquery
    for all of the cases in the jurisdiction, and now that we have
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    buchwald_id = 450
    ds = (
        Docket.objects.filter(
            court_id="nysd", assigned_to_id=buchwald_id, tags__name=NYSD_TAG
        )
        .exclude(idb_data__nature_of_suit__in=NOS_EXCLUSIONS)
        .exclude(idb_data__isnull=True)
    )
    logger.info("Got %s dockets to download", ds.count())
    for i, d in enumerate(ds):
        if i < options["skip_until"]:
            continue
        if i >= options["limit"] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()

        throttle.maybe_wait()
        logger.info("%s: Doing docket with pk: %s", i, d.pk)
        chain(
            get_docket_by_pacer_case_id.s(
                data={"pacer_case_id": d.pacer_case_id},
                court_id=d.court_id,
                cookies=session.cookies,
                docket_pk=d.pk,
                tag_names=[BUCKWALD_TAG],
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": False,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #24
def get_dockets(options):
    """Download the dockets described in the CSV
    """
    f = options["file"]
    reader = csv.DictReader(f)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=make_docket_number(row["filecy"], row["docket"]),
                court_id="ilnb",
                cookies=pacer_session.cookies,
                office_number=row["office"],
                docket_number_letters="bk",
            ).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id="ilnb",
                cookies=pacer_session.cookies,
                tag_names=[TAG],
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": True,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #25
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = (
        RECAPDocument.objects.filter(
            tags__name=TAG,
            document_number="1",
            document_type=RECAPDocument.PACER_DOCUMENT,
        )
        .exclude(pacer_doc_id="",)
        .order_by("pk")
        .values_list("pk", flat=True)
        .iterator()
    )
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()

        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS
            ).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
Example #26
def download_pacer_pdf_by_rd(self, rd_pk, pacer_case_id, pacer_doc_id,
                             cookies):
    """Using a RECAPDocument object ID, download the PDF if it doesn't already
    exist.

    :param rd_pk: The PK of the RECAPDocument to download
    :param pacer_case_id: The internal PACER case ID number
    :param pacer_doc_id: The internal PACER document ID to download
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :return: requests.Response object usually containing a PDF, or None if that
    wasn't possible.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id)
    s = PacerSession(cookies=cookies)
    report = FreeOpinionReport(pacer_court_id, s)
    try:
        r = report.download_pdf(pacer_case_id, pacer_doc_id)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning(
                "Ran into HTTPError while getting PDF: %s. "
                "Retrying.", exc.response.status_code)
            if self.request.retries == self.max_retries:
                self.request.callbacks = None
                return
            raise self.retry(exc=exc)
        else:
            logger.error(
                "Ran into unknown HTTPError while getting PDF: %s. "
                "Aborting.", exc.response.status_code)
            self.request.callbacks = None
            return
    except requests.RequestException as exc:
        logger.warning("Unable to get PDF for %s in %s", pacer_doc_id,
                       pacer_case_id)
        if self.request.retries == self.max_retries:
            self.request.callbacks = None
            return
        raise self.retry(exc=exc)
    return r
def update_any_missing_pacer_case_ids(options):
    """The network requests were making things far too slow and had to be
    disabled during the first pass. With this method, we update any items
    that are missing their pacer case ID value.
    """
    ds = Docket.objects.filter(
        idb_data__isnull=False,
        pacer_case_id=None,
    )
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, d in enumerate(queryset_generator(ds)):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        throttle.maybe_wait()
        logger.info("Getting pacer_case_id for item %s", d)
        params = make_fjc_idb_lookup_params(d.idb_data)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=d.pk,
                docket_number=d.idb_data.docket_number,
                court_id=d.idb_data.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            update_docket_from_hidden_api.s().set(queue=q),
        ).apply_async()
Example #28
def update_docket_info_iquery(self, d_pk: int, court_id: str) -> None:
    """Update the docket info from iquery

    :param self: The Celery task
    :param d_pk: The ID of the docket
    :param court_id: The court of the docket. Needed for throttling by court.
    :return: None
    """
    cookies = get_or_cache_pacer_cookies(
        "pacer_scraper",
        settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    s = PacerSession(
        cookies=cookies,
        username=settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    d = Docket.objects.get(pk=d_pk, court_id=court_id)
    report = CaseQuery(map_cl_to_pacer_id(d.court_id), s)
    try:
        report.query(d.pacer_case_id)
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning(
            "Timeout or unknown RequestException on iquery crawl. "
            "Trying again if retries not exceeded."
        )
        if self.request.retries == self.max_retries:
            return
        raise self.retry(exc=exc)
    if not report.data:
        return

    save_iquery_to_docket(
        self,
        report.data,
        d,
        tag_names=None,
        add_to_solr=True,
    )
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        recipients = options["recipients"].split(",")
        print("Recipients list is: %s" % recipients)

        s = PacerSession(
            username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD
        )
        s.login()
        report = CaseQueryAdvancedBankruptcy("canb", s)
        t1 = now()
        while True:
            query = "Pacific"
            report.query(
                name_last=query,
                filed_from=datetime.date(2019, 1, 28),
                filed_to=datetime.date(2019, 1, 30),
            )
            num_results = len(report.data)
            print("Checked '%s' and got %s results" % (query, num_results))
            if num_results > 0:
                print("Sending emails and exiting!")
                send_emails(report, recipients)
                exit(0)

            query = "PG&E"
            report.query(
                name_last=query,
                filed_from=datetime.date(2019, 1, 28),
                filed_to=datetime.date(2019, 1, 30),
            )
            num_results = len(report.data)
            print("Checked '%s' and got %s results" % (query, num_results))
            if num_results > 0:
                print("Sending emails and exiting!")
                send_emails(report, recipients)
                exit(0)

            time.sleep(options["sleep"])
            t2 = now()
            min_login_frequency = 60 * 30  # thirty minutes
            if (t2 - t1).seconds > min_login_frequency:
                print("Logging in again.")
                s.login()
                t1 = now()
def get_dockets(options):
    """Download the dockets described in the CSV
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    pacer_session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=make_docket_number(row['filecy'], row['docket']),
                court_id='ilnb',
                cookies=pacer_session.cookies,
                office_number=row['office'],
                docket_number_letters='bk',
            ).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id='ilnb',
                cookies=pacer_session.cookies,
                tag_names=[TAG],
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = RECAPDocument.objects.filter(
        tags__name=TAG,
        document_number='1',
        document_type=RECAPDocument.PACER_DOCUMENT,
    ).exclude(
        pacer_doc_id='',
    ).order_by('pk').values_list('pk', flat=True).iterator()
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break

        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()

        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
Example #32
def fetch_docket(self, fq_pk):
    """Fetch a docket from PACER

    This mirrors code elsewhere that gets dockets, but manages status as it
    goes through the process.

    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: None
    """
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    mark_pq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if cookies is None:
        msg = (
            "Cookie cache expired before task could run for user: %s"
            % fq.user_id
        )
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)

    court_id = fq.court_id or getattr(fq.docket, "court_id", None)
    s = PacerSession(cookies=cookies)

    try:
        result = fetch_pacer_case_id_and_title(s, fq, court_id)
    except (requests.RequestException, ReadTimeoutError) as exc:
        msg = "Network error getting pacer_case_id for fq: %s."
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + "Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except PacerLoginException as exc:
        msg = "PacerLoginException while getting pacer_case_id for fq: %s."
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + "Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except ParsingException:
        msg = "Unable to parse pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    # result can be one of three values:
    #   None       --> Sealed or missing case
    #   Empty dict --> Didn't run the pacer_case_id lookup (wasn't needed)
    #   Full dict  --> Ran the query, got back results
    if result is None:
        msg = "Cannot find case by docket number (perhaps it's sealed?)"
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    pacer_case_id = getattr(fq.docket, "pacer_case_id", None) or result.get(
        "pacer_case_id"
    )

    if not pacer_case_id:
        msg = "Unable to determine pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    try:
        result = fetch_docket_by_pacer_case_id(s, court_id, pacer_case_id, fq,)
    except (requests.RequestException, ReadTimeoutError) as exc:
        msg = "Network error getting pacer_case_id for fq: %s."
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + "Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except ParsingException:
        msg = "Unable to parse pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    msg = "Successfully got and merged docket. Adding to Solr as final step."
    mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL)
    return result
def download_documents(options):
    """We've got good values in the new columns, so just need to look those up,
    and get the documents from PACER.
    """
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = (row["cl_d_docket_number"]
                         or row["cl_d_docket_number (student)"] or None)

        if not docket_number:
            logger.warn("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(
            fjc_court_id=row["AO ID"].rjust(2, "0"),
            jurisdiction=Court.FEDERAL_DISTRICT,
        )

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warn("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warn("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warn("No docket entries found for row: %s", i)
            continue
        elif des.count() == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warn(
                        "Unable to get pacer_doc_id for item with "
                        "rd_pk: %s. Restricted document?",
                        rd.pk,
                    )
                    continue
                if options["task"] == "add_extra_tags":
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(rd.pk,
                                              session.cookies,
                                              tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], "search.RECAPDocument").set(queue=q),
                    ).apply_async()
    f.close()
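

# A hedged sketch of how download_documents might be invoked. The option keys
# mirror the ones the function reads above; the values and the CSV path are
# purely illustrative.
def run_download_documents_example():
    sample_options = {
        "input_file": "documents.csv",  # CSV with the columns referenced above
        "queue": "pacer",               # Celery queue to send tasks to
        "queue_length": 100,            # minimum backlog for CeleryThrottle
        "offset": 0,                    # skip rows with an index below this
        "limit": 0,                     # 0 (or negative) disables the limit
        "task": "download",             # or "add_extra_tags" to only tag items
    }
    download_documents(sample_options)
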
def get_dockets(options):
    """Download the dockets described in the CSV according to the `tasks`
    option.
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    task = options['task']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if row['Too Old'] == 'Yes':
            continue
        if row['Appellate/District'].lower() != task:
            # Only do appellate when appellate, and district when district.
            continue

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        if task == 'appellate':
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_docket_entries': True,
                        'show_orig_docket': True,
                        'show_prior_cases': True,
                        'show_associated_cases': True,
                        'show_panel_info': True,
                        'show_party_atty_info': True,
                        'show_caption': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        elif task == 'district':
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    case_name=row['Title'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_parties_and_counsel': True,
                        'show_terminated_parties': True,
                        'show_list_of_member_cases': True
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
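

# A hedged sketch of the CSV row shape get_dockets expects, based on the keys
# read above; the values are invented for illustration.
SAMPLE_GET_DOCKETS_ROW = {
    'Too Old': 'No',                     # rows marked "Yes" are skipped
    'Appellate/District': 'district',    # must match options['task']
    'Cleaned case_No': '1:17-cv-01234',  # docket number sent to PACER
    'fjc_court_id': 'dcd',               # passed as the court_id to the tasks
    'Title': 'Doe v. Roe',               # case name for the district chain
}
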
Example #35
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(2048))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break

        throttle.maybe_wait()
        logger.info("Doing row %s: %s", i, row)

        row_tag = f"{PROJECT_TAG_NAME}-{row['id']}"
        if not row["district_ct"]:
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row["docket_no1"],
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    # Do not get the docket entries for now. We're only
                    # interested in the date terminated. If it's an open case,
                    # we'll handle that later.
                    **{
                        "show_docket_entries": False,
                        "show_orig_docket": False,
                        "show_prior_cases": False,
                        "show_associated_cases": False,
                        "show_panel_info": True,
                        "show_party_atty_info": True,
                        "show_caption": True,
                    },
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        else:
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row["docket_no1"],
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    case_name=row["name"],
                ).set(queue=q),
                do_case_query_by_pacer_case_id.s(
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    **{
                        # No docket entries
                        "doc_num_start": 10000,
                        "doc_num_end": 10000,
                        "show_parties_and_counsel": True,
                        "show_terminated_parties": True,
                        "show_list_of_member_cases": True,
                    },
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()

    f.close()
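

# A hedged sketch of the CSV row shape download_dockets expects, based on the
# keys read above; the values are invented for illustration.
SAMPLE_DOWNLOAD_DOCKETS_ROW = {
    "id": "1",                # used to build the per-row tag
    "district_ct": "",        # falsey, so the appellate branch is taken
    "docket_no1": "17-1234",  # docket number sent to PACER
    "cl_court": "ca1",        # CourtListener court ID (illustrative)
    "name": "Doe v. Roe",     # case name for get_pacer_case_id_and_title
}
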
Example #36
def get_dockets(options):
    """Download the dockets described in the CSV according to the `tasks`
    option.
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    task = options['task']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if row['Too Old'] == 'Yes':
            continue
        if row['Appellate/District'].lower() != task:
            # Only do appellate when appellate, and district when district.
            continue

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        if task == 'appellate':
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_docket_entries': True,
                        'show_orig_docket': True,
                        'show_prior_cases': True,
                        'show_associated_cases': True,
                        'show_panel_info': True,
                        'show_party_atty_info': True,
                        'show_caption': True,
                    }).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        elif task == 'district':
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    case_name=row['Title'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(court_id=row['fjc_court_id'],
                                              cookies=session.cookies,
                                              tag_names=[TAG],
                                              **{
                                                  'show_parties_and_counsel':
                                                  True,
                                                  'show_terminated_parties':
                                                  True,
                                                  'show_list_of_member_cases':
                                                  True
                                              }).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()

def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    f = open(options['input_file'], 'r')
    dialect = csv.Sniffer().sniff(f.read(2048))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        throttle.maybe_wait()
        logger.info("Doing row %s: %s", i, row)

        row_tag = '%s-%s' % (PROJECT_TAG_NAME, row['id'])
        if not row['district_ct']:
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['docket_no1'],
                    court_id=row['cl_court'],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    # Do not get the docket entries for now. We're only
                    # interested in the date terminated. If it's an open case,
                    # we'll handle that later.
                    **{
                        'show_docket_entries': False,
                        'show_orig_docket': False,
                        'show_prior_cases': False,
                        'show_associated_cases': False,
                        'show_panel_info': True,
                        'show_party_atty_info': True,
                        'show_caption': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        else:
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['docket_no1'],
                    court_id=row['cl_court'],
                    cookies=session.cookies,
                    case_name=row['name'],
                ).set(queue=q),
                do_case_query_by_pacer_case_id.s(
                    court_id=row['cl_court'],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row['cl_court'],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    **{
                        # No docket entries
                        'doc_num_start': 10000,
                        'doc_num_end': 10000,
                        'show_parties_and_counsel': True,
                        'show_terminated_parties': True,
                        'show_list_of_member_cases': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()

    f.close()
Example #38
def get_appellate_docket_by_docket_number(self,
                                          docket_number,
                                          court_id,
                                          cookies,
                                          tag_names=None,
                                          **kwargs):
    """Get a docket by docket number, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param docket_number: The docket number of the case.
    :param court_id: A courtlistener/PACER appellate court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: The tag name that should be stored with the item in the
    DB, if desired.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    s = PacerSession(cookies=cookies)
    report = AppellateDocketReport(court_id, s)
    logging_id = "%s - %s" % (court_id, docket_number)
    logger.info("Querying docket report %s", logging_id)

    try:
        report.query(docket_number, **kwargs)
    except requests.RequestException as e:
        logger.warning("Problem getting docket %s", logging_id)
        if self.request.retries == self.max_retries:
            self.request.callbacks = None
            return None
        raise self.retry(exc=e)

    docket_data = report.data
    logger.info('Querying and parsing complete for %s', logging_id)

    if docket_data == {}:
        logger.info("Unable to find docket: %s", logging_id)
        self.request.callbacks = None
        return None

    try:
        d = Docket.objects.get(
            docket_number=docket_number,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is None:
        d, count = find_docket_object(court_id, docket_number, docket_number)
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d, og_info = update_docket_appellate_metadata(d, docket_data)
    if not d.pacer_case_id:
        d.pacer_case_id = docket_number

    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Save the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.APPELLATE_DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
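

# A hedged usage sketch for the task above, mirroring the chains used in the
# scripts earlier in this collection. The docket number, court ID, tag, and
# queue name are illustrative, and a valid PACER login is assumed.
def queue_appellate_docket_example(q='pacer'):
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    chain(
        get_appellate_docket_by_docket_number.s(
            docket_number='17-1234',   # illustrative docket number
            court_id='ca9',            # illustrative appellate court ID
            cookies=session.cookies,
            tag_names=['example-tag'],
            show_docket_entries=True,
            show_party_atty_info=True,
        ).set(queue=q),
        add_or_update_recap_docket.s().set(queue=q),
    ).apply_async()
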
Example #39
def get_docket_by_pacer_case_id(self,
                                data,
                                court_id,
                                cookies,
                                tag_names=None,
                                **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid lookups
                  if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the item
    in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict indicating if we need to update Solr.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating " "chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    if data.get('docket_pk') is not None:
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except Docket.DoesNotExist:
            d = None
        except Docket.MultipleObjectsReturned:
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id, pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
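

# A hedged sketch of the `data` dict the task above expects as its first
# argument (in the scripts earlier in this collection it is produced by
# get_pacer_case_id_and_title); the values are illustrative.
SAMPLE_DOCKET_TASK_DATA = {
    'pacer_case_id': '123456',  # required: PACER's internal case ID
    'docket_pk': 42,            # optional: skips the Docket lookup if known
}
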
Example #40
def do_case_query_by_pacer_case_id(self,
                                   data,
                                   court_id,
                                   cookies,
                                   tag_names=None):
    """Run a case query (iquery.pl) query on a case and save the data

    :param data: A dict containing at least the following: {
        'pacer_case_id': The internal pacer case ID for the item.
    }
    :param court_id: A courtlistener court ID
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: A list of tag names to associate with the docket when
    saving it in the DB.
    :return: A dict with the pacer_case_id and docket_pk values.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating " "chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = CaseQuery(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    report.query(pacer_case_id)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id, pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.CASE_REPORT_PAGE)
    pacer_file.filepath.save(
        'case_report.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    logger.info("Created/updated docket: %s" % d)
    return {
        'pacer_case_id': pacer_case_id,
        'docket_pk': d.pk,
    }
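

# A hedged sketch showing where the task above sits in a chain: it receives the
# dict produced by get_pacer_case_id_and_title and passes the pacer_case_id and
# docket_pk along, as the download_dockets scripts above do. The docket number,
# court, tag, and queue name are illustrative, and a valid PACER login is
# assumed.
def queue_case_query_example(q='pacer'):
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    chain(
        get_pacer_case_id_and_title.s(
            pass_through=None,
            docket_number='1:17-cv-01234',  # illustrative docket number
            court_id='dcd',                 # illustrative district court ID
            cookies=session.cookies,
            case_name='Doe v. Roe',
        ).set(queue=q),
        do_case_query_by_pacer_case_id.s(
            court_id='dcd',
            cookies=session.cookies,
            tag_names=['example-tag'],
        ).set(queue=q),
    ).apply_async()
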
def download_documents(options):
    """We've got good values in the new columns, so we just need to look those
    up and get the documents from PACER.
    """
    f = open(options['input_file'], 'r')
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = row['cl_d_docket_number'] or \
            row['cl_d_docket_number (student)'] or \
            None

        if not docket_number:
            logger.warn("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                  jurisdiction=Court.FEDERAL_DISTRICT)

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warn("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warn("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warn("No docket entries found for row: %s", i)
            continue
        elif count == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warn("Unable to get pacer_doc_id for item with "
                                "rd_pk: %s. Restricted document?", rd.pk)
                    continue
                if options['task'] == 'add_extra_tags':
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(
                            rd.pk, session.cookies, tag=TAG_NAME).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], 'search.RECAPDocument').set(queue=q),
                    ).apply_async()
    f.close()