Example #1
def get_cover_sheets_for_docket(options, docket_pks, tag=None):
    """Get civil cover sheets for dockets in our system."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    cover_sheet_re = re.compile(r'cover\s*sheet', re.IGNORECASE)
    for i, docket_pk in enumerate(docket_pks):
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        try:
            # get() returns a model instance, which has no values_list();
            # grab the pk directly, since that's what the task needs.
            rd_pk = RECAPDocument.objects.get(
                document_number=1,
                docket_entry__docket_id=docket_pk,
            ).pk
        except (RECAPDocument.MultipleObjectsReturned,
                RECAPDocument.DoesNotExist):
            logger.warning("Unable to get document 1 for docket_pk: %s" %
                           docket_pk)
        else:
            get_pacer_doc_by_rd_and_description.apply_async(
                args=(
                    rd_pk,
                    cover_sheet_re,
                    pacer_session,
                ),
                kwargs={
                    'tag': tag,
                },
                queue=q,
            )
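Every loop in these examples recreates and re-logs the PacerSession after a fixed number of iterations so that long runs don't ride on a stale login. A minimal sketch of that recurring pattern as a reusable generator; `sessions_for` is a hypothetical helper, and `PacerSession` plus the credential constants are assumed importable as in the snippets:

def sessions_for(items, relogin_every=1000):
    """Yield (item, session) pairs, refreshing the login periodically."""
    session = None
    for i, item in enumerate(items):
        if session is None or i % relogin_every == 0:
            # Hypothetical refresh, mirroring the loops in these examples.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
        yield item, session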
Example #2
def get_pacer_dockets(options, row_pks, tag=None):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        row = FjcIntegratedDatabase.objects.get(pk=row_pk)
        get_docket_by_pacer_case_id.apply_async(
            args=(
                row.pacer_case_id,
                map_cl_to_pacer_id(row.district_id),
                pacer_session,
            ),
            kwargs={
                'tag': tag,
                'show_parties_and_counsel': True,
                'show_terminated_parties': True,
                'show_list_of_member_cases': True,
            },
            queue=q,
        )
Example #3
def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {"pacer_case_id": d.pacer_case_id, "docket_pk": d.pk},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=False,
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
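The guard `if i >= options["limit"] > 0: break` relies on Python's chained comparisons: it expands to `i >= limit and limit > 0`, so a limit of 0 means "no limit". The same idiom guards `start_pk` in a later example. A quick self-contained illustration:

# Chained comparison: true only when a positive limit is set AND reached.
limit = 0
assert (5 >= limit > 0) is False   # limit of 0 disables the cap
limit = 3
assert (5 >= limit > 0) is True    # cap is set and reached, so break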
Example #4
    def test_logging_short_username(self):
        """If a username shorter than six characters is provided, do we
        throw an appropriate exception?
        """
        session = PacerSession(username="******", password="******")
        with self.assertRaises(PacerLoginException):
            session.login()
Example #5
def get_pacer_dockets(options, docket_pks, tag):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': d.pacer_case_id},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=[tag],
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #6
    def test_logging_short_password(self):
        """If a short password is provided, do we throw an appropriate
        exception?
        """
        session = PacerSession(username="******", password="******")
        with self.assertRaises(PacerLoginException):
            session.login()
Example #7
def get_pacer_doc_ids(options):
    """Get pacer_doc_ids for any item that needs them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    row_pks = RECAPDocument.objects.filter(
        pacer_doc_id=None,
    ).exclude(
        document_number=None,
    ).exclude(
        docket_entry__docket__pacer_case_id=None,
    ).exclude(
        docket_entry__docket__court__jurisdiction__in=Court.BANKRUPTCY_JURISDICTIONS,
    ).order_by('pk').values_list('pk', flat=True)
    completed = 0
    for row_pk in row_pks:
        if completed >= options['count'] > 0:
            break
        if row_pk < options['start_pk'] > 0:
            continue
        throttle.maybe_wait()
        if completed % 1000 == 0:
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
            logger.info("Sent %s tasks to celery so far. Latest pk: %s" %
                        (completed, row_pk))
        get_pacer_doc_id_with_show_case_doc_url.apply_async(
            args=(row_pk, session),
            queue=q,
        )
        completed += 1
Example #8
def get_pacer_dockets(options, row_pks, tag=None):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        row = FjcIntegratedDatabase.objects.get(pk=row_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                row.pacer_case_id,
                row.district_id,
                pacer_session,
                tag=tag,
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #9
def get_doc_by_re_and_de_nums_for_dockets(options,
                                          docket_pks,
                                          regex,
                                          de_nums,
                                          fallback=False,
                                          tag=None):
    """Get civil cover sheets for dockets in our system.

    :param options: The options sent on the command line as a dict.
    :param docket_pks: A list of docket pks to iterate over.
    :param regex: A regex to match on the document description on the attachment
    page. For example, to get initial complaints, set this to
    r'initial\s*complaints'.
    :param de_nums: The docket entry numbers to use when looking for items, as a
    list.
    :param fallback: After loading the attachment page, if we don't find
    something that matches `regex`, should we just grab the main document?
    :param tag: A tag to add to any modified content.
    """
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, docket_pk in enumerate(docket_pks):
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        # filter() is lazy and never raises DoesNotExist or
        # MultipleObjectsReturned, so check for an empty result instead of
        # wrapping it in a try/except.
        rds = RECAPDocument.objects.filter(
            document_number__in=de_nums,
            document_type=RECAPDocument.PACER_DOCUMENT,
            docket_entry__docket_id=docket_pk,
        )
        if not rds.exists():
            logger.warning("Unable to get documents %s for docket_pk: %s" %
                           (de_nums, docket_pk))
            continue
        for rd in rds:
            get_pacer_doc_by_rd_and_description.apply_async(
                args=(
                    rd.pk,
                    regex,
                    pacer_session,
                ),
                kwargs={
                    'fallback_to_main_doc': fallback,
                    'tag': tag,
                },
                queue=q,
            )
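A hypothetical invocation of the function above; the queue name, docket pks, and tag are made up, and the regex follows the docstring's own example. A count of 0 disables the cap, per the chained comparison in the loop:

get_doc_by_re_and_de_nums_for_dockets(
    options={'queue': 'pacer', 'count': 0},  # count=0 means "no cap"
    docket_pks=[1, 2, 3],  # made-up pks
    regex=re.compile(r'initial\s*complaints', re.IGNORECASE),
    de_nums=[1],
    fallback=True,
    tag='initial-complaints',  # made-up tag
)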
Example #10
    def test_logging_into_pacer(self):
        try:
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
            self.assertIsNotNone(session)
            self.assertIsNotNone(session.cookies.get(
                'PacerSession', None, domain='.uscourts.gov', path='/'))

        except PacerLoginException:
            self.fail('Could not log into PACER')
Example #11
def get_pacer_case_ids(options, row_pks):
    """Get the PACER case IDs for the given items."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
        throttle.maybe_wait()
        if i % 10000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        get_pacer_case_id_for_idb_row.apply_async(args=(row_pk, pacer_session),
                                                  queue=q)
Example #12
def get_pacer_case_ids(options, row_pks):
    """Get the PACER case IDs for an item in the IDB by looking it up on
    PACER"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 10000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        get_pacer_case_id_for_idb_row.apply_async(
            args=(row_pk, pacer_session.cookies),
            queue=q,
        )
Example #13
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options["queue"]
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                "Sent %s/%s tasks to celery for %s so "
                "far." % (completed, count, task_name)
            )
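Note the mix of signature types in the chain above: `process_free_opinion_result.si(...)` is immutable, so it ignores any parent result, while the `.s(...)` signatures receive the previous task's return value as their first argument. A toy, self-contained sketch of the difference, with hypothetical tasks and an in-memory broker:

from celery import Celery, chain

app = Celery('toy', broker='memory://', backend='cache+memory://')

@app.task
def produce():
    return 'row-data'

@app.task
def consume(prev=None, label=''):
    return (prev, label)

# consume receives produce()'s return value prepended to its arguments:
mutable = chain(produce.s(), consume.s(label='mutable'))
# consume sees only its frozen arguments; produce()'s result is dropped:
immutable = chain(produce.si(), consume.si(label='immutable'))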
Example #14
    def setUp(self):
        pacer_session = PacerSession()

        if pacer_credentials_are_defined():
            # CAND chosen at random
            pacer_session = get_pacer_session()
            pacer_session.login()

        with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as j:
            self.courts = get_courts_from_json(json.load(j))

        path = os.path.join(TESTS_ROOT_EXAMPLES_PACER,
                            'dates/valid_free_opinion_dates.json')
        with open(path) as j:
            self.valid_dates = json.load(j)

        self.reports = {}
        for court in self.courts:
            court_id = get_court_id_from_url(court['court_link'])
            self.reports[court_id] = FreeOpinionReport(court_id, pacer_session)
Example #15
    def setUp(self):
        pacer_session = PacerSession()

        if PACER_USERNAME and PACER_PASSWORD:
            # CAND chosen at random
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()

        with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as j:
            self.courts = get_courts_from_json(json.load(j))

        path = os.path.join(TESTS_ROOT, 'fixtures/valid_free_opinion_dates.json')
        with open(path) as j:
            self.valid_dates = json.load(j)

        self.reports = {}
        for court in self.courts:
            court_id = get_court_id_from_url(court['court_link'])
            self.reports[court_id] = FreeOpinionReport(court_id, pacer_session)
Example #16
    def test_logging_in_bad_credentials(self):
        # Make sure password is more than eight characters.
        session = PacerSession(username="******", password="******")
        with self.assertRaises(PacerLoginException):
            session.login()
Example #17
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    twelve_hrs_ago = now() - timedelta(hours=12)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=twelve_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = Court.objects.filter(
        jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY],
        in_use=True,
        end_date=None,
    ).exclude(pk__in=[
        'casb', 'ganb', 'gub', 'innb', 'mieb', 'miwb', 'nmib', 'nvb', 'ohsb',
        'prb', 'tnwb', 'vib',
    ]).values_list('pk', flat=True)
    pacer_court_ids = {
        map_cl_to_pacer_id(v): {
            'until': now(),
            'count': 1,
            'result': None
        }
        for v in cl_court_ids
    }
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()

    # Iterate over every court, X days at a time. As courts are completed,
    # remove them from the list of courts to process until none are left
    tomorrow = now() + timedelta(days=1)
    while len(pacer_court_ids) > 0:
        court_ids_copy = pacer_court_ids.copy()  # Make a copy of the list.
        for pacer_court_id, delay in court_ids_copy.items():
            if now() < delay['until']:
                # Do other courts until the delay is up. Do not print/log
                # anything since at the end there will only be one court left.
                continue

            next_start_date, next_end_date = get_next_date_range(
                pacer_court_id)
            if delay['result'] is not None:
                if delay['result'].ready():
                    result = delay['result'].get()
                    if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                        if next_start_date >= tomorrow.date():
                            logger.info("Finished '%s'. Marking it complete." %
                                        pacer_court_id)
                            pacer_court_ids.pop(pacer_court_id, None)
                            continue

                    elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                        logger.error("Encountered critical error on %s "
                                     "(network error?). Marking as failed and "
                                     "pressing on." % pacer_court_id)
                        pacer_court_ids.pop(pacer_court_id, None)
                        continue
                else:
                    next_delay = min(delay['count'] * 5, 30)  # backoff w/cap
                    logger.info(
                        "Court %s still in progress. Delaying at least "
                        "%ss." % (pacer_court_id, next_delay))
                    pacer_court_ids[pacer_court_id]['until'] = (
                        now() + timedelta(seconds=next_delay))
                    pacer_court_ids[pacer_court_id]['count'] += 1
                    continue

            mark_court_in_progress(pacer_court_id, next_end_date)
            pacer_court_ids[pacer_court_id]['count'] = 1  # Reset
            delay['result'] = chain(
                get_and_save_free_document_report.si(pacer_court_id,
                                                     next_start_date,
                                                     next_end_date,
                                                     pacer_session),
                mark_court_done_on_date.s(pacer_court_id, next_end_date),
            ).apply_async()
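The polling branch above backs off linearly with a 30-second cap: `min(delay['count'] * 5, 30)` yields 5s on the first retry, 10s on the second, and settles at 30s from the sixth retry onward. A quick check of the schedule:

for count in range(1, 8):
    print(count, min(count * 5, 30))  # 1->5, 2->10, ..., 6->30, 7->30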
Example #18
    def test_logging_in_bad_credentials(self):
        session = PacerSession(username='******', password='******')
        with self.assertRaises(PacerLoginException):
            session.login()
Example #19
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later. For now
    just get the list.

    Note that this uses synchronous celery chains. A previous version was more
    complex and did not use synchronous chains. Unfortunately in Celery 4.2.0,
    or more accurately in redis-py 3.x.x, doing it that way failed nearly every
    time.

    This is a simpler version, though a slower one, but it should get the job
    done.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    three_hrs_ago = now() - timedelta(hours=3)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=three_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = Court.objects.filter(
        jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY],
        in_use=True,
        end_date=None,
    ).exclude(pk__in=[
        'casb', 'gub', 'innb', 'miwb', 'ohsb', 'prb',
    ]).values_list('pk', flat=True)
    pacer_court_ids = [map_cl_to_pacer_id(v) for v in cl_court_ids]

    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()

    today = now()
    for pacer_court_id in pacer_court_ids:
        while True:
            next_start_d, next_end_d = get_next_date_range(pacer_court_id)
            logger.info(
                "Attempting to get latest document references for "
                "%s between %s and %s", pacer_court_id, next_start_d,
                next_end_d)
            mark_court_in_progress(pacer_court_id, next_end_d)
            try:
                status = get_and_save_free_document_report(
                    pacer_court_id, next_start_d, next_end_d,
                    pacer_session.cookies)
            except RequestException:
                logger.error(
                    "Failed to get document references for %s "
                    "between %s and %s due to network error.", pacer_court_id,
                    next_start_d, next_end_d)
                mark_court_done_on_date(PACERFreeDocumentLog.SCRAPE_FAILED,
                                        pacer_court_id, next_end_d)
                break
            except IndexError:
                logger.error(
                    "Failed to get document references for %s "
                    "between %s and %s due to PACER 6.3 bug.", pacer_court_id,
                    next_start_d, next_end_d)
                mark_court_done_on_date(PACERFreeDocumentLog.SCRAPE_FAILED,
                                        pacer_court_id, next_end_d)
                break
            else:
                result = mark_court_done_on_date(status, pacer_court_id,
                                                 next_end_d)

            if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                if next_end_d >= today.date():
                    logger.info("Got all document references for '%s'.",
                                pacer_court_id)
                    # Break from while loop, onwards to next court
                    break
                else:
                    # More dates to do; let it continue
                    continue

            elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                logger.error("Encountered critical error on %s "
                             "(network error?). Marking as failed and "
                             "pressing on." % pacer_court_id)
                # Break from while loop, onwards to next court
                break
Example #20
    def setUp(self):
        pacer_session = PacerSession(username=PACER_USERNAME,
                                     password=PACER_PASSWORD)
        pacer_session.login()
        self.report = DocketReport('cand', pacer_session)
        self.pacer_case_id = '186730'  # 4:06-cv-07294 Foley v. Bates
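A hedged follow-on to the setUp above, assuming juriscraper's DocketReport API (query() to fetch, then the parsed .data property); the test method and its assertion are illustrative, not from the source:

    def test_queries_docket(self):
        # Hypothetical test body: fetch and parse the Foley v. Bates docket.
        self.report.query(self.pacer_case_id)
        docket_data = self.report.data
        self.assertIn('docket_number', docket_data)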