def sweep_missing_downloads():
    """
    Get any documents that somehow are missing.
    
    This function attempts to address issue #671 by checking for any missing
    documents, downloading and parsing them. Hopefully this is a temporary 
    hack that we can soon remove when we deprecate the old RECAP server.
    
    :return: None 
    """
    two_hours_ago = now() - timedelta(hours=2)
    rds = RECAPDocument.objects.filter(
        Q(date_created__gt=two_hours_ago) | Q(date_modified__gt=two_hours_ago),
        is_available=True,
        page_count=None,
    ).order_by()
    for rd in rds:
        # Download the item to the correct location if it doesn't exist
        if not os.path.isfile(rd.filepath_local.path):
            filename = rd.filepath_local.name.rsplit('/', 1)[-1]
            chain(
                download_recap_item.si(rd.filepath_ia, filename),
                set_recap_page_count.si(rd.pk),
                extract_recap_pdf.s(check_if_needed=False).set(priority=5),
                add_or_update_recap_document.s(coalesce_docket=True),
            ).apply_async()
def sweep_missing_downloads():
    """
    Get any documents that somehow are missing.
    
    This function attempts to address issue #671 by checking for any missing
    documents, downloading and parsing them. Hopefully this is a temporary 
    hack that we can soon remove when we deprecate the old RECAP server.
    
    :return: None 
    """
    two_hours_ago = now() - timedelta(hours=2)
    rds = RECAPDocument.objects.filter(
        Q(date_created__gt=two_hours_ago) | Q(date_modified__gt=two_hours_ago),
        is_available=True,
        page_count=None,
    ).order_by()
    for rd in rds:
        # Download the item to the correct location if it doesn't exist
        if not os.path.isfile(rd.filepath_local.path):
            filename = rd.filepath_local.name.rsplit('/', 1)[-1]
            chain(
                download_recap_item.si(rd.filepath_ia, filename),
                set_recap_page_count.si(rd.pk),
                extract_recap_pdf.s(check_if_needed=False).set(priority=5),
                add_or_update_recap_document.s(coalesce_docket=True),
            ).apply_async()
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
      ...
    ]

    Note that all values are strings. The idea is to iterate over all of these
    dicts, grabbing the docket, and adding any items that have is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'],
                                           item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'], item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
      ...
    ]

    Note that all values are strings. The idea is to iterate over all of these
    dicts, grabbing the docket, and adding any items that have is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'], item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'], item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
def do_ocr(options):
    """Do the OCR for any items that need it, then save to the solr index."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        ocr_status=RECAPDocument.OCR_NEEDED,
    ).values_list('pk', flat=True).order_by()
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options['index']:
            extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_or_update_recap_document.s(coalesce_docket=True).set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i + 1, count))
Example #6
0
def do_ocr(options):
    """Do the OCR for any items that need it, then save to the solr index."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        ocr_status=RECAPDocument.OCR_NEEDED,
    ).values_list('pk', flat=True).order_by()
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options['index']:
            extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_or_update_recap_document.s(coalesce_docket=True).set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i + 1, count))
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_or_update_recap_document.s().set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
Example #8
0
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        if options['sweep'] is False:
            # Only allow one script at a time per court combination.
            # Note that multiple scripts on multiple machines could still be
            # run.
            court_str = '-'.join(sorted(options['courts']))
            with open('/tmp/rss-scraper-%s.pid' % court_str, 'w') as fp:
                try:
                    fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
                except IOError:
                    print("Another instance of this program is running with "
                          "for this combination of courts. Only one instance "
                          "can crawl these courts at a time: '%s'" % court_str)
                    sys.exit(1)

        # Loop over the PACER sites that have RSS feeds and see if they're
        # ready to do.
        courts = Court.objects.filter(
            jurisdiction__in=[
                Court.FEDERAL_BANKRUPTCY,
                Court.FEDERAL_DISTRICT,
            ],
            pacer_has_rss_feed=True,
        )
        if options['courts'] != ['all']:
            courts = courts.filter(pk__in=options['courts'])

        iterations_completed = 0
        last_trim_date = None
        while options['iterations'] == 0 or \
                iterations_completed < options['iterations']:
            for court in courts:
                # Check the last time we successfully got the feed
                try:
                    feed_status = RssFeedStatus.objects.filter(
                        court=court,
                        is_sweep=options['sweep'],
                        status__in=[
                            RssFeedStatus.PROCESSING_SUCCESSFUL,
                            RssFeedStatus.UNCHANGED,
                            RssFeedStatus.PROCESSING_IN_PROGRESS,
                        ]).latest('date_created')
                except RssFeedStatus.DoesNotExist:
                    # First time running it or status items have been nuked by
                    # an admin. Make a dummy object, but no need to actually
                    # save it to the DB. Make it old.
                    lincolns_birthday = make_aware(datetime(1809, 2, 12))
                    feed_status = RssFeedStatus(
                        date_created=lincolns_birthday,
                        date_last_build=lincolns_birthday,
                        is_sweep=options['sweep'],
                    )
                if options['courts'] == ['all'] and options['sweep'] is False:
                    # If it's all courts and it's not a sweep, check if we did
                    # it recently.
                    max_visit_ago = now() - timedelta(
                        seconds=self.RSS_MAX_VISIT_FREQUENCY)
                    if feed_status.date_created > max_visit_ago:
                        # Processed too recently. Try next court.
                        continue

                # Give a court some time to complete during non-sweep crawls
                processing_cutoff = now() - timedelta(
                    seconds=self.RSS_MAX_PROCESSING_DURATION)
                if all([
                        options['sweep'] is False, feed_status.status ==
                        RssFeedStatus.PROCESSING_IN_PROGRESS,
                        feed_status.date_created < processing_cutoff
                ]):
                    continue

                # The court is ripe! Crawl it if it has changed.
                # Make a new object to track the attempted crawl.
                new_status = RssFeedStatus.objects.create(
                    court_id=court.pk,
                    status=RssFeedStatus.PROCESSING_IN_PROGRESS,
                    is_sweep=options['sweep'],
                )

                # Check if the item needs crawling, and crawl it if so.
                chain(
                    check_if_feed_changed.s(court.pk, new_status.pk,
                                            feed_status.date_last_build),
                    merge_rss_feed_contents.s(court.pk, new_status.pk),
                    send_docket_alerts.s(),
                    # Update recap *documents*, not *dockets*. Updating dockets
                    # requires much more work, and we don't expect to get much
                    # docket information from the RSS feeds. RSS feeds also
                    # have information about hundreds or thousands of
                    # dockets. Updating them all would be very bad.
                    add_or_update_recap_document.s(),
                    mark_status_successful.si(new_status.pk),
                ).apply_async()

            # Trim if not too recently trimmed.
            trim_cutoff_date = now() - timedelta(
                seconds=self.DELAY_BETWEEN_CACHE_TRIMS)
            if last_trim_date is None or trim_cutoff_date > last_trim_date:
                trim_rss_cache.delay()
                last_trim_date = now()

            # Wait, then attempt the courts again if iterations not exceeded.
            iterations_completed += 1
            time.sleep(self.DELAY_BETWEEN_ITERATIONS)