def sweep_missing_downloads():
    """Get any documents that are somehow missing.

    This function attempts to address issue #671 by checking for any missing
    documents, then downloading and parsing them. Hopefully this is a
    temporary hack that we can soon remove when we deprecate the old RECAP
    server.

    :return: None
    """
    two_hours_ago = now() - timedelta(hours=2)
    rds = RECAPDocument.objects.filter(
        Q(date_created__gt=two_hours_ago) |
        Q(date_modified__gt=two_hours_ago),
        is_available=True,
        page_count=None,
    ).order_by()
    for rd in rds:
        # Download the item to the correct location if it doesn't exist
        if not os.path.isfile(rd.filepath_local.path):
            filename = rd.filepath_local.name.rsplit('/', 1)[-1]
            chain(
                download_recap_item.si(rd.filepath_ia, filename),
                set_recap_page_count.si(rd.pk),
                extract_recap_pdf.s(check_if_needed=False).set(priority=5),
                add_or_update_recap_document.s(coalesce_docket=True),
            ).apply_async()
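
# --- Hedged sketch (illustration only, not part of the original module). ---
# The chain above mixes .si() (immutable signatures, which ignore the parent
# task's return value) with .s() (partial signatures, which receive it). The
# toy tasks and the `demo_app` Celery app below are hypothetical stand-ins
# that exist only to show that distinction.
from celery import Celery, chain as celery_chain

demo_app = Celery('demo', broker='memory://')

@demo_app.task
def fetch(path):
    # Stand-in for download_recap_item: its return value is ignored by the
    # next task because that task is declared with .si().
    return 'downloaded:%s' % path

@demo_app.task
def count_pages(pk):
    # Stand-in for set_recap_page_count: .si() means pk comes from the
    # caller, not from fetch(). Returns pk so the next task can use it.
    return pk

@demo_app.task
def extract_text(pk, check_if_needed=True):
    # Stand-in for extract_recap_pdf: .s() means it receives count_pages()'s
    # return value (the pk) as its first positional argument.
    return pk

demo_workflow = celery_chain(
    fetch.si('/some/path.pdf'),             # immutable: no parent result
    count_pages.si(42),                     # immutable: explicit pk
    extract_text.s(check_if_needed=False),  # partial: receives pk from above
)
# demo_workflow.apply_async() would enqueue it, mirroring the call above.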
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
     ...
    ]

    Note that all values are strings. The idea is to iterate over all of
    these dicts, grabbing the docket, and adding any items that have
    is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'],
                                           item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'],
                                         item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
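
# --- Hedged sketch (illustration only, not the project's own helper). ---
# get_and_merge_items() relies on previous_and_next() to spot the first and
# last item of each case. The implementation below is an assumption about
# that helper's contract, not its actual source: yield (previous, current,
# next) triples, padding the edges with None.
from itertools import chain as ichain, islice, tee

def previous_and_next_sketch(iterable):
    prevs, items, nexts = tee(iterable, 3)
    prevs = ichain([None], prevs)
    nexts = ichain(islice(nexts, 1, None), [None])
    return zip(prevs, items, nexts)

# Example: case boundaries show up as None in the first and last triples.
# list(previous_and_next_sketch('AB')) == [(None, 'A', 'B'), ('A', 'B', None)]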
def do_ocr(options):
    """Do the OCR for any items that need it, then save to the solr index."""
    q = options['queue']
    rds = RECAPDocument.objects.filter(
        ocr_status=RECAPDocument.OCR_NEEDED,
    ).values_list('pk', flat=True).order_by()
    count = rds.count()
    throttle = CeleryThrottle(queue_name=q)
    for i, pk in enumerate(rds):
        throttle.maybe_wait()
        if options['index']:
            extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q).apply_async()
        else:
            chain(
                extract_recap_pdf.si(pk, skip_ocr=False).set(queue=q),
                add_or_update_recap_document.s(
                    coalesce_docket=True).set(queue=q),
            ).apply_async()
        if i % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery so far." % (i + 1, count))
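
# --- Hedged sketch (illustration only). CeleryThrottle.maybe_wait() is used
# above to avoid flooding the queue; its real implementation is not shown in
# this listing. The hypothetical class below captures the same idea with a
# crude rate limit: allow at most `max_per_second` enqueues, sleeping
# otherwise.
import time as _time

class SimpleThrottleSketch(object):
    def __init__(self, max_per_second=10):
        self.min_interval = 1.0 / max_per_second
        self.last_enqueue = 0.0

    def maybe_wait(self):
        # Sleep just long enough to keep the enqueue rate under the cap.
        elapsed = _time.time() - self.last_enqueue
        if elapsed < self.min_interval:
            _time.sleep(self.min_interval - elapsed)
        self.last_enqueue = _time.time()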
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_or_update_recap_document.s().set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
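
# --- Hedged sketch (illustration only, not the project's own helper). ---
# get_pdfs() walks the whole PACERFreeDocumentRow table through
# queryset_generator() rather than iterating the queryset directly. The
# function below is a hypothetical stand-in modeled on the common chunked,
# pk-ordered iteration pattern, so the full result set never has to be held
# in memory at once.
def queryset_generator_sketch(queryset, chunksize=1000):
    """Yield rows from `queryset` in pk order, `chunksize` rows at a time."""
    if not queryset.exists():
        return
    pk = 0
    last_pk = queryset.order_by('-pk').values_list('pk', flat=True)[0]
    queryset = queryset.order_by('pk')
    while pk < last_pk:
        for row in queryset.filter(pk__gt=pk)[:chunksize]:
            pk = row.pk
            yield row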
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    if options['sweep'] is False:
        # Only allow one script at a time per court combination.
        # Note that multiple scripts on multiple machines could still be
        # run.
        court_str = '-'.join(sorted(options['courts']))
        with open('/tmp/rss-scraper-%s.pid' % court_str, 'w') as fp:
            try:
                fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
            except IOError:
                print("Another instance of this program is running for "
                      "this combination of courts. Only one instance "
                      "can crawl these courts at a time: '%s'" % court_str)
                sys.exit(1)

    # Loop over the PACER sites that have RSS feeds and see if they're
    # ready to go.
    courts = Court.objects.filter(
        jurisdiction__in=[
            Court.FEDERAL_BANKRUPTCY,
            Court.FEDERAL_DISTRICT,
        ],
        pacer_has_rss_feed=True,
    )
    if options['courts'] != ['all']:
        courts = courts.filter(pk__in=options['courts'])

    iterations_completed = 0
    last_trim_date = None
    while options['iterations'] == 0 or \
            iterations_completed < options['iterations']:
        for court in courts:
            # Check the last time we successfully got the feed
            try:
                feed_status = RssFeedStatus.objects.filter(
                    court=court,
                    is_sweep=options['sweep'],
                    status__in=[
                        RssFeedStatus.PROCESSING_SUCCESSFUL,
                        RssFeedStatus.UNCHANGED,
                        RssFeedStatus.PROCESSING_IN_PROGRESS,
                    ]).latest('date_created')
            except RssFeedStatus.DoesNotExist:
                # First time running it or status items have been nuked by
                # an admin. Make a dummy object, but no need to actually
                # save it to the DB. Make it old.
                lincolns_birthday = make_aware(datetime(1809, 2, 12))
                feed_status = RssFeedStatus(
                    date_created=lincolns_birthday,
                    date_last_build=lincolns_birthday,
                    is_sweep=options['sweep'],
                )
            if options['courts'] == ['all'] and options['sweep'] is False:
                # If it's all courts and it's not a sweep, check if we did
                # it recently.
                max_visit_ago = now() - timedelta(
                    seconds=self.RSS_MAX_VISIT_FREQUENCY)
                if feed_status.date_created > max_visit_ago:
                    # Processed too recently. Try next court.
                    continue

            # Give a court some time to complete during non-sweep crawls
            processing_cutoff = now() - timedelta(
                seconds=self.RSS_MAX_PROCESSING_DURATION)
            if all([
                options['sweep'] is False,
                feed_status.status == RssFeedStatus.PROCESSING_IN_PROGRESS,
                feed_status.date_created < processing_cutoff
            ]):
                continue

            # The court is ripe! Crawl it if it has changed.
            # Make a new object to track the attempted crawl.
            new_status = RssFeedStatus.objects.create(
                court_id=court.pk,
                status=RssFeedStatus.PROCESSING_IN_PROGRESS,
                is_sweep=options['sweep'],
            )

            # Check if the item needs crawling, and crawl it if so.
            chain(
                check_if_feed_changed.s(court.pk, new_status.pk,
                                        feed_status.date_last_build),
                merge_rss_feed_contents.s(court.pk, new_status.pk),
                send_docket_alerts.s(),
                # Update recap *documents*, not *dockets*. Updating dockets
                # requires much more work, and we don't expect to get much
                # docket information from the RSS feeds. RSS feeds also
                # have information about hundreds or thousands of
                # dockets. Updating them all would be very bad.
                add_or_update_recap_document.s(),
                mark_status_successful.si(new_status.pk),
            ).apply_async()

        # Trim if not too recently trimmed.
        trim_cutoff_date = now() - timedelta(
            seconds=self.DELAY_BETWEEN_CACHE_TRIMS)
        if last_trim_date is None or trim_cutoff_date > last_trim_date:
            trim_rss_cache.delay()
            last_trim_date = now()

        # Wait, then attempt the courts again if iterations not exceeded.
        iterations_completed += 1
        time.sleep(self.DELAY_BETWEEN_ITERATIONS)
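
# --- Hedged sketch (illustration only). handle() above uses fcntl.lockf for
# a per-court-combination "single instance" guard. The standalone snippet
# below shows the same non-blocking lock pattern in isolation; the lock file
# path and function name are hypothetical, and the lock lasts only as long as
# the returned file object stays open.
import fcntl
import sys

def acquire_single_instance_lock(path='/tmp/example-scraper.pid'):
    fp = open(path, 'w')  # kept open on purpose: closing it releases the lock
    try:
        fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        sys.exit("Another instance already holds %s" % path)
    return fp  # caller should hold a reference for the process lifetime

# lock_handle = acquire_single_instance_lock()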