def sweep_missing_downloads():
    """Get any documents that somehow are missing.

    This function attempts to address issue #671 by checking for any
    missing documents, downloading and parsing them. Hopefully this is a
    temporary hack that we can soon remove when we deprecate the old RECAP
    server.

    :return: None
    """
    two_hours_ago = now() - timedelta(hours=2)
    rds = RECAPDocument.objects.filter(
        Q(date_created__gt=two_hours_ago) |
        Q(date_modified__gt=two_hours_ago),
        is_available=True,
        page_count=None,
    ).order_by()
    for rd in rds:
        # Download the item to the correct location if it doesn't exist
        if not os.path.isfile(rd.filepath_local.path):
            filename = rd.filepath_local.name.rsplit('/', 1)[-1]
            chain(
                download_recap_item.si(rd.filepath_ia, filename),
                set_recap_page_count.si(rd.pk),
                extract_recap_pdf.s(check_if_needed=False).set(priority=5),
                add_or_update_recap_document.s(coalesce_docket=True),
            ).apply_async()
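# Usage sketch (not part of the original module): the sweep is meant to run
# periodically. One way to drive it is a cron-invoked Django management
# command; the command name and help text here are illustrative assumptions.
#
#   from django.core.management.base import BaseCommand
#
#   class Command(BaseCommand):
#       help = "Sweep for RECAP documents missing from local storage."
#
#       def handle(self, *args, **options):
#           sweep_missing_downloads()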
def extract_recap_documents(docs, skip_ocr=False, order_by=None, queue=None,
                            queue_length=100):
    """Loop over RECAPDocuments and extract their contents. Use OCR if
    requested.

    :param docs: A queryset containing the RECAPDocuments to be processed.
    :type docs: Django Queryset
    :param skip_ocr: Whether OCR should be completed (False) or whether
    items should simply be updated to have status OCR_NEEDED (True).
    :type skip_ocr: Bool
    :param order_by: An optimization parameter. You may opt to order the
    processing by 'small-first' or 'big-first'.
    :type order_by: str
    :param queue: The celery queue to send the content to.
    :type queue: str
    :param queue_length: The number of items to send to the queue at a time.
    :type queue_length: int
    """
    docs = docs.exclude(filepath_local='')
    if skip_ocr:
        # Focus on the items that we don't know if they need OCR.
        docs = docs.filter(ocr_status=None)
    else:
        # We're doing OCR. Only work with those items that require it.
        docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED)

    if order_by is not None:
        if order_by == 'small-first':
            docs = docs.order_by('page_count')
        elif order_by == 'big-first':
            docs = docs.order_by('-page_count')

    tasks = []
    completed = 0
    count = docs.count()
    logger.info("There are %s documents to process." % count)
    for pk in docs.values_list('pk', flat=True):
        # Send the items off for processing.
        last_item = (count == completed + 1)
        tasks.append(extract_recap_pdf.s(pk, skip_ocr).set(priority=5,
                                                           queue=queue))

        # Every queue_length items, send the tasks to Celery.
        if (len(tasks) >= queue_length) or last_item:
            logger.info("Sent %s tasks to celery. We have sent %s "
                        "items so far." % (len(tasks), completed + 1))
            job = group(*tasks)
            job.apply_async().join()
            tasks = []
        completed += 1
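# Usage sketch for extract_recap_documents (illustrative; the import path,
# queryset filter, and queue name are assumptions, not values used by this
# module):
#
#   from cl.search.models import RECAPDocument
#
#   docs = RECAPDocument.objects.filter(is_available=True)
#   extract_recap_documents(docs, skip_ocr=False, order_by='small-first',
#                           queue='celery')
#
# 'small-first' front-loads the cheap extractions so results land quickly;
# 'big-first' starts the long OCR jobs early instead.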
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number,
    document number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
     ...
    ]

    Note that all values are strings. The idea is to iterate over all of
    these dicts, grabbing the docket, and adding any items that have
    is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'],
                                           item['case_number'])
            tasks.append(download_recap_item.si(url, filename,
                                                clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'],
                                         item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'],
                              filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
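# previous_and_next() above is a windowing helper that yields each item
# alongside its neighbors. Its real definition lives elsewhere in the repo;
# this is a sketch of the standard tee-based recipe, with itertools.chain
# aliased so it doesn't shadow celery's chain:
#
#   from itertools import tee, islice, chain as ichain
#
#   def previous_and_next(some_iterable):
#       """Yield (prev, item, next) triples, with None at the edges."""
#       prevs, items, nexts = tee(some_iterable, 3)
#       prevs = ichain([None], prevs)
#       nexts = ichain(islice(nexts, 1, None), [None])
#       return zip(prevs, items, nexts)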