def sweep_missing_downloads():
    """Get any documents that somehow are missing.

    This function attempts to address issue #671 by checking for any
    missing documents, downloading and parsing them. Hopefully this is a
    temporary hack that we can soon remove when we deprecate the old RECAP
    server.

    :return: None
    """
    two_hours_ago = now() - timedelta(hours=2)
    rds = RECAPDocument.objects.filter(
        Q(date_created__gt=two_hours_ago) |
        Q(date_modified__gt=two_hours_ago),
        is_available=True,
        page_count=None,
    ).order_by()
    for rd in rds:
        # Download the item to the correct location if it doesn't exist
        if not os.path.isfile(rd.filepath_local.path):
            filename = rd.filepath_local.name.rsplit('/', 1)[-1]
            chain(
                download_recap_item.si(rd.filepath_ia, filename),
                set_recap_page_count.si(rd.pk),
                extract_recap_pdf.s(check_if_needed=False).set(priority=5),
                add_or_update_recap_document.s(coalesce_docket=True),
            ).apply_async()
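# Usage sketch (not part of the original module): the sweep is meant to run
# periodically. One way to drive it is a cron-invoked Django management
# command; the command name and help text here are illustrative assumptions.
#
#   from django.core.management.base import BaseCommand
#
#   class Command(BaseCommand):
#       help = "Sweep for RECAP documents missing from local storage."
#
#       def handle(self, *args, **options):
#           sweep_missing_downloads()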
def extract_recap_documents(docs, skip_ocr=False, order_by=None, queue=None,
                            queue_length=100):
    """Loop over RECAPDocuments and extract their contents. Use OCR if
    requested.

    :param docs: A queryset containing the RECAPDocuments to be processed.
    :type docs: Django Queryset
    :param skip_ocr: Whether OCR should be completed (False) or whether
    items should simply be updated to have status OCR_NEEDED (True).
    :type skip_ocr: Bool
    :param order_by: An optimization parameter. You may opt to order the
    processing by 'small-first' or 'big-first'.
    :type order_by: str
    :param queue: The celery queue to send the content to.
    :type queue: str
    :param queue_length: The number of items to send to the queue at a time.
    :type queue_length: int
    """
    docs = docs.exclude(filepath_local='')
    if skip_ocr:
        # Focus on the items that we don't know if they need OCR.
        docs = docs.filter(ocr_status=None)
    else:
        # We're doing OCR. Only work with those items that require it.
        docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED)

    if order_by is not None:
        if order_by == 'small-first':
            docs = docs.order_by('page_count')
        elif order_by == 'big-first':
            docs = docs.order_by('-page_count')

    tasks = []
    completed = 0
    count = docs.count()
    logger.info("There are %s documents to process." % count)
    for pk in docs.values_list('pk', flat=True):
        # Send the items off for processing.
        last_item = (count == completed + 1)
        tasks.append(extract_recap_pdf.s(pk, skip_ocr).set(priority=5,
                                                           queue=queue))

        # Every queue_length items, send the tasks to Celery.
        if (len(tasks) >= queue_length) or last_item:
            logger.info("Sent %s tasks to celery. We have sent %s "
                        "items so far." % (len(tasks), completed + 1))
            job = group(*tasks)
            job.apply_async().join()
            tasks = []
        completed += 1
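# Usage sketch for extract_recap_documents (illustrative; the import path,
# queryset filter, and queue name are assumptions, not values used by this
# module):
#
#   from cl.search.models import RECAPDocument
#
#   docs = RECAPDocument.objects.filter(is_available=True)
#   extract_recap_documents(docs, skip_ocr=False, order_by='small-first',
#                           queue='celery')
#
# 'small-first' front-loads the cheap extractions so results land quickly;
# 'big-first' starts the long OCR jobs early instead.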
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number,
    document number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
     ...
    ]

    Note that all values are strings. The idea is to iterate over all of
    these dicts, grabbing the docket, and adding any items that have
    is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'],
                                           item['case_number'])
            tasks.append(download_recap_item.si(url, filename,
                                                clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'],
                                         item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'],
                              filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
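# previous_and_next() above is a windowing helper that yields each item
# alongside its neighbors. Its real definition lives elsewhere in the repo;
# this is a sketch of the standard tee-based recipe, with itertools.chain
# aliased so it doesn't shadow celery's chain:
#
#   from itertools import tee, islice, chain as ichain
#
#   def previous_and_next(some_iterable):
#       """Yield (prev, item, next) triples, with None at the edges."""
#       prevs, items, nexts = tee(some_iterable, 3)
#       prevs = ichain([None], prevs)
#       nexts = ichain(islice(nexts, 1, None), [None])
#       return zip(prevs, items, nexts)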