def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
      ...
    ]

    Note that all values are strings. The idea is to iterate over all of these
    dicts, grabbing the docket, and adding any items that have is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'],
                                           item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'], item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
예제 #2
0
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
      ...
    ]

    Note that all values are strings. The idea is to iterate over all of these
    dicts, grabbing the docket, and adding any items that have is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'], item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'], item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
예제 #3
0
def make_download_tasks(data, line_count, start_line):
    """For every item in the CSV, send it to Celery for processing"""
    previous_casenum = None
    subtasks = []
    completed = 0
    for index, item in data.iterrows():
        if completed < start_line - 1:
            # Skip ahead if start_lines is provided.
            completed += 1
            continue

        if item['casenum'] != previous_casenum:
            # New case, get the docket before getting the pdf
            logger.info("New docket found with casenum: %s" % item['casenum'])
            previous_casenum = item['casenum']
            filename = get_docket_filename(item['court'], item['casenum'])
            url = get_docketxml_url(item['court'], item['casenum'])
            subtasks.append(download_recap_item.subtask((url, filename)))

        # Get the document
        filename = get_document_filename(item['court'], item['casenum'],
                                         item['docnum'], item['subdocnum'])
        url = get_pdf_url(item['court'], item['casenum'], filename)
        subtasks.append(download_recap_item.subtask((url, filename)))

        # Every n items or on the last item, send the subtasks to Celery.
        last_item = (line_count == completed + 1)
        if (len(subtasks) >= 1000) or last_item:
            msg = ("Sent %s subtasks to celery. We have processed %s "
                   "rows so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print msg
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []

        completed += 1
예제 #4
0
def make_download_tasks(data, line_count, start_line):
    """For every item in the CSV, send it to Celery for processing"""
    previous_casenum = None
    subtasks = []
    completed = 0
    for index, item in data.iterrows():
        if completed < start_line - 1:
            # Skip ahead if start_lines is provided.
            completed += 1
            continue

        last_item = (line_count == completed + 1)
        if item['casenum'] != previous_casenum:
            # New case, get the docket before getting the pdf
            logger.info("New docket found with casenum: %s" % item['casenum'])
            previous_casenum = item['casenum']
            filename = get_docket_filename(item['court'], item['casenum'])
            url = get_docketxml_url(item['court'], item['casenum'])
            subtasks.append(download_recap_item.subtask((url, filename)))

        # Get the document
        filename = get_document_filename(item['court'], item['casenum'],
                                         item['docnum'], item['subdocnum'])
        url = get_pdf_url(item['court'], item['casenum'], filename)
        subtasks.append(download_recap_item.subtask((url, filename)))

        # Every n items send the subtasks to Celery.
        if (len(subtasks) >= 1000) or last_item:
            msg = ("Sent %s subtasks to celery. We have processed %s "
                   "rows so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print msg
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []

        completed += 1