Ejemplo n.º 1
0
def _cron_process_PDF(obj, ppentry):
    filename = ppentry.filename
    meta = IACommon.get_meta_from_filename(filename)
    court = meta["court"]
    casenum = meta["casenum"]
    docnum = meta["docnum"]
    subdocnum = meta["subdocnum"]

    invalid_PDF = _is_invalid_pdf(obj, filename)

    # We only want to check for ssns on valid PDFs
    # PyPdf doesn't deal well with bad input
    if not invalid_PDF:
        # SSN privacy check
        has_ssn = _has_ssn(obj, filename)
    else:
        has_ssn = False

    # Blacklist file check
    in_blacklist = _in_blacklist(filename)

    if invalid_PDF or has_ssn or in_blacklist:
        docket = DocketXML.make_docket_for_pdf("",
                                               court,
                                               casenum,
                                               docnum,
                                               subdocnum,
                                               available=0)
        UploadHandler.do_me_up(docket)

        # Delete the entry from the DB
        ppentry.delete()
        # Quarantine the pickle file for analysis
        _quarantine_pickle(filename,
                           ssn=has_ssn,
                           blacklist_file=in_blacklist,
                           invalid_PDF=invalid_PDF)

        return

    put_result, put_msg = _dispatch_put(obj, ppentry)

    if put_result:
        # Put success-- mark this document as available in the DB
        DocumentManager.mark_as_available(filename)

        docket = DocketXML.make_docket_for_pdf("",
                                               court,
                                               casenum,
                                               docnum,
                                               subdocnum,
                                               available=1)
        UploadHandler.do_me_up(docket)

    print "  %s %s" % (filename, put_msg)
Ejemplo n.º 2
0
def _cron_process_PDF(obj, ppentry):

    filename = ppentry.filename
    meta = IACommon.get_meta_from_filename(filename)
    court = meta["court"]
    casenum = meta["casenum"]
    docnum = meta["docnum"]
    subdocnum = meta["subdocnum"]

    invalid_PDF = _is_invalid_pdf(obj, filename)

    # We only want to check for ssns on valid PDFs
    # PyPdf doesn't deal well with bad input
    if not invalid_PDF:
       # SSN privacy check
       has_ssn = _has_ssn(obj, filename)
    else:
       has_ssn = False

    # Blacklist file check
    in_blacklist = _in_blacklist(filename)

    if invalid_PDF or has_ssn or in_blacklist:

        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=0)
        UploadHandler.do_me_up(docket)

        # Delete the entry from the DB
        ppentry.delete()
        # Quarantine the pickle file for analysis
        _quarantine_pickle(filename, ssn=has_ssn, blacklist_file= in_blacklist, invalid_PDF= invalid_PDF)

        return


    put_result, put_msg = _dispatch_put(obj, ppentry)

    if put_result:
        # Put success-- mark this document as available in the DB
        DocumentManager.mark_as_available(filename)

        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=1)
        UploadHandler.do_me_up(docket)


    print "  %s %s" % (filename, put_msg)
Ejemplo n.º 3
0
def _cron_process_docketXML(docket, ppentry):
    '''Upload *docket* to IA, merging with the existing IA copy if any.

    Caller is required to hold the bucket lock for this case.  On
    transient fetch errors the ppentry and its pickle are left in place
    so the next cron run can retry.
    '''

    court = docket.casemeta["court"]
    casenum = docket.casemeta["pacer_case_num"]

    # Force '0' in the XML on docs that failed to upload.
    _update_docs_availability(docket)

    # The docket filename
    docketname = IACommon.get_docketxml_name(court, casenum)

    # Step 1: Try to fetch the existing docket from IA
    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if docketstring:
        # Got the existing docket-- put merged docket file.
        ia_docket, parse_msg = DocketXML.parse_xml_string(docketstring)

        if ia_docket:
            # _cron_me_up merges and uploads; it also cleans up ppentry.
            put_result, put_msg = _cron_me_up(ia_docket, docket, ppentry)

            print "  %s %s" % (docketname, put_msg)
        else:
            print "  %s docket parsing error: %s" % (docketname, parse_msg)

    elif fetcherror is IADirect.FETCH_NO_FILE:
        # Bucket exists but no docket-- put a new docket file.
        put_result, put_msg = put_docket(docket, court, casenum, ppentry)

        print "  %s put into existing bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_NO_BUCKET:
        # Bucket doesn't exist-- make the bucket and put a new docket file.
        put_result, put_msg = put_docket(docket, court, casenum, ppentry,
                                         newbucket=1)

        print "  %s put into new bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_URLERROR:
        # Couldn't get the IA docket

        # Unset the processing flag for later
#        ppentry.processing = 0
#        ppentry.save()
        # Leave the pickle file for later
        # Drop Lock Here?

        print "  %s timed out.  wait for next cron." % (docketname)

    else:
        # Unknown fetch error.

        # Unset the processing flag for later
#        ppentry.processing = 0
#        ppentry.save()
        # Drop Lock Here?

        # Leave the pickle file for later
        print "  %s unknown fetch error.  wait for next cron." % (docketname)
Ejemplo n.º 4
0
def get_lock(court, casenum, uploaderid, one_per_uploader=0):
    """Try to acquire the bucket lock for (court, casenum).

    Returns (nonce, "") on success, or (None, reason) when the case is
    locked.  A lock already held by the same uploader may be re-entered
    unless one_per_uploader is set (which keeps concurrent cron jobs
    from sharing a lock).
    """
    nonce = DocketXML.generate_new_nonce()

    fresh = BucketLock(court=court, casenum=casenum,
                       uploaderid=uploaderid, nonce=nonce)
    try:
        fresh.save()
    except IntegrityError:
        # Row already exists -- somebody holds (or just released) it.
        lockquery = BucketLock.objects.filter(court=court).filter(
            casenum=casenum)
        try:
            held = lockquery[0]
        except IndexError:
            # No lock exists anymore-- must have just missed it.
            return None, "Locked."

        if held.uploaderid != uploaderid:
            return None, "Locked by another user."

        # Same uploader from here on.
        if one_per_uploader:
            # This prevents two cron jobs from requesting the same lock.
            return None, "You already own this lock (Another cron job?)"
        if not held.ready:
            return held.nonce, ""
        if not held.processing:
            # Not currently processing the case, so let the same
            # uploader modify it.
            held.ready = 0
            held.save()
            return held.nonce, ""
        # Ready and processing: treat as locked.
        return None, "Locked by another user."
    else:
        # Fresh lock saved successfully.
        return nonce, ""
Ejemplo n.º 5
0
def delete_documents_from_docket(court, casenum, documents):
    # Step 1: Get docket and convert into DocketXML
    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if not docketstring:
        print "Could not find docket on IA, exiting...."
        exit()

    ia_docket, message = DocketXML.parse_xml_string(docketstring)

    if not ia_docket:
        print "Docket parsing error: %s.%s, exiting...." % (court, casenum)
        exit()

    # Step 2: Remove documents from DocketXML object

    for document in documents:
        ia_docket.remove_document(document.docnum, document.subdocnum)

    # Step 3: upload modified xml
    docketbits = ia_docket.to_xml()

    request = IACommon.make_docketxml_request(docketbits, court, casenum,
                                              ia_docket.casemeta)

    success_status = False
    try:
       response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        if e.code == 201 or e.code == 200: # 201 Created: Success!
            print "Updated %s %s docket.xml" % (court, casenum)
            success_status = True
Ejemplo n.º 6
0
def _cron_fetch_update(lock):
    """Refresh the local DB from the IA docket for the case in *lock*.

    Run by cron.  A falsy nonce is treated as an expired lock, so the
    fetched docket is accepted regardless of its nonce.  On success the
    lock is deleted; on fetch/parse failure the lock is re-queued (or
    deleted outright when expired).
    """
    court = unicode(lock.court)
    casenum = unicode(lock.casenum)
    nonce = unicode(lock.nonce)

    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if not docketstring:
        # Couldn't get the docket.  Try again later.

        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print "  %s.%s couldn't fetch the docket: %d" % (court, casenum,
                                                         fetcherror)
        return

    ia_docket, message = DocketXML.parse_xml_string(docketstring)

    if not ia_docket:
        # Docket parsing error.

        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print "  %s.%s docket parsing error: %s" % (court, casenum, message)
        return
    elif ia_docket.nonce == nonce or not nonce:
        # Got the docket and it is either:
        #  1. up-to-date (nonce match), or
        #  2. expired (ignore nonce)
        # In both scenarios, update the local DB.
        DocumentManager.update_local_db(ia_docket, ignore_available=0)

        print "  %s.%s fetched and DB updated." % (court, casenum)

        # hash(pickle.dumps(...)) serves as a cheap change fingerprint
        # to detect whether the local merge below added anything.
        ia_docket_orig_hash = hash(pickle.dumps(ia_docket))

        local_docket = DocumentManager.create_docket_from_local_documents(
            court, casenum)

        if local_docket:
            ia_docket.merge_docket(local_docket)

        ia_docket_after_local_merge_hash = hash(pickle.dumps(ia_docket))

        if ia_docket_orig_hash != ia_docket_after_local_merge_hash:
            # Local DB knew things IA didn't; schedule an upload.
            print " After fetch, some locally stored information was " \
                  "missing from %s.%s. Local info addition scheduled." % (
                      court, casenum)
            UploadHandler.do_me_up(ia_docket)

        # Remove the lock.
        lock.delete()
    else:
        # Got the docket but it is not up to date.  Try again later.
        BucketLockManager.try_lock_later(lock)
        print "  %s.%s fetched, wait more." % (court, casenum)
Ejemplo n.º 7
0
def _cron_me_up(ia_docket, docket, ppentry):
    """Merge *docket* (plus local-DB info) into *ia_docket* and upload.

    Returns (put_result, message).  When the merge produces no change,
    the queued DB entry and its pickle file are deleted instead of
    uploading, and (False, "not merged: no diff.") is returned.
    """

    ia_court = ia_docket.casemeta["court"]
    ia_casenum = ia_docket.casemeta["pacer_case_num"]

    # Save the original hashes to diff with later --
    # hash(pickle.dumps(...)) acts as a cheap change fingerprint.
    ia_docket_orig_hash = hash(pickle.dumps(ia_docket))
    ia_casemeta_orig_hash = hash(pickle.dumps(ia_docket.casemeta))

    # Merge ia_docket with our local database information to fill in blank
    # fields that may exist in ia
    local_docket = DocumentManager.create_docket_from_local_documents(
        ia_court, ia_casenum, docket)

    if local_docket:
        ia_docket.merge_docket(local_docket)

    ia_docket_after_local_merge_hash = hash(pickle.dumps(ia_docket))

    if ia_docket_orig_hash != ia_docket_after_local_merge_hash:
        print " Some locally stored information was missing from %s.%s. Local info added." % (
            ia_court, ia_casenum)

    # Step 2: Merge new docket into the existing IA docket
    ia_docket.merge_docket(docket)

    # Step 3: If diff, then upload the merged docket
    ia_docket_merged_hash = hash(pickle.dumps(ia_docket))

    if ia_docket_orig_hash != ia_docket_merged_hash:

        # Generate a new nonce for the docket
        ia_docket.nonce = DocketXML.generate_new_nonce()

        # casemeta_diff flags whether the case metadata itself changed.
        ia_casemeta_merged_hash = hash(pickle.dumps(ia_docket.casemeta))
        casemeta_diff = ia_casemeta_orig_hash != ia_casemeta_merged_hash

        # Put the docket to IA
        put_result, put_msg = put_docket(ia_docket,
                                         ia_court,
                                         ia_casenum,
                                         ppentry,
                                         casemeta_diff=casemeta_diff)

        return put_result, "merged: %s" % put_msg

    else:
        # No difference between IA docket and this docket, no need to upload.

        filename = ppentry.filename

        # Delete the entry from the DB
        ppentry.delete()
        # Delete the pickle file
        delete_pickle(filename)

        # Return False to reflect "no update"
        return False, "not merged: no diff."
Ejemplo n.º 8
0
def _cron_fetch_update(lock):
    """Refresh the local DB from the IA docket for the case in *lock*.

    Run by cron.  A falsy nonce is treated as an expired lock, so the
    fetched docket is accepted regardless of its nonce.  On success the
    lock is deleted; on fetch/parse failure the lock is re-queued (or
    deleted outright when expired).
    """
    court = unicode(lock.court)
    casenum = unicode(lock.casenum)
    nonce = unicode(lock.nonce)

    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if not docketstring:
        # Couldn't get the docket.  Try again later.

        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print "  %s.%s couldn't fetch the docket: %d" % (court, casenum,
                                                         fetcherror)
        return

    ia_docket, message = DocketXML.parse_xml_string(docketstring)

    if not ia_docket:
        # Docket parsing error.

        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print "  %s.%s docket parsing error: %s" % (court, casenum,
                                                    message)
        return
    elif ia_docket.nonce == nonce or not nonce:
        # Got the docket and it is either:
        #  1. up-to-date (nonce match), or
        #  2. expired (ignore nonce)
        # In both scenarios, update the local DB.
        DocumentManager.update_local_db(ia_docket, ignore_available=0)

        print "  %s.%s fetched and DB updated." % (court, casenum)

        # hash(pickle.dumps(...)) serves as a cheap change fingerprint
        # to detect whether the local merge below added anything.
        ia_docket_orig_hash = hash(pickle.dumps(ia_docket))

        local_docket = DocumentManager.create_docket_from_local_documents(court, casenum)

        if local_docket:
            ia_docket.merge_docket(local_docket)

        ia_docket_after_local_merge_hash = hash(pickle.dumps(ia_docket))

        if ia_docket_orig_hash != ia_docket_after_local_merge_hash:
            # Local DB knew things IA didn't; schedule an upload.
            print " After fetch, some locally stored information was missing from %s.%s. Local info addition scheduled."  % (court, casenum)
            UploadHandler.do_me_up(ia_docket)

        # Remove the lock.
        lock.delete()
    else:
        # Got the docket but it is not up to date.  Try again later.
        BucketLockManager.try_lock_later(lock)
        print "  %s.%s fetched, wait more." % (court, casenum)
Ejemplo n.º 9
0
def _cron_me_up(ia_docket, docket, ppentry):
    """ Merge and update docket"""

    ia_court = ia_docket.casemeta["court"]
    ia_casenum = ia_docket.casemeta["pacer_case_num"]

    # Save the original hash to diff with later
    ia_docket_orig_hash = hash(pickle.dumps(ia_docket))
    ia_casemeta_orig_hash = hash(pickle.dumps(ia_docket.casemeta))

    # Merge ia_docket with our local database information to fill in blank
    # fields that may exist in ia
    local_docket = DocumentManager.create_docket_from_local_documents(ia_court,
                                                                      ia_casenum,
                                                                      docket)

    if local_docket:
        ia_docket.merge_docket(local_docket)

    ia_docket_after_local_merge_hash = hash(pickle.dumps(ia_docket))

    if ia_docket_orig_hash != ia_docket_after_local_merge_hash:
        print " Some locally stored information was missing from %s.%s. Local info added." % (
            ia_court, ia_casenum)

    # Step 2: Merge new docket into the existing IA docket
    ia_docket.merge_docket(docket)

    # Step 3: If diff, then upload the merged docket
    ia_docket_merged_hash = hash(pickle.dumps(ia_docket))

    if ia_docket_orig_hash != ia_docket_merged_hash:

        # Generate a new nonce for the docket
        ia_docket.nonce = DocketXML.generate_new_nonce()

        ia_casemeta_merged_hash = hash(pickle.dumps(ia_docket.casemeta))
        casemeta_diff = ia_casemeta_orig_hash != ia_casemeta_merged_hash

        # Put the docket to IA
        put_result, put_msg = put_docket(ia_docket, ia_court, ia_casenum,
                                         ppentry, casemeta_diff=casemeta_diff)

        return put_result, "merged: %s" % put_msg

    else:
        # No difference between IA docket and this docket, no need to upload.

        filename = ppentry.filename

        # Delete the entry from the DB
        ppentry.delete()
        # Delete the pickle file
        delete_pickle(filename)

        # Return False to reflect "no update"
        return False, "not merged: no diff."
Ejemplo n.º 10
0
def _get_docket_from_IA(docket):
    """Fetch and parse the IA copy of *docket*'s case docket.

    Returns (ia_docket, fetcherror) on success, or (None, error) when
    the docket could not be fetched or parsed.
    """
    docketstring, fetcherror = IADirect.get_docket_string(
        docket.get_court(), docket.get_casenum())

    if docketstring:
        # Got the existing docket-- parse it.
        ia_docket, parse_msg = DocketXML.parse_xml_string(docketstring)

        if ia_docket:
            return ia_docket, fetcherror
        else:
            # Fix: `docketname` was undefined here, so this branch raised
            # NameError instead of reporting the parse failure.  Build the
            # name the same way the rest of the module does.
            docketname = IACommon.get_docketxml_name(docket.get_court(),
                                                     docket.get_casenum())
            print "  %s docket parsing error: %s" % (docketname, parse_msg)
            return None, parse_msg
    return None, fetcherror
Ejemplo n.º 11
0
def mark_document_as_unavailable(document):
    """Flag *document* as unavailable locally and publish that to IA."""
    document.available = 0
    # Touch lastdate so that archive.recapthelaw will get the update.
    document.lastdate = datetime.datetime.now()
    document.save()

    # Push a docket entry marking this document unavailable.
    unavailable_docket = DocketXML.make_docket_for_pdf(
        "", document.court, document.casenum, document.docnum,
        document.subdocnum, available=0)
    UploadHandler.do_me_up(unavailable_docket)
Ejemplo n.º 12
0
def mark_document_as_unavailable(document):
    """Flag *document* as unavailable locally and publish that to IA."""
    # if not document.available:
    #    print "Exiting: This document isn't currently available on IA"
    #    print usage()
    #    exit()

    document.available = 0
    document.lastdate = datetime.datetime.now()  # this ensures that the archive.recapthelaw will get the update
    document.save()

    # Publish a docket entry marking this document unavailable.
    docket = DocketXML.make_docket_for_pdf("", document.court,
                                           document.casenum, document.docnum,
                                           document.subdocnum, available=0)
    UploadHandler.do_me_up(docket)
Ejemplo n.º 13
0
def _upload_documents(docket, docmap):
    """Upload every pickled PDF in *docmap* for this case.

    Each document's metadata is merged into *docket* and, on a
    successful upload, marked available.  Returns (success, message);
    stops at the first failed upload.
    """
    court = docket.get_court()
    casenum = docket.get_casenum()

    for dockey, pickle_name in docmap.items():
        # TK: abstract this "docnum-subdocnum" split into a separate function
        docnum, subdocnum = dockey.split('-')
        pdfbits = _unpickle_object(pickle_name)

        # Seed the docket with this PDF's metadata (sha1, etc.),
        # initially marked unavailable.
        pdf_docket = DocketXML.make_docket_for_pdf(
            pdfbits, court, casenum, docnum, subdocnum,
            available=0, free_import=1)
        docket.merge_docket(pdf_docket)

        ok, msg = upload_document(pdfbits, court, casenum,
                                  docnum, subdocnum)
        if not ok:
            # TK: I don't think we unlock correctly here
            return False, msg
        docket.set_document_available(docnum, subdocnum, "1")

    return True, "All documents uploaded"
def process_case(casenum):
    """Process one locally-downloaded case directory end to end.

    Grabs the bucket lock, parses the local docket.html, merges it with
    the existing IA docket (if any), uploads each PDF whose sha1 differs
    from IA's copy, then pushes the merged docket back to IA and unlocks.
    Returns True on success, False when the case was skipped or queued
    for retry.

    NOTE(review): relies on module-level names -- `court`, `dirarg`,
    `bucket_made` -- and helpers such as `lock`/`unlock`, `add_to_retry`,
    `del_from_retry`, `add_to_failed`, `make_bucket`,
    `print_unlock_message` defined elsewhere in this script.
    """

    # Setup: Grab the lock.
    got_lock, nonce_or_message = lock(court, casenum)

    if got_lock:
        print "got the lock: %s" % (nonce_or_message)
        nonce = nonce_or_message
    else:
        print "could not get lock: %s" % (nonce_or_message)
        add_to_retry(casenum)
        return False

    casedir = "%s/%s" % (dirarg, casenum)

    # Step 1: Parse the docket.html file.
    try:
        docketpath = "%s/docket.html" % casedir
        docketfile = open(docketpath)
        docketbits = docketfile.read()
        docketfile.close()
    except IOError:
        reason = "could not open local docket"
        print "***Skipping %s.%s: %s... " % (court, casenum, reason),
        print_unlock_message(unlock(court, casenum, False))
        del_from_retry(casenum)
        add_to_failed(casenum, reason)
        return False
    else:
        docket = ParsePacer.parse_histdocqry(docketbits, court, casenum)

    if not docket:
        reason = "could not parse local docket"
        print "***Skipping %s.%s: %s... " % (court, casenum, reason),
        print_unlock_message(unlock(court, casenum, False))
        del_from_retry(casenum)
        add_to_failed(casenum, reason)
        return False

    # Step 1a: Try to fetch the existing IA docket.
    ia_docket = None
    ia_docket_orig_string = ""
    ia_casemeta_orig_hash = ""

    ia_docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if ia_docketstring:

        # Got the existing docket-- parse it.
        ia_docket, parseerror = DocketXML.parse_xml_string(ia_docketstring)
        if not ia_docket:
            reason = "could not parse IA docket: %s" % (parseerror)
            print "***Skipping %s.%s: %s... " % (court, casenum, reason),
            print_unlock_message(unlock(court, casenum, False))
            del_from_retry(casenum)
            add_to_failed(casenum, reason)
            return False
        else:
            # Save the original docket hashes
            ia_docket_orig_string = ia_docketstring
            ia_casemeta_orig_hash = hash(pickle.dumps(ia_docket.casemeta))

    elif fetcherror is IADirect.FETCH_NO_FILE:
        # Bucket exists but no docket-- ok.
        pass

    elif fetcherror is IADirect.FETCH_NO_BUCKET:
        # Bucket doesn't exist-- either make_bucket failed or not yet ready.

        if casenum not in bucket_made:
            # If make_bucket failed, try make_bucket again.
            print "  make bucket...",
            make_bucket(casenum)

    elif fetcherror is IADirect.FETCH_TIMEOUT:
        # Couldn't contact IA, skip.
        print "***Skipping %s.%s: IA is down... " % (court, casenum),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False

    elif not ia_docketstring:
        # Unknown fetch error, skip.
        print "***Skipping %s.%s: unknown docket fetch error: %s..." % \
            (court, casenum, fetcherror),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False

    # Step 1b: If necessary, merge the two dockets.
    if ia_docket:
        ia_docket.merge_docket(docket)
    else:
        ia_docket = docket

    casedir_ls = os.listdir(casedir)

    # Partition the case directory into index pages and PDFs.
    index_ls = []
    pdf_ls = []
    for casedocname in casedir_ls:
        if casedocname.endswith("index.html"):
            index_ls.append(casedocname)
        elif casedocname.endswith(".pdf"):
            pdf_ls.append(casedocname)

    # Step 2: Parse each index file
    for indexname in index_ls:

        try:
            indexpath = "%s/%s" % (casedir, indexname)
            indexfile = open(indexpath)
            indexbits = indexfile.read()
            indexfile.close()
        except IOError:
            print "***Could not open file '%s'" % indexpath
            continue

        # NOTE(review): str.strip removes a *set* of characters from both
        # ends, not a suffix; this only works because docnums are digits.
        docnum = indexname.strip("-index.html")
        index_docket = ParsePacer.parse_doc1(indexbits, court, casenum, docnum)
        # Merge this docket into the IA docket
        ia_docket.merge_docket(index_docket)

    # Set initial flag for retrying this case.
    need_to_retry = 0

    # Step 3: Wait for the bucket to be ready (poll up to 20 times).
    bucketready = False
    for checkcount in xrange(20):
        bucketready, code = IADirect.check_bucket_ready(court, casenum)
        if bucketready:
            break
        else:
            # Wait 5 seconds and try again.
            time.sleep(5)

    if not bucketready:
        print "***Skipping %s.%s: bucket is not ready... " \
            % (court, casenum),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False

    # Step 4: Upload each pdf file.
    doccount = 0
    for pdfname in pdf_ls:
        doccount += 1

        print "  uploading document %d/%d..." % (doccount, len(pdf_ls)),

        try:
            pdfpath = "%s/%s" % (casedir, pdfname)
            pdffile = open(pdfpath)
            pdfbits = pdffile.read()
            pdffile.close()
        except IOError:
            print "***Could not open file '%s'" % pdfpath
            continue

        # NOTE(review): same str.strip caveat as above -- strips the
        # character set ".pdf", safe only for digit/dash filenames.
        pdfname = pdfname.strip(".pdf")
        split = pdfname.split("-")
        try:
            docnum = unicode(int(split[0]))
        except ValueError:
            # Not an integer.
            print "***Docnum not an integer '%s'" % pdfpath
            continue

        try:
            # converting v3->v4 subdocnums
            subdocnum = unicode(int(split[1]) - 1)
        except IndexError:
            subdocnum = "0"

        doc_docket = DocketXML.make_docket_for_pdf(pdfbits, court, casenum,
                                                   docnum, subdocnum)
        doc_meta = doc_docket.get_document_metadict(docnum, subdocnum)

        # Only upload the PDF if the hash doesn't match the one in IA.
        ia_pdfhash = ia_docket.get_document_sha1(docnum, subdocnum)
        pdfhash = doc_docket.get_document_sha1(docnum, subdocnum)

        if ia_pdfhash != pdfhash:
            pdfstatus, pdferror = \
                IADirect.put_pdf(pdfbits, court, casenum,
                                 docnum, subdocnum, doc_meta)

            if not pdfstatus:
                # PUT failed, mark document as unavailable
                doc_docket.set_document_available(docnum, subdocnum, "0")
                print " fail: %s" % pdferror
                need_to_retry = True
                continue
            else:
                print "done."

            # Add this document's metadata into the ia_docket
            ia_docket.merge_docket(doc_docket)

        else:
            print "same."

    # Step 5: Push the docket to IA, if things have changed.
    print "  docket upload...",

    docket_modified = 0
    ignore_nonce = 0
    ia_docket_merged_string = ia_docket.to_xml()

    if ia_docket_orig_string != ia_docket_merged_string:

        # Assign the docket the new nonce from the lock
        ia_docket.nonce = nonce

        ia_casemeta_merged_hash = hash(pickle.dumps(ia_docket.casemeta))
        casemeta_diff = ia_casemeta_orig_hash != ia_casemeta_merged_hash

        putstatus, puterror = \
            IADirect.put_docket(ia_docket, court, casenum,
                                casemeta_diff=casemeta_diff)

        if putstatus:
            docket_modified = 1
            print "done."
        else:
            need_to_retry = 1
            print "fail: %s" % puterror
    else:
        ignore_nonce = 1
        print "same."

    if ignore_nonce:
        print_unlock_message(unlock(court, casenum, ignore_nonce=1))
    else:
        print_unlock_message(unlock(court, casenum, modified=docket_modified))

    if need_to_retry:
        add_to_retry(casenum)
        return False
    else:
        return True
Ejemplo n.º 15
0
def process_case(casenum):
    """Process one locally-downloaded case directory end to end.

    Grabs the bucket lock, parses the local docket.html, merges it with
    the existing IA docket (if any), uploads each PDF whose sha1 differs
    from IA's copy, then pushes the merged docket back to IA and unlocks.
    Returns True on success, False when the case was skipped or queued
    for retry.

    NOTE(review): relies on module-level names -- `court`, `dirarg`,
    `bucket_made` -- and helpers such as `lock`/`unlock`, `add_to_retry`,
    `del_from_retry`, `add_to_failed`, `make_bucket`,
    `print_unlock_message` defined elsewhere in this script.
    """

    # Setup: Grab the lock.
    got_lock, nonce_or_message = lock(court, casenum)

    if got_lock:
        print "got the lock: %s" % (nonce_or_message)
        nonce = nonce_or_message
    else:
        print "could not get lock: %s" % (nonce_or_message)
        add_to_retry(casenum)
        return False

    casedir = "%s/%s" % (dirarg, casenum)

    # Step 1: Parse the docket.html file.
    try:
        docketpath = "%s/docket.html" % casedir
        docketfile = open(docketpath)
        docketbits = docketfile.read()
        docketfile.close()
    except IOError:
        reason = "could not open local docket"
        print "***Skipping %s.%s: %s... " % (court, casenum, reason),
        print_unlock_message(unlock(court, casenum, False))
        del_from_retry(casenum)
        add_to_failed(casenum, reason)
        return False
    else:
        docket = ParsePacer.parse_histdocqry(docketbits, court, casenum)

    if not docket:
        reason = "could not parse local docket"
        print "***Skipping %s.%s: %s... " % (court, casenum, reason),
        print_unlock_message(unlock(court, casenum, False))
        del_from_retry(casenum)
        add_to_failed(casenum, reason)
        return False

    # Step 1a: Try to fetch the existing IA docket.
    ia_docket = None
    ia_docket_orig_string = ""
    ia_casemeta_orig_hash = ""

    ia_docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if ia_docketstring:

        # Got the existing docket-- parse it.
        ia_docket, parseerror = DocketXML.parse_xml_string(ia_docketstring)
        if not ia_docket:
            reason = "could not parse IA docket: %s" % (parseerror)
            print "***Skipping %s.%s: %s... " % (court, casenum, reason),
            print_unlock_message(unlock(court, casenum, False))
            del_from_retry(casenum)
            add_to_failed(casenum, reason)
            return False
        else:
            # Save the original docket hashes
            ia_docket_orig_string = ia_docketstring
            ia_casemeta_orig_hash = hash(pickle.dumps(ia_docket.casemeta))

    elif fetcherror is IADirect.FETCH_NO_FILE:
        # Bucket exists but no docket-- ok.
        pass

    elif fetcherror is IADirect.FETCH_NO_BUCKET:
        # Bucket doesn't exist-- either make_bucket failed or not yet ready.

        if casenum not in bucket_made:
            # If make_bucket failed, try make_bucket again.
            print "  make bucket...",
            make_bucket(casenum)

    elif fetcherror is IADirect.FETCH_TIMEOUT:
        # Couldn't contact IA, skip.
        print "***Skipping %s.%s: IA is down... " % (court, casenum),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False

    elif not ia_docketstring:
        # Unknown fetch error, skip.
        print "***Skipping %s.%s: unknown docket fetch error: %s..." % \
            (court, casenum, fetcherror),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False

    # Step 1b: If necessary, merge the two dockets.
    if ia_docket:
        ia_docket.merge_docket(docket)
    else:
        ia_docket = docket

    casedir_ls = os.listdir(casedir)

    # Partition the case directory into index pages and PDFs.
    index_ls = []
    pdf_ls = []
    for casedocname in casedir_ls:
        if casedocname.endswith("index.html"):
            index_ls.append(casedocname)
        elif casedocname.endswith(".pdf"):
            pdf_ls.append(casedocname)

    # Step 2: Parse each index file
    for indexname in index_ls:

        try:
            indexpath = "%s/%s" % (casedir, indexname)
            indexfile = open(indexpath)
            indexbits = indexfile.read()
            indexfile.close()
        except IOError:
            print "***Could not open file '%s'" % indexpath
            continue

        # NOTE(review): str.strip removes a *set* of characters from both
        # ends, not a suffix; this only works because docnums are digits.
        docnum = indexname.strip("-index.html")
        index_docket = ParsePacer.parse_doc1(indexbits, court,
                                             casenum, docnum)
        # Merge this docket into the IA docket
        ia_docket.merge_docket(index_docket)

    # Set initial flag for retrying this case.
    need_to_retry = 0

    # Step 3: Wait for the bucket to be ready (poll up to 20 times).
    bucketready = False
    for checkcount in xrange(20):
        bucketready, code = IADirect.check_bucket_ready(court, casenum)
        if bucketready:
            break
        else:
            # Wait 5 seconds and try again.
            time.sleep(5)

    if not bucketready:
        print "***Skipping %s.%s: bucket is not ready... " \
            % (court, casenum),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False

    # Step 4: Upload each pdf file.
    doccount = 0
    for pdfname in pdf_ls:
        doccount += 1

        print "  uploading document %d/%d..." % (doccount, len(pdf_ls)),

        try:
            pdfpath = "%s/%s" % (casedir, pdfname)
            pdffile = open(pdfpath)
            pdfbits = pdffile.read()
            pdffile.close()
        except IOError:
            print "***Could not open file '%s'" % pdfpath
            continue

        # NOTE(review): same str.strip caveat as above -- strips the
        # character set ".pdf", safe only for digit/dash filenames.
        pdfname = pdfname.strip(".pdf")
        split = pdfname.split("-")
        try:
            docnum = unicode(int(split[0]))
        except ValueError:
            # Not an integer.
            print "***Docnum not an integer '%s'" % pdfpath
            continue

        try:
            # converting v3->v4 subdocnums
            subdocnum = unicode(int(split[1]) - 1)
        except IndexError:
            subdocnum = "0"

        doc_docket = DocketXML.make_docket_for_pdf(pdfbits, court,
                                                   casenum, docnum,
                                                   subdocnum)
        doc_meta = doc_docket.get_document_metadict(docnum, subdocnum)

        # Only upload the PDF if the hash doesn't match the one in IA.
        ia_pdfhash = ia_docket.get_document_sha1(docnum, subdocnum)
        pdfhash = doc_docket.get_document_sha1(docnum, subdocnum)

        if ia_pdfhash != pdfhash:
            pdfstatus, pdferror = \
                IADirect.put_pdf(pdfbits, court, casenum,
                                 docnum, subdocnum, doc_meta)

            if not pdfstatus:
                # PUT failed, mark document as unavailable
                doc_docket.set_document_available(docnum, subdocnum, "0")
                print " fail: %s" % pdferror
                need_to_retry = True
                continue
            else:
                print "done."

            # Add this document's metadata into the ia_docket
            ia_docket.merge_docket(doc_docket)

        else:
            print "same."


    # Step 5: Push the docket to IA, if things have changed.
    print "  docket upload...",

    docket_modified = 0
    ignore_nonce = 0
    ia_docket_merged_string = ia_docket.to_xml()

    if ia_docket_orig_string != ia_docket_merged_string:

        # Assign the docket the new nonce from the lock
        ia_docket.nonce = nonce

        ia_casemeta_merged_hash = hash(pickle.dumps(ia_docket.casemeta))
        casemeta_diff = ia_casemeta_orig_hash != ia_casemeta_merged_hash

        putstatus, puterror = \
            IADirect.put_docket(ia_docket, court, casenum,
                                casemeta_diff=casemeta_diff)

        if putstatus:
            docket_modified = 1
            print "done."
        else:
            need_to_retry = 1
            print "fail: %s" % puterror
    else:
        ignore_nonce = 1
        print "same."

    if ignore_nonce:
        print_unlock_message(unlock(court, casenum, ignore_nonce=1))
    else:
        print_unlock_message(unlock(court, casenum, modified=docket_modified))

    if need_to_retry:
        add_to_retry(casenum)
        return False
    else:
        return True
Ejemplo n.º 16
0
def handle_pdf(filebits, court, url):
    """ Write PDF file metadata into the database and push the PDF to IA.

    Looks up the Document record via the docid parsed from *url*.  If the
    sha1 of *filebits* differs from the sha1 already recorded for this
    document, the file itself is uploaded to the Internet Archive.  The
    docket update is pushed to IA in either case.

    Returns a plain status string on failure, or a JSON-encoded response
    dict on success.
    """

    # Parse coerced docid out of url -- it is the only identifier we get.
    try:
        docid = docid_from_url_name(url)
    except ValueError:
        logging.warning("handle_pdf: no url available to get docid")
        return "upload: pdf failed. no url supplied."

    # Lookup based on docid b/c it's the only metadata we have.
    # The Document exists only if we've previously parsed the case's docket.
    query = Document.objects.filter(docid=docid)
    try:
        doc = query[0]
    except IndexError:
        logging.info("handle_pdf: haven't yet seen docket %s" % docid)
        return "upload: pdf ignored."
    else:
        # Sanity check: the upload's court must match the stored record.
        if doc.court != court:
            logging.error("handle_pdf: court mismatch (%s, %s) %s" %
                          (court, doc.court, url))
            return "upload: pdf metadata mismatch."

        casenum = doc.casenum
        docnum = doc.docnum
        subdocnum = doc.subdocnum
        sha1 = doc.sha1

    # Docket with updated sha1, available, and upload_date.
    docket = DocketXML.make_docket_for_pdf(filebits, court, casenum,
                                           docnum, subdocnum, available=0)
    DocumentManager.update_local_db(docket)

    # Fixed comma spacing (was "docnum ,subdocnum"), consistent with the
    # team_name variant of this handler.
    if docket.get_document_sha1(docnum, subdocnum) != sha1:

        # Upload the file -- either doesn't exist on IA or has different sha1

        # Gather all the additional metadata we have
        #   - from the docket we just made
        doc_meta = docket.get_document_metadict(docnum, subdocnum)
        #   - from the database, if available
        if doc.docid:
            doc_meta["pacer_doc_id"] = doc.docid
        if doc.de_seq_num:
            doc_meta["pacer_de_seq_num"] = doc.de_seq_num
        if doc.dm_id:
            doc_meta["pacer_dm_id"] = doc.dm_id

        # Push the file to IA
        IA.put_file(filebits, court, casenum, docnum, subdocnum, doc_meta)

    # Whether or not we uploaded the file, push the docket update to IA.
    do_me_up(docket)

    logging.info("handle_pdf: uploaded %s.%s.%s.%s.pdf" % (court, casenum,
                                                           docnum, subdocnum))
    message = "pdf uploaded."

    response = {}
    response["message"] = message
    jsonout = simplejson.dumps(response)

    return jsonout
Ejemplo n.º 17
0
def _cron_process_docketXML(docket, ppentry):
    """ Required to have the lock. """

    court = docket.casemeta["court"]
    casenum = docket.casemeta["pacer_case_num"]

    # Force '0' in the XML on docs that failed to upload.
    _update_docs_availability(docket)

    # The docket filename
    docketname = IACommon.get_docketxml_name(court, casenum)

    # Step 1: Try to fetch the existing docket from IA
    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if docketstring:
        # Got the existing docket-- put merged docket file.
        ia_docket, parse_msg = DocketXML.parse_xml_string(docketstring)

        if ia_docket:
            put_result, put_msg = _cron_me_up(ia_docket, docket, ppentry)

            print "  %s %s" % (docketname, put_msg)
        else:
            print "  %s docket parsing error: %s" % (docketname, parse_msg)

    elif fetcherror is IADirect.FETCH_NO_FILE:
        # Bucket exists but no docket-- put a new docket file.
        put_result, put_msg = put_docket(docket, court, casenum, ppentry)

        print "  %s put into existing bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_NO_BUCKET:
        # Bucket doesn't exist-- make the bucket and put a new docket file.
        put_result, put_msg = put_docket(docket,
                                         court,
                                         casenum,
                                         ppentry,
                                         newbucket=1)

        print "  %s put into new bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_URLERROR:
        # Couldn't get the IA docket

        # Unset the processing flag for later
        # ppentry.processing = 0
        #        ppentry.save()
        # Leave the pickle file for later
        # Drop Lock Here?

        print "  %s timed out.  wait for next cron." % (docketname)

    else:
        # Unknown fetch error.

        # Unset the processing flag for later
        # ppentry.processing = 0
        #        ppentry.save()
        # Drop Lock Here?

        # Leave the pickle file for later
        print "  %s unknown fetch error.  wait for next cron." % (docketname)
Ejemplo n.º 18
0
def handle_pdf(filebits, court, url, team_name):
    """ Write PDF file metadata into the database. """

    # The coerced docid from the URL is the only identifier we receive.
    try:
        docid = docid_from_url_name(url)
    except ValueError:
        logging.warning("handle_pdf: no url available to get docid")
        return "upload: pdf failed. no url supplied."

    # A matching Document row exists only if the case's docket was
    # previously parsed.
    matches = Document.objects.filter(docid=docid)
    try:
        doc = matches[0]
    except IndexError:
        logging.info("handle_pdf: haven't yet seen docket %s" % docid)
        return "upload: pdf ignored because we don't have docket %s" % docid

    # Sanity check: the uploaded court must agree with the stored record.
    if doc.court != court:
        logging.error("handle_pdf: court mismatch (%s, %s) %s" %
                      (court, doc.court, url))
        return "upload: pdf metadata mismatch."

    casenum = doc.casenum
    docnum = doc.docnum
    subdocnum = doc.subdocnum
    sha1 = doc.sha1

    # Build a docket carrying the new sha1, availability, and upload date.
    docket = DocketXML.make_docket_for_pdf(filebits, court, casenum,
                                           docnum, subdocnum, available=0)
    DocumentManager.update_local_db(docket, team_name=team_name)

    if sha1 != docket.get_document_sha1(docnum, subdocnum):

        # New or changed file: collect every piece of metadata we hold --
        # first from the docket we just made...
        doc_meta = docket.get_document_metadict(docnum, subdocnum)
        # ...then from the database, where available.
        for key, value in (("pacer_doc_id", doc.docid),
                           ("pacer_de_seq_num", doc.de_seq_num),
                           ("pacer_dm_id", doc.dm_id)):
            if value:
                doc_meta[key] = value

        # Ship the file off to the Internet Archive.
        IA.put_file(filebits, court, casenum, docnum, subdocnum, doc_meta)

    # The docket update goes to IA whether or not the file itself did.
    do_me_up(docket)

    logging.info("handle_pdf: uploaded %s.%s.%s.%s.pdf" % (court, casenum,
                                                           docnum, subdocnum))

    return simplejson.dumps({"message": "pdf uploaded."})