def create_docket_pickles(path, court): #check if docket pickle directory exists and pickle generation has completed opinion_reports= glob.glob(os.path.join(path, "*.opinions.*")) try: os.mkdir(os.path.join(path, 'docket_pickles')) except OSError: #delete docket_pickles pass for report in opinion_reports: filebits = open(report).read() dockets = PP.parse_opinions(filebits, court) if dockets: print "Found %s dockets in %s " % (len(dockets), report) for docket in dockets: if len(docket.documents) != 1: raise "This docket has more than one document! docket text: " % docket doc = docket.documents.values()[0] filename = _get_docket_pickle_filename(court, doc['casenum'], doc['doc_num'], doc['attachment_num']) success, msg = IA.pickle_object(docket, filename, os.path.join(path, "docket_pickles")) if not success: print "Error pickling file %s: %s " % (filename, msg)
# Script entry point: upload the entire "coverartarchive" collection
# to the Internet Archive via the InternetArchive wrapper class.
from InternetArchive import *

collection_name = "coverartarchive"
ia = InternetArchive(collection_name)
ia.upload_collection()
def handle_pdf(filebits, court, url):
    """ Write PDF file metadata into the database. """
    # The docid is the only metadata we can recover from the upload URL.
    try:
        docid = docid_from_url_name(url)
    except ValueError:
        logging.warning("handle_pdf: no url available to get docid")
        return "upload: pdf failed. no url supplied."

    # A Document row exists only if we've previously parsed this case's
    # docket; look it up by docid.
    matches = Document.objects.filter(docid=docid)
    try:
        doc = matches[0]
    except IndexError:
        logging.info("handle_pdf: haven't yet seen docket %s" % (docid))
        return "upload: pdf ignored."

    # Sanity check: the stored court must match the submitted one.
    if doc.court != court:
        logging.error("handle_pdf: court mismatch (%s, %s) %s" %
                      (court, doc.court, url))
        return "upload: pdf metadata mismatch."

    casenum = doc.casenum
    docnum = doc.docnum
    subdocnum = doc.subdocnum
    stored_sha1 = doc.sha1

    # Build a docket fragment carrying the new sha1 / availability /
    # upload date, and merge it into the local DB.
    docket = DocketXML.make_docket_for_pdf(filebits, court, casenum,
                                           docnum, subdocnum, available=0)
    DocumentManager.update_local_db(docket)

    if docket.get_document_sha1(docnum, subdocnum) != stored_sha1:
        # File is new to IA or its content changed: upload it, attaching
        # every extra piece of metadata we have on hand.
        doc_meta = docket.get_document_metadict(docnum, subdocnum)
        for meta_key, meta_val in (("pacer_doc_id", doc.docid),
                                   ("pacer_de_seq_num", doc.de_seq_num),
                                   ("pacer_dm_id", doc.dm_id)):
            if meta_val:
                doc_meta[meta_key] = meta_val
        IA.put_file(filebits, court, casenum, docnum, subdocnum, doc_meta)

    # Whether or not the PDF itself was uploaded, push the docket update
    # to IA.
    do_me_up(docket)
    logging.info("handle_pdf: uploaded %s.%s.%s.%s.pdf" %
                 (court, casenum, docnum, subdocnum))

    response = {}
    response["message"] = "pdf uploaded."
    return simplejson.dumps(response)
def do_me_up(docket): ''' Download, merge and update the docket with IA. ''' # Pickle this object for do_me_up by the cron process. court = docket.get_court() casenum = docket.get_casenum() docketname = IACommon.get_docketxml_name(court, casenum) # Check if this docket is already scheduled to be processed. query = PickledPut.objects.filter(filename=docketname) try: ppentry = query[0] except IndexError: # Not already scheduled, so schedule it now. ppentry = PickledPut(filename=docketname, docket=1) try: ppentry.save() except IntegrityError: # Try again. do_me_up(docket) else: # Pickle this object. pickle_success, msg = IA.pickle_object(docket, docketname) if pickle_success: # Ready for processing. ppentry.ready = 1 ppentry.save() logging.info("do_me_up: ready. %s" % (docketname)) else: # Pickle failed, remove from DB. ppentry.delete() logging.error("do_me_up: %s %s" % (msg, docketname)) else: # Already scheduled. # If there is a lock for this case, it's being uploaded. Don't merge now locked = BucketLockManager.lock_exists(court, casenum) if ppentry.ready and not locked: # Docket is waiting to be processed by cron job. # Revert state back to 'not ready' so we can do local merge. ppentry.ready = 0 ppentry.save() # Fetch and unpickle the waiting docket. prev_docket, unpickle_msg = IA.unpickle_object(docketname) if prev_docket: # Do the local merge. prev_docket.merge_docket(docket) # Pickle it back pickle_success, pickle_msg = \ IA.pickle_object(prev_docket, docketname) if pickle_success: # Merged and ready. ppentry.ready = 1 ppentry.save() logging.info("do_me_up: merged and ready. %s" %(docketname)) else: # Re-pickle failed, delete. ppentry.delete() logging.error("do_me_up: re-%s %s" % (pickle_msg, docketname)) else: # Unpickle failed ppentry.delete() IA.delete_pickle(docketname) logging.error("do_me_up: %s %s" % (unpickle_msg, docketname)) # Ignore if in any of the other three possible state... 
# because another cron job is already doing work on this entity # Don't delete DB entry or pickle file. elif ppentry.ready and locked: pass #logging.debug("do_me_up: %s discarded, processing conflict." % # (docketname)) elif not ppentry.ready and not locked: pass #logging.debug("do_me_up: %s discarded, preparation conflict." % # (docketname)) else: logging.error("do_me_up: %s discarded, inconsistent state." % (docketname))
def handle_pdf(filebits, court, url, team_name):
    """ Write PDF file metadata into the database. """
    # The upload URL is our only source for the docid; bail if absent.
    try:
        docid = docid_from_url_name(url)
    except ValueError:
        logging.warning("handle_pdf: no url available to get docid")
        return "upload: pdf failed. no url supplied."

    # We only know this document if its case docket was parsed earlier.
    candidates = Document.objects.filter(docid=docid)
    try:
        doc = candidates[0]
    except IndexError:
        logging.info("handle_pdf: haven't yet seen docket %s" % docid)
        return "upload: pdf ignored because we don't have docket %s" % docid

    # Sanity check against metadata tampering / mixed-up uploads.
    if doc.court != court:
        logging.error("handle_pdf: court mismatch (%s, %s) %s" %
                      (court, doc.court, url))
        return "upload: pdf metadata mismatch."

    casenum = doc.casenum
    docnum = doc.docnum
    subdocnum = doc.subdocnum
    stored_sha1 = doc.sha1

    # Docket fragment with the updated sha1, availability and upload
    # date; merge it into the local DB, crediting team_name.
    docket = DocketXML.make_docket_for_pdf(filebits, court, casenum,
                                           docnum, subdocnum, available=0)
    DocumentManager.update_local_db(docket, team_name=team_name)

    if docket.get_document_sha1(docnum, subdocnum) != stored_sha1:
        # Either IA doesn't have the file yet or its content changed:
        # push it up, with every bit of metadata we can attach.
        doc_meta = docket.get_document_metadict(docnum, subdocnum)
        for meta_key, meta_val in (("pacer_doc_id", doc.docid),
                                   ("pacer_de_seq_num", doc.de_seq_num),
                                   ("pacer_dm_id", doc.dm_id)):
            if meta_val:
                doc_meta[meta_key] = meta_val
        IA.put_file(filebits, court, casenum, docnum, subdocnum, doc_meta)

    # Push the docket update to IA regardless of whether the PDF itself
    # was uploaded.
    do_me_up(docket)
    logging.info("handle_pdf: uploaded %s.%s.%s.%s.pdf" %
                 (court, casenum, docnum, subdocnum))

    return simplejson.dumps({"message": "pdf uploaded."})
def do_me_up(docket): """ Download, merge and update the docket with IA. """ # Pickle this object for do_me_up by the cron process. court = docket.get_court() casenum = docket.get_casenum() docketname = IACommon.get_docketxml_name(court, casenum) # Check if this docket is already scheduled to be processed. query = PickledPut.objects.filter(filename=docketname) try: ppentry = query[0] except IndexError: # Not already scheduled, so schedule it now. ppentry = PickledPut(filename=docketname, docket=1) try: ppentry.save() except IntegrityError: # Try again. do_me_up(docket) else: # Pickle this object. pickle_success, msg = IA.pickle_object(docket, docketname) if pickle_success: # Ready for processing. ppentry.ready = 1 ppentry.save() logging.info("do_me_up: ready. %s" % (docketname)) else: # Pickle failed, remove from DB. ppentry.delete() logging.error("do_me_up: %s %s" % (msg, docketname)) else: # Already scheduled. # If there is a lock for this case, it's being uploaded. Don't merge now locked = BucketLockManager.lock_exists(court, casenum) if ppentry.ready and not locked: # Docket is waiting to be processed by cron job. # Revert state back to 'not ready' so we can do local merge. ppentry.ready = 0 ppentry.save() # Fetch and unpickle the waiting docket. prev_docket, unpickle_msg = IA.unpickle_object(docketname) if prev_docket: # Do the local merge. prev_docket.merge_docket(docket) # Pickle it back pickle_success, pickle_msg = \ IA.pickle_object(prev_docket, docketname) if pickle_success: # Merged and ready. ppentry.ready = 1 ppentry.save() logging.info( "do_me_up: merged and ready. %s" % (docketname)) else: # Re-pickle failed, delete. ppentry.delete() logging.error("do_me_up: re-%s %s" % (pickle_msg, docketname)) else: # Unpickle failed ppentry.delete() IA.delete_pickle(docketname) logging.error("do_me_up: %s %s" % (unpickle_msg, docketname)) # Ignore if in any of the other three possible state... 
# because another cron job is already doing work on this entity # Don't delete DB entry or pickle file. elif ppentry.ready and locked: pass #logging.debug("do_me_up: %s discarded, processing conflict." % # (docketname)) elif not ppentry.ready and not locked: pass #logging.debug("do_me_up: %s discarded, preparation conflict." % # (docketname)) else: logging.error("do_me_up: %s discarded, inconsistent state." % (docketname))
def _upload_document(path, court, document): filename = _get_docket_pickle_filename(court, document['casenum'], document['doc_num'], document['attachment_num']) docket, msg = IA.unpickle_object(filename, os.path.join(path, 'docket_pickles')) if not docket: return False, 'Could not unpickle: %s' % msg casenum = docket.get_casenum() got_lock, nonce_or_message = UM.lock(court, casenum) # We need to: grab a lock if got_lock: print "got the lock: %s" % (nonce_or_message) nonce = nonce_or_message else: return False, "could not get lock: %s" % (nonce_or_message) # Get the existing ia docket, if it exists ia_docket = None ia_docket_orig_string = "" ia_casemeta_orig_hash = "" ia_docketstring, fetcherror = IADirect.get_docket_string(court, casenum) if ia_docketstring: # Got the existing docket-- parse it. ia_docket, parseerror = DocketXML.parse_xml_string(ia_docketstring) if not ia_docket: reason = "could not parse IA docket: %s" % (parseerror) UM.print_unlock_message(UM.unlock(court, casenum, False)) return False, "***Skipping %s.%s: %s... " % (court, casenum, reason), else: # Save the original docket hashes ia_docket_orig_string = ia_docketstring ia_casemeta_orig_hash = hash(pickle.dumps(ia_docket.casemeta)) print "There is a docket for %s, %s! " % (court, casenum) elif fetcherror is IADirect.FETCH_NO_FILE: # Bucket exists but no docket-- ok. pass elif fetcherror is IADirect.FETCH_NO_BUCKET: # Bucket doesn't exist-- either make_bucket failed or not yet ready. # That's okay, we'll make the bucket with the first upload #if casenum not in bucket_made: # If make_bucket failed, try make_bucket again. # print " make bucket...", # make_bucket(casenum) elif fetcherror is IADirect.FETCH_TIMEOUT: # Couldn't contact IA, skip. UM.print_unlock_message(UM.unlock(court, casenum, False)) #TK: Handle retry logic here? return False, "***Skipping %s.%s: IA is down... " % (court, casenum), elif not ia_docketstring: # Unknown fetch error, skip. 
UM.print_unlock_message(UM.unlock(court, casenum, False)) return False, "***Skipping %s.%s: unknown docket fetch error: %s..." % \ (court, casenum, fetcherror), # Step 1b: If necessary, merge the two dockets. if ia_docket: ia_docket.merge_docket(docket) else: ia_docket = docket # Upload the pdf #TK: add some better status updates here, maybe uploading doc 123 of 1234 print " uploading document %s.%s.%s..." % (court, casenum, document['doc_num']), try: doc_filename = os.path.join(path, document['docid'], ".pdf" pdfbits = open(doc_filename)).read() except IOError: UM.print_unlock_message(UM.unlock(court, casenum, False)) return False, " ***Could not open file %s " % doc_filename #TK: probably need to make the bucket before doing this doc_docket = DocketXML.make_docket_for_pdf(pdfbits, court, casenum, docnum, subdocnum) doc_meta = doc_docket.get_document_metadict(docnum, subdocnum) # Only upload the PDF if the hash doesn't match the one in IA. ia_pdfhash = ia_docket.get_document_sha1(docnum, subdocnum) pdfhash = doc_docket.get_document_sha1(docnum, subdocnum) if ia_pdfhash != pdfhash: pdfstatus, pdferror = \ IADirect.put_pdf(pdfbits, court, casenum, docnum, subdocnum, doc_meta) if not pdfstatus: # PUT failed, mark document as unavailable doc_docket.set_document_available(docnum, subdocnum, "0") # TK: handle retry here UM.print_unlock_message(UM.unlock(court, casenum, False)) return False, " fail: %s" % pdferror else: print "done." # Add this document's metadata into the ia_docket ia_docket.merge_docket(doc_docket) # Step 5: Push the docket to IA, if things have changed. 
print " docket upload...", docket_modified = 0 ignore_nonce = 0 ia_docket_merged_string = ia_docket.to_xml() if ia_docket_orig_string != ia_docket_merged_string: # Assign the docket the new nonce from the lock ia_docket.nonce = nonce ia_casemeta_merged_hash = hash(pickle.dumps(ia_docket.casemeta)) casemeta_diff = ia_casemeta_orig_hash != ia_casemeta_merged_hash putstatus, puterror = \ IADirect.put_docket(ia_docket, court, casenum, casemeta_diff=casemeta_diff) UM.print_unlock_message(UM.unlock(court, casenum, modified = False)) return True, "Document uploaded"