def upload_docket(docket, nonce):
    """Case should be locked prior to calling this method."""
    ia_docket, message = _get_docket_from_IA(docket)

    if ia_docket:
        docket.merge_docket(ia_docket)
        # Don't upload if nothing has changed.
        if docket == ia_docket:
            return True, 'Unmodified'

    docket.nonce = nonce

    # TK: Check that it's okay to always request that a new bucket be made.
    request = IACommon.make_docketxml_request(docket.to_xml(),
                                              docket.get_court(),
                                              docket.get_casenum(),
                                              docket.casemeta,
                                              makenew=True)

    success, msg = _post_request(request)

    if not success:
        logger.error('XML Docket upload for %s.%s failed: %s',
                     docket.get_court(), docket.get_casenum(), msg)
        return False, msg

    logger.info('XML Docket upload for %s.%s succeeded',
                docket.get_court(), docket.get_casenum())

    # TK: Maybe handle this in a separate function that can deal with HTML?
    # Assuming the XML upload is successful, also upload an update to the
    # HTML page.
    request = IACommon.make_dockethtml_request(docket.to_html(),
                                               docket.get_court(),
                                               docket.get_casenum(),
                                               docket.casemeta)

    success, msg = _post_request(request)

    if not success:
        logger.error('HTML Docket upload for %s.%s failed: %s',
                     docket.get_court(), docket.get_casenum(), msg)
        return False, msg

    logger.info('HTML Docket upload for %s.%s succeeded',
                docket.get_court(), docket.get_casenum())

    return success, msg
def delete_documents_from_docket(court, casenum, documents):
    # Step 1: Get the docket and convert it into a DocketXML object.
    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if not docketstring:
        print "Could not find docket on IA, exiting...."
        exit()

    ia_docket, message = DocketXML.parse_xml_string(docketstring)

    if not ia_docket:
        print "Docket parsing error: %s.%s, exiting...." % (court, casenum)
        exit()

    # Step 2: Remove the documents from the DocketXML object.
    for document in documents:
        ia_docket.remove_document(document.docnum, document.subdocnum)

    # Step 3: Upload the modified XML.
    docketbits = ia_docket.to_xml()
    request = IACommon.make_docketxml_request(docketbits, court, casenum,
                                              ia_docket.casemeta)

    success_status = False
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        if e.code == 201 or e.code == 200:
            # 201 Created: Success!
            print "Updated %s %s docket.xml" % (court, casenum)
            success_status = True
def delete_document_from_IA(document):
    request = IACommon.make_pdf_delete_request(document.court,
                                               document.casenum,
                                               document.docnum,
                                               document.subdocnum)
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        if e.code != 204:
            print " the response to the delete request was %s. This may not be an error" % e.code
def delete_docket_xml_from_IA(court, casenum):
    request = IACommon.make_docketxml_delete_request(court, casenum)
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        if e.code != 204:
            print " the response to the delete request was %s. This may not be an error" % e.code
def add_document_to_blacklist(document):
    BLACKLIST_PATH = "../blacklist"
    f = open(BLACKLIST_PATH, "a")
    f.write(IACommon.get_pdfname(document.court, document.casenum,
                                 document.docnum, document.subdocnum) + "\n")
    f.close()
    print " added document to %s, you may want to add a comment in that file" % BLACKLIST_PATH
def _cron_process_docketXML(docket, ppentry):
    """ Required to have the lock. """

    court = docket.casemeta["court"]
    casenum = docket.casemeta["pacer_case_num"]

    # Force '0' in the XML on docs that failed to upload.
    _update_docs_availability(docket)

    # The docket filename
    docketname = IACommon.get_docketxml_name(court, casenum)

    # Step 1: Try to fetch the existing docket from IA.
    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if docketstring:
        # Got the existing docket-- put the merged docket file.
        ia_docket, parse_msg = DocketXML.parse_xml_string(docketstring)

        if ia_docket:
            put_result, put_msg = _cron_me_up(ia_docket, docket, ppentry)
            print " %s %s" % (docketname, put_msg)
        else:
            print " %s docket parsing error: %s" % (docketname, parse_msg)

    elif fetcherror is IADirect.FETCH_NO_FILE:
        # Bucket exists but no docket-- put a new docket file.
        put_result, put_msg = put_docket(docket, court, casenum, ppentry)
        print " %s put into existing bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_NO_BUCKET:
        # Bucket doesn't exist-- make the bucket and put a new docket file.
        put_result, put_msg = put_docket(docket, court, casenum, ppentry,
                                         newbucket=1)
        print " %s put into new bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_URLERROR:
        # Couldn't get the IA docket.
        # Unset the processing flag for later.
        # ppentry.processing = 0
        # ppentry.save()
        # Leave the pickle file for later.
        # Drop lock here?
        print " %s timed out. wait for next cron." % (docketname)

    else:
        # Unknown fetch error.
        # Unset the processing flag for later.
        # ppentry.processing = 0
        # ppentry.save()
        # Drop lock here?
        # Leave the pickle file for later.
        print " %s unknown fetch error. wait for next cron." % (docketname)
def archive_docket_xml_locally(court, casenum, directory="archived_dockets"):
    docket_url = IACommon.get_docketxml_url(court, casenum)

    if os.system("wget --quiet --directory-prefix=%s %s" % (directory,
                                                            docket_url)) != 0:
        print "Could not archive this docket, exiting without trying to delete..."
        exit()

    print " saved docket %s.%s for analysis in %s directory" % (court, casenum,
                                                                directory)
def archive_document_locally(document, directory="blacklisted_documents"):
    doc_url = IACommon.get_pdf_url(document.court, document.casenum,
                                   document.docnum, document.subdocnum)

    if os.system("wget --quiet --directory-prefix=%s %s" % (directory,
                                                            doc_url)) != 0:
        print "There was an error archiving document (%s.%s.%s.%s); it has been marked as unavailable, but has not been deleted from the Internet Archive" % (document.court, document.casenum, document.docnum, document.subdocnum)
        exit()

    print " saved document %s.%s for analysis in %s directory" % (document.docnum,
                                                                  document.subdocnum,
                                                                  directory)
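# The helpers above are building blocks for a manual takedown: archive local
# copies first, then delete from IA, then record the documents in the
# blacklist so they are not re-uploaded. A minimal driver sketch follows; it
# only calls functions defined in this file, but the exact ordering and the
# surrounding admin script are assumptions, not the project's actual tooling.

def _takedown_documents_sketch(court, casenum, documents):
    """Illustrative only: archive, delete, and blacklist a set of documents."""
    # Keep a local copy of the docket and each document before touching IA.
    archive_docket_xml_locally(court, casenum)
    for document in documents:
        archive_document_locally(document)

    # Remove the PDFs themselves, then strip their entries from the docket XML.
    for document in documents:
        delete_document_from_IA(document)
    delete_documents_from_docket(court, casenum, documents)

    # Record the filenames so the cron uploader's blacklist check skips them.
    for document in documents:
        add_document_to_blacklist(document)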
def check_bucket_ready(court, casenum):
    bucketurl = IACommon.get_bucketcheck_url(court, casenum)
    request = urllib2.Request(bucketurl)
    try:
        response = opener.open(request)
    except urllib2.HTTPError, e:
        # HTTP error: no bucket exists, probably a 404 code.
        return False, int(e.code)
def _cron_process_PDF(obj, ppentry):
    filename = ppentry.filename
    meta = IACommon.get_meta_from_filename(filename)
    court = meta["court"]
    casenum = meta["casenum"]
    docnum = meta["docnum"]
    subdocnum = meta["subdocnum"]

    invalid_PDF = _is_invalid_pdf(obj, filename)

    # We only want to check for SSNs on valid PDFs;
    # PyPdf doesn't deal well with bad input.
    if not invalid_PDF:
        # SSN privacy check
        has_ssn = _has_ssn(obj, filename)
    else:
        has_ssn = False

    # Blacklist file check
    in_blacklist = _in_blacklist(filename)

    if invalid_PDF or has_ssn or in_blacklist:
        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=0)
        UploadHandler.do_me_up(docket)

        # Delete the entry from the DB.
        ppentry.delete()
        # Quarantine the pickle file for analysis.
        _quarantine_pickle(filename, ssn=has_ssn,
                           blacklist_file=in_blacklist,
                           invalid_PDF=invalid_PDF)
        return

    put_result, put_msg = _dispatch_put(obj, ppentry)

    if put_result:
        # Put success-- mark this document as available in the DB.
        DocumentManager.mark_as_available(filename)

        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=1)
        UploadHandler.do_me_up(docket)

    print " %s %s" % (filename, put_msg)
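# _in_blacklist is used above but not defined in this file. Given the
# blacklist file format written by add_document_to_blacklist (one IA PDF
# filename per line, possibly with free-form comment lines), a plausible
# sketch is below; the path handling and the comment convention are
# assumptions.

def _in_blacklist_sketch(filename, blacklist_path="../blacklist"):
    """Illustrative only: return True if filename appears in the blacklist."""
    try:
        f = open(blacklist_path, "r")
    except IOError:
        # No blacklist file means nothing is blacklisted.
        return False
    try:
        for line in f:
            # Ignore blank lines and comment lines added by operators.
            entry = line.strip()
            if not entry or entry.startswith("#"):
                continue
            if entry == filename:
                return True
    finally:
        f.close()
    return False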
def upload_document(pdfbits, court, casenum, docnum, subdocnum):
    logger.info(' Uploading document %s.%s.%s.%s' % (court, casenum,
                                                     docnum, subdocnum))
    request = IACommon.make_pdf_request(pdfbits, court, casenum, docnum,
                                        subdocnum, metadict={}, makenew=True)

    success, msg = _post_request(request)

    if not success:
        logger.error(' Failed to upload document %s.%s.%s.%s' % (court, casenum,
                                                                 docnum,
                                                                 subdocnum))
        return False, msg

    logger.info(' Uploaded document %s.%s.%s.%s' % (court, casenum, docnum,
                                                    subdocnum))
    return success, msg
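# _post_request is the shared helper that upload_docket and upload_document
# rely on, but it is not shown in this file. Based only on how its
# (success, msg) return value is used, a sketch might look like the
# following; treating 200/201 raised through HTTPError as success mirrors the
# inline handling in delete_documents_from_docket, and everything else here
# is an assumption.

def _post_request_sketch(request):
    """Illustrative only: send a prepared IA request, return (success, msg)."""
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        # IA's S3-like endpoint can surface success codes through HTTPError,
        # as the inline handler in delete_documents_from_docket assumes.
        if e.code in (200, 201):
            return True, 'HTTP %d' % e.code
        return False, 'HTTP %d' % e.code
    except urllib2.URLError, e:
        return False, 'URL error: %s' % e.reason
    return True, 'HTTP %d' % response.code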
def put_file(filebits, court, casenum, docnum, subdocnum, metadict={}):
    """ PUT the file into a new Internet Archive bucket. """
    request = IACommon.make_pdf_request(filebits, court, casenum, docnum,
                                        subdocnum, metadict)

    # If this file is already scheduled, drop this one.
    # TK: Is this what we want?
    filename = IACommon.get_pdfname(court, casenum, docnum, subdocnum)
    query = PickledPut.objects.filter(filename=filename)

    if query:
        logging.info("put_file: same file already pickled. %s" % filename)
        return "IA PUT failed: the same file is already in the pickle bucket."

    # Add a PickledPut DB entry to schedule the PUT; not yet ready.
    ppentry = PickledPut(filename=filename)

    # Fix a race case?
    try:
        ppentry.save()
    except IntegrityError:
        logging.info("put_file: same file already pickled. %s" % filename)
        return "IA PUT failed: the same file is already in the pickle bucket."

    # Pickle the request object into the jar.
    pickle_success, message = pickle_object(request, filename)

    if pickle_success:
        # PickledPut now ready for processing.
        ppentry.ready = 1
        ppentry.save()
        logging.info("put_file: ready. %s" % filename)
    else:
        # Could not pickle the object, so remove it from the DB.
        logging.warning("put_file: could not pickle PDF. %s" % filename)
        ppentry.delete()

    return message
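# put_file, do_me_up, and the cron functions below all assume a PickledPut
# Django model that is not defined in this file. A minimal sketch inferred
# from the fields they touch (filename, docket, ready, processing) follows;
# the field types, lengths, and defaults are assumptions.

from django.db import models

class PickledPutSketch(models.Model):
    """Illustrative only: the fields these functions read and write."""
    # IA filename the pickled request or docket is keyed on. The unique
    # constraint is inferred from the IntegrityError handling above.
    filename = models.CharField(max_length=255, unique=True)
    # 1 if the pickle is a DocketXML object rather than a PDF PUT request.
    docket = models.IntegerField(default=0)
    # 1 once the pickle file has been written and can be processed by cron.
    ready = models.IntegerField(default=0)
    # 1 while a cron job is working on this entry.
    processing = models.IntegerField(default=0)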
def delete_document_from_IA(document):
    request = IACommon.make_pdf_delete_request(document.court,
                                               document.casenum,
                                               document.docnum,
                                               document.subdocnum)
    # Debugging output for the outgoing delete request.
    print request
    print request.get_full_url()
    print request.get_method()
    print dir(request)
    try:
        response = urllib2.urlopen(request)
        print response
    except urllib2.HTTPError, e:
        if e.code != 204:
            print " the response to the delete request was %s. This may not be an error" % e.code
            print " response: %s" % e.code
def put_docket(docket, court, casenum, casemeta_diff=1):
    docketbits = docket.to_xml()
    request = IACommon.make_docketxml_request(docketbits, court, casenum,
                                              docket.casemeta)

    put_result, put_msg = _dispatch_direct_put(request)

    if put_result:
        cleanup_docket_put(court, casenum, docket, metadiff=casemeta_diff)

    return put_result, put_msg
def mark_as_available(filename):
    docmeta = IACommon.get_meta_from_filename(filename)

    docquery = Document.objects.filter(court=docmeta["court"],
                                       casenum=docmeta["casenum"],
                                       docnum=docmeta["docnum"],
                                       subdocnum=docmeta["subdocnum"])

    try:
        docentry = docquery[0]
    except IndexError:
        # Unexpected case: no Document entry.
        logging.error("mark_as_available: no entry for %s." % (filename))
    else:
        docentry.available = 1
        try:
            docentry.save()
        except IntegrityError:
            logging.error("mark_as_available: could not save %s." % (filename))
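# The Document model used throughout these functions is also not defined
# here. Pulling together the fields referenced in this file (court, casenum,
# docnum, subdocnum, docid, dm_id, available, lastdate, modified), a rough
# sketch follows; the field types, lengths, and the auto-updated modified
# timestamp are assumptions.

class DocumentSketch(models.Model):
    """Illustrative only: fields that mark_as_available, query, and the
    lookup fragments below expect on the Document model."""
    court = models.CharField(max_length=15)
    casenum = models.CharField(max_length=50)
    docnum = models.CharField(max_length=20)
    subdocnum = models.CharField(max_length=10)
    # PACER identifiers used when matching doc1 / show_doc.pl URLs.
    docid = models.CharField(max_length=50, null=True)
    dm_id = models.IntegerField(null=True)
    # 1 once the PDF has been uploaded to IA.
    available = models.IntegerField(default=0)
    # Timestamps behind the "%m/%d/%y" strings returned to clients.
    lastdate = models.DateTimeField(null=True)
    modified = models.DateTimeField(auto_now=True)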
def get_docket_string(court, casenum):
    docketurl = IACommon.get_docketxml_url(court, casenum)
    request = urllib2.Request(docketurl)

    try:
        response = opener.open(request)
    except urllib2.HTTPError, e:
        # HTTP error
        if e.code == 404:
            bits = e.read()
            # IA returns different 404 pages depending on whether the bucket
            # exists. Checking the page body may be brittle, but there doesn't
            # seem to be a better way to tell the two cases apart.
            if bits.find(NO_BUCKET_HTML_MESSAGE) > 0:
                return None, FETCH_NO_BUCKET

            # Otherwise, assume the bucket exists.
            return None, FETCH_NO_FILE
        else:
            logging.info("get_docket_string: unknown fetch code %d" % e.code)
            return None, FETCH_UNKNOWN
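# get_docket_string and _cron_process_docketXML compare against module-level
# fetch-status constants (FETCH_NO_FILE, FETCH_NO_BUCKET, FETCH_URLERROR,
# FETCH_UNKNOWN) and a NO_BUCKET_HTML_MESSAGE marker string, none of which
# are defined in this file. A sketch of how IADirect might declare them is
# below; the specific values, and especially the marker text, are assumptions.

# Illustrative only: distinct sentinel values so callers can use "is" checks,
# as _cron_process_docketXML does.
FETCH_NO_FILE = 1
FETCH_NO_BUCKET = 2
FETCH_URLERROR = 3
FETCH_UNKNOWN = 4

# Substring of IA's 404 page that only appears when the bucket is missing.
# The real marker text is not shown in this file.
NO_BUCKET_HTML_MESSAGE = "NoSuchBucket"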
def put_docket(docket, court, casenum, ppentry, newbucket=0, casemeta_diff=1):
    # Put the docket to IA.
    docketbits = docket.to_xml()

    request = IACommon.make_docketxml_request(docketbits, court, casenum,
                                              docket.casemeta, newbucket)

    put_result, put_msg = _dispatch_put(request, ppentry)

    if put_result:
        html_put_msg = IADirect.cleanup_docket_put(court, casenum, docket,
                                                   metadiff=casemeta_diff)
        print " gov.uscourts.%s.%s.docket.html upload: %s" % (court,
                                                              unicode(casenum),
                                                              html_put_msg)
        DocumentManager.update_local_db(docket)

    return put_result, put_msg
def _get_documents_dict(court, casenum):
    """ Create a dict containing the info for the docs specified """
    documents = {}

    query = Document.objects.filter(court=court, casenum=casenum)
    if query:
        for document in query:
            if document.docid:
                docmeta = {"casenum": document.casenum,
                           "docnum": document.docnum,
                           "subdocnum": document.subdocnum}

                if document.available:
                    docmeta.update(
                        {"filename": IACommon.get_pdf_url(document.court,
                                                          document.casenum,
                                                          document.docnum,
                                                          document.subdocnum),
                         "timestamp": document.lastdate.strftime("%m/%d/%y")})

                documents[document.docid] = docmeta

    return documents
def put_pdf(filebits, court, casenum, docnum, subdocnum, metadict={}):
    """ PUT the file into a new Internet Archive bucket. """
    request = IACommon.make_pdf_request(filebits, court, casenum, docnum,
                                        subdocnum, metadict)
    return _dispatch_direct_put(request)
    yesterday = datetime.datetime.now() - datetime.timedelta(1)
    old_or_avail_query = doc_query.filter(available=1) | \
                         doc_query.filter(modified__lte=yesterday)

    query = None
    try:
        query = old_or_avail_query[0]
    except IndexError:
        try:
            query = doc_query[0]
        except IndexError:
            query = None
        else:
            ppquery = PickledPut.objects.filter(
                filename=IACommon.get_docketxml_name(court, casenum))
            if len(ppquery) > 0:
                query = None

    if query:
        try:
            # We only have a last date for documents that have been uploaded.
            date = query.lastdate.strftime("%m/%d/%y")
        except AttributeError:
            try:
                date = query.modified.strftime("%m/%d/%y")
            except AttributeError:
                date = "Unknown"

        response = {
            "docket_url": IACommon.get_dockethtml_url(court, casenum),
def put_dockethtml(court, casenum, docket):
    dockethtml = docket.to_html()
    request = IACommon.make_dockethtml_request(dockethtml, court, casenum,
                                               docket.casemeta)
    return _dispatch_direct_put(request)
def query(request):
    """
    Query the database to check which PDF documents we have.

    The json input is {"court": <court>, "urls": <list of PACER doc1 urls>}

    The json output is a set of mappings:
       {<pacer url>: {"filename": <public url>, "timestamp": <last time seen>},
        <pacer url>: ...}
    """
    response = {}

    if request.method != "POST":
        message = "query: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        jsonin = simplejson.loads(request.POST["json"])
    except KeyError:
        message = "query: no 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except ValueError:
        message = "query: malformed 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except IOError:
        # Not something we can fix I don't think. Client fails to send data.
        message = "query: Client read error (Timeout?)"
        logging.warning(message)
        return HttpResponse(message)

    try:
        court = jsonin["court"].strip()
    except KeyError:
        message = "query: missing json 'court' argument."
        logging.warning(message)
        return HttpResponse(message)

    try:
        urls = jsonin["urls"]
    except KeyError:
        message = "query: missing json 'urls' argument."
        logging.warning(message)
        return HttpResponse(message)

    for url in urls:
        # detect show_doc style document links
        sdre = re.search("show_doc\.pl\?(.*)", url)

        if sdre:
            argsstring = sdre.group(1)
            args = argsstring.split("&")
            argsdict = {}
            for arg in args:
                (key, val) = arg.split("=")
                argsdict[key] = val

            # maybe need to add some checks for whether
            # these vars exist in argsdict
            query = Document.objects.filter(court=court) \
                .filter(docnum=argsdict["doc_num"]) \
                .filter(casenum=argsdict["caseid"]) \
                .filter(dm_id=int(argsdict["dm_id"])) \
                .filter(available=1)
        else:
            # otherwise, assume it's a normal doc1 style url
            docid = UploadHandler.docid_from_url_name(url)
            query = Document.objects.filter(docid=docid) \
                .filter(available=1)

        if query:
            query = query[0]
            real_casenum = query.casenum

            response[url] = {
                "filename": IACommon.get_pdf_url(court, real_casenum,
                                                 query.docnum,
                                                 query.subdocnum),
                "timestamp": query.lastdate.strftime("%m/%d/%y")}

            if query.subdocnum == 0:
                subquery = Document.objects.filter(court=court,
                                                   casenum=query.casenum,
                                                   docnum=query.docnum,
                                                   available=1) \
                    .exclude(subdocnum=0)

                if len(subquery) > 0:
                    response[url]["subDocuments"] = {}

                    for subDoc in subquery:
                        real_sub_casenum = subDoc.casenum
                        response[url]["subDocuments"][subDoc.subdocnum] = {
                            "filename": IACommon.get_pdf_url(court,
                                                             real_sub_casenum,
                                                             subDoc.docnum,
                                                             subDoc.subdocnum),
                            "timestamp": subDoc.lastdate.strftime("%m/%d/%y")}

    jsonout = simplejson.dumps(response)

    return HttpResponse(jsonout, mimetype="application/json")
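# For reference, a client round trip through this view might look like the
# sketch below, based only on the docstring and the response-building code
# above. The server URL path, the example court, and the example PACER doc1
# URL are made up; urllib/urllib2 and the same simplejson module used above
# are assumed to be importable.

import urllib

def _query_recap_sketch(server_url, court, urls):
    """Illustrative only: POST a 'json' query to this view and parse the reply."""
    postdata = urllib.urlencode(
        {"json": simplejson.dumps({"court": court, "urls": urls})})
    reply = urllib2.urlopen(server_url, postdata).read()
    # The reply maps each PACER URL we have to a "filename" (public IA URL)
    # and a "timestamp", plus optional "subDocuments".
    return simplejson.loads(reply)

# e.g. _query_recap_sketch("http://example.com/recap/query/", "nysd",
#                          ["https://ecf.nysd.uscourts.gov/doc1/127011111111"])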
def do_me_up(docket):
    """ Download, merge and update the docket with IA. """
    # Pickle this object for do_me_up by the cron process.
    court = docket.get_court()
    casenum = docket.get_casenum()

    docketname = IACommon.get_docketxml_name(court, casenum)

    # Check if this docket is already scheduled to be processed.
    query = PickledPut.objects.filter(filename=docketname)

    try:
        ppentry = query[0]
    except IndexError:
        # Not already scheduled, so schedule it now.
        ppentry = PickledPut(filename=docketname, docket=1)

        try:
            ppentry.save()
        except IntegrityError:
            # Try again.
            do_me_up(docket)
        else:
            # Pickle this object.
            pickle_success, msg = IA.pickle_object(docket, docketname)

            if pickle_success:
                # Ready for processing.
                ppentry.ready = 1
                ppentry.save()
                logging.info("do_me_up: ready. %s" % (docketname))
            else:
                # Pickle failed, remove from DB.
                ppentry.delete()
                logging.error("do_me_up: %s %s" % (msg, docketname))

    else:
        # Already scheduled.
        # If there is a lock for this case, it's being uploaded. Don't merge now.
        locked = BucketLockManager.lock_exists(court, casenum)
        if ppentry.ready and not locked:
            # Docket is waiting to be processed by the cron job.
            # Revert state back to 'not ready' so we can do a local merge.
            ppentry.ready = 0
            ppentry.save()

            # Fetch and unpickle the waiting docket.
            prev_docket, unpickle_msg = IA.unpickle_object(docketname)

            if prev_docket:
                # Do the local merge.
                prev_docket.merge_docket(docket)

                # Pickle it back.
                pickle_success, pickle_msg = \
                    IA.pickle_object(prev_docket, docketname)

                if pickle_success:
                    # Merged and ready.
                    ppentry.ready = 1
                    ppentry.save()
                    logging.info("do_me_up: merged and ready. %s" %
                                 (docketname))
                else:
                    # Re-pickle failed, delete.
                    ppentry.delete()
                    logging.error("do_me_up: re-%s %s" % (pickle_msg,
                                                          docketname))
            else:
                # Unpickle failed.
                ppentry.delete()
                IA.delete_pickle(docketname)
                logging.error("do_me_up: %s %s" % (unpickle_msg, docketname))

        # Ignore any of the other three possible states: another cron job is
        # already doing work on this entity, so don't delete the DB entry or
        # the pickle file.
        elif ppentry.ready and locked:
            pass
            #logging.debug("do_me_up: %s discarded, processing conflict." %
            #              (docketname))
        elif not ppentry.ready and not locked:
            pass
            #logging.debug("do_me_up: %s discarded, preparation conflict." %
            #              (docketname))
        else:
            logging.error("do_me_up: %s discarded, inconsistent state." %
                          (docketname))
def make_new_bucket(court, casenum):
    request = IACommon.make_bucket_request(court, casenum, makenew=1)
    return _dispatch_direct_put(request)
def _cron_put_pickles():
    # Get uploader credentials.
    uploader_query = Uploader.objects.filter(key=AUTH_KEY)
    try:
        RECAP_UPLOADER_ID = uploader_query[0].id
    except IndexError:
        print " could not find uploader with key=%s" % AUTH_KEY
        return

    # Get all ready pickles.
    query = PickledPut.objects.filter(ready=1, processing=0) \
        .order_by('-filename')

    # Set all ready pickles to the processing state.
    # for ppentry in query:
    #     ppentry.processing = 1
    #     ppentry.save()

    # Keep track of court, casenum. Only lock and unlock once for each case.
    curr_court = None
    curr_casenum = None
    lock_nonce = None

    # Process pickles one at a time.
    for ppentry in query:
        filename = ppentry.filename

        ppmeta = IACommon.get_meta_from_filename(filename)

        court = ppmeta["court"]
        casenum = ppmeta["casenum"]

        # Make sure we have the lock for this case.
        if curr_court == court and curr_casenum == casenum:
            # Same case as the previous ppentry.
            if not lock_nonce:
                # Skip if we don't have the lock already.
                # ppentry.processing = 0
                # ppentry.save()
                continue
            # Otherwise, we already have the lock, so continue.
        else:
            # Switching to a new case.

            # Drop the current lock (from the previous case), if necessary.
            if curr_court and curr_casenum:
                dropped, errmsg = BucketLockManager.drop_lock(
                    curr_court, curr_casenum, RECAP_UPLOADER_ID, nolocaldb=1)
                if not dropped:
                    print " %s.%s someone stole my lock?" % \
                        (court, unicode(casenum))

            # Grab a new lock.
            curr_court = court
            curr_casenum = casenum

            lock_nonce, errmsg = BucketLockManager.get_lock(court, casenum,
                                                            RECAP_UPLOADER_ID,
                                                            one_per_uploader=1)
            if not lock_nonce:
                print " Passing on %s.%s: %s" % (court, casenum, errmsg)

            if not lock_nonce or lock_nonce == 'bigdoc':
                # We don't have a lock, so don't drop the lock in the next loop.
                curr_court = None
                curr_casenum = None
                continue

        # We'll always have the lock here.

        # Unpickle the object.
        obj, unpickle_msg = unpickle_object(filename)

        # Two cases for the unpickled object: Request or DocketXML.
        if obj and ppentry.docket:
            print "Processing docket: %s" % filename
            _cron_process_docketXML(obj, ppentry)

        elif obj:
            # Dispatch the PUT request.
            _cron_process_PDF(obj, ppentry)

        else:
            # Unpickling failed.
            # If unpickling fails, it could mean that another cron job
            # has already finished this PP -- not sure how to distinguish this.
            print " %s %s (Another cron job completed?)" % (filename,
                                                            unpickle_msg)

            # Delete the entry from the DB.
            ppentry.delete()
            # Delete the pickle file.
            delete_pickle(filename)

    # Drop the last lock.
    if curr_court and curr_casenum:
        dropped, errmsg = BucketLockManager.drop_lock(curr_court, curr_casenum,
                                                      RECAP_UPLOADER_ID,
                                                      nolocaldb=1)
        if not dropped:
            print " %s.%s someone stole my lock??" % (court, unicode(casenum))
def put_casemeta(court, casenum, metadict={}):
    request = IACommon.make_casemeta_request(court, casenum, metadict)
    return _dispatch_direct_put(request)
        .order_by('-lastdate', '-modified')

    yesterday = datetime.datetime.now() - datetime.timedelta(1)
    old_or_avail_query = doc_query.filter(available=1) \
        | doc_query.filter(modified__lte=yesterday)

    query = None
    try:
        query = old_or_avail_query[0]
    except IndexError:
        try:
            query = doc_query[0]
        except IndexError:
            query = None
        else:
            ppquery = PickledPut.objects.filter(
                filename=IACommon.get_docketxml_name(court, casenum))
            if len(ppquery) > 0:
                query = None

    if query:
        try:
            # We only have a last date for documents that have been uploaded.
            date = query.lastdate.strftime("%m/%d/%y")
        except AttributeError:
            try:
                date = query.modified.strftime("%m/%d/%y")
            except AttributeError:
                date = "Unknown"
def _cron_put_pickles():
    # Get uploader credentials.
    uploader_query = Uploader.objects.filter(key=AUTH_KEY)
    try:
        RECAP_UPLOADER_ID = uploader_query[0].id
    except IndexError:
        print " could not find uploader with key=%s" % AUTH_KEY
        return

    # Get all ready pickles.
    query = PickledPut.objects.filter(ready=1, processing=0) \
        .order_by('-filename')

    # Set all ready pickles to the processing state.
    #for ppentry in query:
    #    ppentry.processing = 1
    #    ppentry.save()

    # Keep track of court, casenum. Only lock and unlock once for each case.
    curr_court = None
    curr_casenum = None
    lock_nonce = None

    # Process pickles one at a time.
    for ppentry in query:
        filename = ppentry.filename
        ppmeta = IACommon.get_meta_from_filename(filename)
        court = ppmeta["court"]
        casenum = ppmeta["casenum"]

        # Make sure we have the lock for this case.
        if curr_court == court and curr_casenum == casenum:
            # Same case as the previous ppentry.
            if not lock_nonce:
                # Skip if we don't have the lock already.
                # ppentry.processing = 0
                # ppentry.save()
                continue
            # Otherwise, we already have the lock, so continue.
        else:
            # Switching to a new case.

            # Drop the current lock (from the previous case), if necessary.
            if curr_court and curr_casenum:
                dropped, errmsg = BucketLockManager.drop_lock(curr_court,
                                                              curr_casenum,
                                                              RECAP_UPLOADER_ID,
                                                              nolocaldb=1)
                if not dropped:
                    print " %s.%s someone stole my lock?" % \
                        (court, unicode(casenum))

            # Grab a new lock.
            curr_court = court
            curr_casenum = casenum

            lock_nonce, errmsg = BucketLockManager.get_lock(court, casenum,
                                                            RECAP_UPLOADER_ID,
                                                            one_per_uploader=1)
            if not lock_nonce:
                print " Passing on %s.%s: %s" % (court, casenum, errmsg)
                # We don't have a lock, so don't drop the lock in the next loop.
                curr_court = None
                curr_casenum = None
                continue

        # We'll always have the lock here.

        # Unpickle the object.
        obj, unpickle_msg = unpickle_object(filename)

        # Two cases for the unpickled object: Request or DocketXML.
        if obj and ppentry.docket:
            _cron_process_docketXML(obj, ppentry)
        elif obj:
            # Dispatch the PUT request.
            _cron_process_PDF(obj, ppentry)
        else:
            # Unpickling failed.
            # If unpickling fails, it could mean that another cron job
            # has already finished this PP -- not sure how to distinguish this.
            print " %s %s (Another cron job completed?)" % (filename,
                                                            unpickle_msg)

            # Delete the entry from the DB.
            ppentry.delete()
            # Delete the pickle file.
            delete_pickle(filename)

    # Drop the last lock.
    if curr_court and curr_casenum:
        dropped, errmsg = BucketLockManager.drop_lock(curr_court, curr_casenum,
                                                      RECAP_UPLOADER_ID,
                                                      nolocaldb=1)
        if not dropped:
            print " %s.%s someone stole my lock??" % (court, unicode(casenum))