def _cron_fetch_update(lock):
    court = unicode(lock.court)
    casenum = unicode(lock.casenum)
    nonce = unicode(lock.nonce)

    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if not docketstring:
        # Couldn't get the docket.  Try again later.
        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print " %s.%s couldn't fetch the docket: %d" % (court, casenum,
                                                        fetcherror)
        return

    ia_docket, message = DocketXML.parse_xml_string(docketstring)

    if not ia_docket:
        # Docket parsing error.
        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print " %s.%s docket parsing error: %s" % (court, casenum, message)
        return
    elif ia_docket.nonce == nonce or not nonce:
        # Got the docket and it is either:
        #  1. up-to-date (nonce match), or
        #  2. expired (ignore nonce)
        # In both scenarios, update the local DB.
        DocumentManager.update_local_db(ia_docket, ignore_available=0)

        print " %s.%s fetched and DB updated." % (court, casenum)

        # Fingerprint the docket so we can tell below whether the local
        # merge actually added anything.
        ia_docket_orig_hash = hash(pickle.dumps(ia_docket))

        local_docket = DocumentManager.create_docket_from_local_documents(
            court, casenum)

        if local_docket:
            ia_docket.merge_docket(local_docket)

        ia_docket_after_local_merge_hash = hash(pickle.dumps(ia_docket))

        if ia_docket_orig_hash != ia_docket_after_local_merge_hash:
            print " After fetch, some locally stored information was " \
                  "missing from %s.%s. Local info addition scheduled." % (
                      court, casenum)
            UploadHandler.do_me_up(ia_docket)

        # Remove the lock.
        lock.delete()
    else:
        # Got the docket, but it is not up to date.  Try again later.
        BucketLockManager.try_lock_later(lock)
        print " %s.%s fetched, wait more." % (court, casenum)
def _cron_process_PDF(obj, ppentry):
    filename = ppentry.filename
    meta = IACommon.get_meta_from_filename(filename)

    court = meta["court"]
    casenum = meta["casenum"]
    docnum = meta["docnum"]
    subdocnum = meta["subdocnum"]

    invalid_PDF = _is_invalid_pdf(obj, filename)

    # We only want to check for SSNs on valid PDFs;
    # PyPdf doesn't deal well with bad input.
    if not invalid_PDF:
        # SSN privacy check.
        has_ssn = _has_ssn(obj, filename)
    else:
        has_ssn = False

    # Blacklist file check.
    in_blacklist = _in_blacklist(filename)

    if invalid_PDF or has_ssn or in_blacklist:
        # Don't upload the PDF; instead, mark it as unavailable on IA.
        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=0)
        UploadHandler.do_me_up(docket)

        # Delete the entry from the DB.
        ppentry.delete()
        # Quarantine the pickle file for analysis.
        _quarantine_pickle(filename, ssn=has_ssn,
                           blacklist_file=in_blacklist,
                           invalid_PDF=invalid_PDF)
        return

    put_result, put_msg = _dispatch_put(obj, ppentry)

    if put_result:
        # Put success -- mark this document as available in the DB.
        DocumentManager.mark_as_available(filename)

        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=1)
        UploadHandler.do_me_up(docket)

    print " %s %s" % (filename, put_msg)
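# The helpers used above (_is_invalid_pdf, _has_ssn, _in_blacklist,
# _quarantine_pickle, _dispatch_put) are defined elsewhere in this module.
# For orientation only, a minimal sketch of what an SSN scan like _has_ssn
# might look like -- an assumed implementation, not the real one; real code
# would first extract text from the PDF:
#
#     import re
#
#     SSN_RE = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")
#
#     def _has_ssn_sketch(text):
#         # True if the extracted PDF text contains an SSN-shaped number.
#         return bool(SSN_RE.search(text))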
def mark_document_as_unavailable(document):
    # if not document.available:
    #     print "Exiting: This document isn't currently available on IA"
    #     print usage()
    #     exit()

    document.available = 0
    # This ensures that archive.recapthelaw will get the update.
    document.lastdate = datetime.datetime.now()
    document.save()

    docket = DocketXML.make_docket_for_pdf("", document.court,
                                           document.casenum,
                                           document.docnum,
                                           document.subdocnum,
                                           available=0)
    UploadHandler.do_me_up(docket)
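# A minimal usage sketch, assuming a Django shell and the Document model's
# fields seen elsewhere in this codebase (court, casenum, docnum, subdocnum).
# The specific field values below are illustrative only:
#
#     doc = Document.objects.get(court="cand", casenum="12345",
#                                docnum="1", subdocnum="0")
#     mark_document_as_unavailable(doc)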
def query(request):
    """Query the database to check which PDF documents we have.

    The json input is {"court": <court>,
                       "urls": <list of PACER doc1 urls>}

    The json output is a set of mappings:
      {<pacer url>: {"filename": <public url>,
                     "timestamp": <last time seen>},
       <pacer url>: ...}
    """
    response = {}

    if request.method != "POST":
        message = "query: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        jsonin = simplejson.loads(request.POST["json"])
    except KeyError:
        message = "query: no 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except ValueError:
        message = "query: malformed 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except IOError:
        # Not something we can fix, I don't think.  Client fails to send data.
        message = "query: Client read error (Timeout?)"
        logging.warning(message)
        return HttpResponse(message)

    try:
        court = jsonin["court"].strip()
    except KeyError:
        message = "query: missing json 'court' argument."
        logging.warning(message)
        return HttpResponse(message)

    try:
        urls = jsonin["urls"]
    except KeyError:
        message = "query: missing json 'urls' argument."
        logging.warning(message)
        return HttpResponse(message)

    for url in urls:
        # Detect show_doc-style document links.
        sdre = re.search(r"show_doc\.pl\?(.*)", url)

        if sdre:
            argsstring = sdre.group(1)
            args = argsstring.split("&")
            argsdict = {}
            for arg in args:
                (key, val) = arg.split("=")
                argsdict[key] = val

            # TODO: maybe add some checks for whether these vars exist
            # in argsdict.
            query = Document.objects.filter(court=court) \
                .filter(docnum=argsdict["doc_num"]) \
                .filter(casenum=argsdict["caseid"]) \
                .filter(dm_id=int(argsdict["dm_id"])) \
                .filter(available=1)
        else:
            # Otherwise, assume it's a normal doc1-style url.
            docid = UploadHandler.docid_from_url_name(url)
            query = Document.objects.filter(docid=docid) \
                .filter(available=1)

        if query:
            query = query[0]
            real_casenum = query.casenum

            response[url] = {
                "filename": IACommon.get_pdf_url(court, real_casenum,
                                                 query.docnum,
                                                 query.subdocnum),
                "timestamp": query.lastdate.strftime("%m/%d/%y")}

            if query.subdocnum == 0:
                subquery = Document.objects.filter(court=court,
                                                   casenum=query.casenum,
                                                   docnum=query.docnum,
                                                   available=1) \
                    .exclude(subdocnum=0)

                if len(subquery) > 0:
                    response[url]["subDocuments"] = {}

                    for subDoc in subquery:
                        real_sub_casenum = subDoc.casenum
                        response[url]["subDocuments"][subDoc.subdocnum] = {
                            "filename": IACommon.get_pdf_url(
                                court, real_sub_casenum,
                                subDoc.docnum, subDoc.subdocnum),
                            "timestamp": subDoc.lastdate.strftime("%m/%d/%y")}

    jsonout = simplejson.dumps(response)
    return HttpResponse(jsonout, mimetype="application/json")
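# An example of the wire format this view speaks; the court, docid, and
# hostname values are illustrative only:
#
#   POST json={"court": "cand",
#              "urls": ["https://ecf.cand.uscourts.gov/doc1/03512398765"]}
#
#   Response:
#     {"https://ecf.cand.uscourts.gov/doc1/03512398765":
#        {"filename": "<public IA url for the PDF>",
#         "timestamp": "01/31/11"}}
#
# Unknown or unavailable documents are simply omitted from the response,
# since entries are added only when a query with available=1 matches.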
        logging.error(message)
        return HttpResponse(message)

    # Necessary to preserve backwards compatibility with 0.6.
    # This param prevents tons of garbage from being printed to
    # the error console after an Adddocmeta request.
    try:
        add_case_info = request.POST["add_case_info"]
    except KeyError:
        add_case_info = None

    DocumentManager.handle_adddocmeta(docid, court, casenum, de_seq_num,
                                      dm_id, docnum, subdocnum)
    if add_case_info:
        response = {
            "documents": UploadHandler._get_documents_dict(court, casenum),
            "message": "adddocmeta: DB updated for docid=%s" % (docid)
        }
        message = simplejson.dumps(response)
    else:
        message = "adddocmeta: DB updated for docid=%s" % (docid)

    return HttpResponse(message)


def lock(request):
    try:
        key = request.GET["key"].strip()
        court = request.GET["court"].strip()
        casenum = request.GET["casenum"].strip()
        one_per_uploader = 1 if request.GET.get('one_per_uploader') else 0
    except KeyError:
" The contents of filename should have a single case per line, each identified by 'court casenum'\n " ) sys.exit(1) cases_to_repair = read_in_cases_to_repair(sys.argv[1]) for case in cases_to_repair: court = case[0] casenum = case[1] print "Repairing case %s.%s...." % (court, casenum) docket = DocumentManager.create_docket_from_local_documents( court, casenum) if docket: # this will merge our docket with existing one on IA UploadHandler.do_me_up(docket) else: print " Could not create docket from local documents for %s %s" % ( court, casenum) # for each case, create docket fromlocal # call do_me_up(docket) # download ia_docket # # merge ia docket and local docket # # if there is a difference, schedule the docket for upload (created a pickled put?)
def upload(request):
    """Public upload view for all incoming data."""
    if request.method != "POST":
        message = "upload: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        if not request.FILES:
            message = "upload: No request.FILES attribute."
            logging.error(message)
            return HttpResponse(message)
    except IOError:
        # Not something we can fix, I don't think.  Client fails to send data.
        message = "Client read error (Timeout?)"
        logging.warning("upload: %s" % message)
        return HttpResponse(message)
    except SystemError:
        message = "Could not parse POST arguments."
        logging.warning("upload: %s" % message)
        return HttpResponse(message)

    try:
        data = request.FILES["data"]
    except KeyError:
        try:
            # TK: Only used in testing - get rid of me.
            data = request.FILES["data_file"]
        except KeyError:
            message = "upload: No FILES 'data' attribute."
            logging.error(message)
            return HttpResponse(message)

    try:
        court = request.POST["court"]
    except KeyError:
        message = "upload: No POST 'court' attribute."
        logging.error(message)
        return HttpResponse(message)
    else:
        court = court.strip()

    if request.POST.get("casenum"):
        casenum = request.POST["casenum"].strip()
        casenum_re = re.compile(r'\d+(-\d+)?')
        if not casenum_re.match(casenum) or ":" in casenum:
            message = "upload: 'casenum' invalid: %s" % request.POST["casenum"]
            logging.error(message)
            return HttpResponse(message)
    else:
        casenum = None

    try:
        mimetype = request.POST["mimetype"].strip()
    except KeyError:
        message = "upload: No POST 'mimetype' attribute."
        logging.error(message)
        return HttpResponse(message)

    try:
        url = request.POST["url"].strip()
    except KeyError:
        url = None

    try:
        team_name = request.POST['team_name'].strip()
    except KeyError:
        team_name = None

    message = UploadHandler.handle_upload(data, court, casenum,
                                          mimetype, url, team_name)
    return HttpResponse(message)
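# A hedged client-side sketch of calling this endpoint with the `requests`
# library.  The endpoint path "/recap/upload/" and the sample field values
# are assumptions for illustration; only the field names (data, court,
# casenum, mimetype, plus the optional url and team_name) come from the
# view above:
#
#     import requests
#
#     with open("gov.uscourts.cand.12345.1.0.pdf", "rb") as f:
#         r = requests.post("https://example.org/recap/upload/",
#                           files={"data": f},
#                           data={"court": "cand",
#                                 "casenum": "12345",
#                                 "mimetype": "application/pdf"})
#     print r.text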
sys.stderr.write("Usage: %s <filename_containing_cases_to_repair>\n " % sys.argv[0]) sys.stderr.write(" The contents of filename should have a single case per line, each identified by 'court casenum'\n " ) sys.exit(1) cases_to_repair = read_in_cases_to_repair(sys.argv[1]) for case in cases_to_repair: court = case[0] casenum = case[1] print "Repairing case %s.%s...." % (court, casenum) docket = DocumentManager.create_docket_from_local_documents(court, casenum) if docket: # this will merge our docket with existing one on IA UploadHandler.do_me_up(docket) else: print " Could not create docket from local documents for %s %s" % (court, casenum) # for each case, create docket fromlocal # call do_me_up(docket) # download ia_docket # # merge ia docket and local docket # # if there is a difference, schedule the docket for upload (created a pickled put?)