def handle_adddocmeta(docid, court, casenum, de_seq_num, dm_id, docnum, subdocnum):
    """Create or refresh the Document row for one PACER document.

    Looks up the document by (court, casenum, docnum, subdocnum); updates
    its PACER metadata if found, otherwise builds a new row.  Save errors
    are logged rather than raised.
    """
    docid = ParsePacer.coerce_docid(docid)
    matches = Document.objects.filter(court=court, casenum=casenum,
                                      docnum=docnum, subdocnum=subdocnum)
    if matches:
        # Existing row: refresh the metadata fields in place.
        doc = matches[0]
        doc.de_seq_num = de_seq_num
        doc.dm_id = dm_id
        doc.docnum = docnum
        doc.docid = docid
    else:
        # No row yet: build a fresh one with everything we were given.
        doc = Document(docid=docid, court=court, casenum=casenum,
                       de_seq_num=de_seq_num, dm_id=dm_id,
                       docnum=docnum, subdocnum=subdocnum)
    try:
        doc.save()
    except IntegrityError:
        logging.error("handle_adddocmeta: could not save docid %s" % (docid))
def docid_from_url_name(url):
    """ Extract the docid from a PACER URL name.

    Raises ValueError when neither docid pattern matches.
    """
    # Run each regex once and reuse the match object; the original
    # searched doc_re twice and ca_doc_re up to three times per call.
    match = doc_re.search(url)
    if match:
        return ParsePacer.coerce_docid(match.group(1))
    match = ca_doc_re.search(url)
    if match:
        # Appellate URLs carry the docid in one of two capture groups.
        return match.group(1) or match.group(2)
    raise ValueError('docid_from_url_name')
def create_docket_pickles(path, court): #check if docket pickle directory exists and pickle generation has completed opinion_reports= glob.glob(os.path.join(path, "*.opinions.*")) try: os.mkdir(os.path.join(path, 'docket_pickles')) except OSError: #delete docket_pickles pass for report in opinion_reports: filebits = open(report).read() dockets = PP.parse_opinions(filebits, court) if dockets: print "Found %s dockets in %s " % (len(dockets), report) for docket in dockets: if len(docket.documents) != 1: raise "This docket has more than one document! docket text: " % docket doc = docket.documents.values()[0] filename = _get_docket_pickle_filename(court, doc['casenum'], doc['doc_num'], doc['attachment_num']) success, msg = IA.pickle_object(docket, filename, os.path.join(path, "docket_pickles")) if not success: print "Error pickling file %s: %s " % (filename, msg)
def handle_adddocmeta(docid, court, casenum, de_seq_num, dm_id, docnum, subdocnum):
    """Upsert metadata for a single PACER document into the local DB.

    The row is keyed by (court, casenum, docnum, subdocnum); IntegrityError
    on save is logged and swallowed.
    """
    docid = ParsePacer.coerce_docid(docid)
    key_fields = dict(court=court, casenum=casenum,
                      docnum=docnum, subdocnum=subdocnum)
    existing = Document.objects.filter(**key_fields)
    try:
        doc = existing[0]
    except IndexError:
        # Nothing stored yet -- create the row from scratch.
        doc = Document(docid=docid, de_seq_num=de_seq_num, dm_id=dm_id,
                       **key_fields)
    else:
        # Row found -- overwrite the mutable metadata.
        doc.de_seq_num = de_seq_num
        doc.dm_id = dm_id
        doc.docnum = docnum
        doc.docid = docid
    try:
        doc.save()
    except IntegrityError:
        logging.error("handle_adddocmeta: could not save docid %s" % (docid))
def get_opinions(self, court, start_date, end_date):
    """Download the opinions report for *court* over the given date range
    and return the dockets parsed out of it."""
    report_html = self.pacer_client.get_opinions_html(court, start_date,
                                                      end_date)
    parsed_dockets = PP.parse_opinions(report_html, court)
    logger.info(' Downloaded %d dockets for court %s between %s and %s',
                len(parsed_dockets), court, start_date, end_date)
    # if len(parsed_dockets) == 0:
    #     logger.debug(' 0 dockets downloaded. HTML response: %s', report_html)
    return parsed_dockets
def docid_from_url_name(url):
    """ Extract the docid from a PACER URL name.

    CA sometimes have:
      /cmecf/servlet/TransportRoom?servlet=ShowDoc&dls_id=00404800657&caseId=124912&dktType=dktPublic

    Raises ValueError when neither docid pattern matches.
    """
    # Search each regex once and keep the match; the original re-ran
    # doc_re twice and ca_doc_re up to three times on every call.
    match = doc_re.search(url)
    if match:
        return ParsePacer.coerce_docid(match.group(1))
    match = ca_doc_re.search(url)
    if match:
        # Appellate URLs put the docid in one of two alternate groups.
        return match.group(1) or match.group(2)
    raise ValueError('docid_from_url_name')
def handle_doc1(filebits, court, filename, team_name):
    """ Write HTML (doc1) file metadata into the database. """
    logging.debug('handle_doc1 %s %s', court, filename)
    docid = docid_from_url_name(filename)
    matches = Document.objects.filter(docid=docid)
    if not matches:
        logging.info("handle_doc1: unknown docid %s" % (docid))
        return "upload: doc1 ignored."
    main_doc = matches[0]
    casenum = main_doc.casenum
    main_docnum = main_doc.docnum
    # Sanity check: the upload's court must agree with the stored row.
    if court != main_doc.court:
        logging.error("handle_doc1: court mismatch (%s, %s) %s" %
                      (court, main_doc.court, docid))
        return "upload: doc1 metadata mismatch."
    # Appellate courts use a different doc1 page format.
    parse = (ParsePacer.parse_ca_doc1 if ParsePacer.is_appellate(court)
             else ParsePacer.parse_doc1)
    docket = parse(filebits, court, casenum, main_docnum)
    if docket:
        # Merge the docket with IA
        do_me_up(docket)
        # Update the local DB
        DocumentManager.update_local_db(docket, team_name=team_name)
    response = {"cases": _get_cases_dict(casenum, docket),
                "documents": _get_documents_dict(court, casenum),
                "message": "doc1 successfully parsed."}
    return simplejson.dumps(response)
def handle_cadkt(filebits, court, casenum, is_full=False):
    """Parse an appellate docket report upload; on success, merge it with
    the IA copy, update the local DB, and return a JSON summary."""
    docket = ParsePacer.parse_cadkt(filebits, court, casenum, is_full)
    if not docket:
        return "upload: could not parse docket."
    # Merge the docket with IA
    do_me_up(docket)
    # Update the local DB
    DocumentManager.update_local_db(docket)
    return simplejson.dumps({
        "cases": _get_cases_dict(casenum, docket),
        "documents": _get_documents_dict(court, casenum),
        "message": "DktRpt successfully parsed.",
    })
def handle_cadkt(filebits, court, casenum, team_name, is_full=False):
    """Parse an appellate docket report upload; on success, merge it with
    the IA copy, update the local DB (attributed to *team_name*), and
    return a JSON summary."""
    parsed = ParsePacer.parse_cadkt(filebits, court, casenum, is_full)
    if not parsed:
        return "upload: could not parse docket."
    # Merge the docket with IA
    do_me_up(parsed)
    # Update the local DB
    DocumentManager.update_local_db(parsed, team_name=team_name)
    summary = {
        "cases": _get_cases_dict(casenum, parsed),
        "documents": _get_documents_dict(court, casenum),
        "message": "DktRpt successfully parsed.",
    }
    return simplejson.dumps(summary)
def handle_histdocqry(filebits, court, casenum, team_name):
    """Parse a HistDocQry upload; on success, merge it with the IA copy,
    update the local DB (attributed to *team_name*), and return a JSON
    summary."""
    parsed = ParsePacer.parse_histdocqry(filebits, court, casenum)
    if not parsed:
        return "upload: could not parse docket."
    # Merge the docket with IA
    do_me_up(parsed)
    # Update the local DB
    DocumentManager.update_local_db(parsed, team_name=team_name)
    summary = {
        "cases": _get_cases_dict(casenum, parsed),
        "documents": _get_documents_dict(court, casenum),
        "message": "HistDocQry successfully parsed.",
    }
    return simplejson.dumps(summary)
def handle_dktrpt(filebits, court, casenum):
    """Parse a district-court DktRpt upload; on success, merge it with the
    IA copy, update the local DB, and return a JSON summary."""
    # Optionally dump the raw upload for debugging, gated per court.
    should_dump = (config['DUMP_DOCKETS'] and
                   re.search(config['DUMP_DOCKETS_COURT_REGEX'], court))
    if should_dump:
        logging.info("handle_dktrpt: Dumping docket %s.%s for debugging" %
                     (court, casenum))
        _dump_docket_for_debugging(filebits, court, casenum)
    parsed = ParsePacer.parse_dktrpt(filebits, court, casenum)
    if not parsed:
        return "upload: could not parse docket."
    # Merge the docket with IA
    do_me_up(parsed)
    # Update the local DB
    DocumentManager.update_local_db(parsed)
    summary = {
        "cases": _get_cases_dict(casenum, parsed),
        "documents": _get_documents_dict(court, casenum),
        "message": "DktRpt successfully parsed.",
    }
    return simplejson.dumps(summary)
def handle_dktrpt(filebits, court, casenum, team_name):
    """Parse a district-court DktRpt upload; on success, merge it with the
    IA copy, update the local DB (attributed to *team_name*), and return a
    JSON summary."""
    # Optionally dump the raw upload for debugging, gated per court.
    if config.DUMP_DOCKETS and re.search(config.DUMP_DOCKETS_COURT_REGEX,
                                         court):
        logging.info("handle_dktrpt: Dumping docket %s.%s for debugging" % (
            court, casenum))
        _dump_docket_for_debugging(filebits, court, casenum)
    parsed = ParsePacer.parse_dktrpt(filebits, court, casenum)
    if not parsed:
        return "upload: could not parse docket."
    # Merge the docket with IA
    do_me_up(parsed)
    # Update the local DB
    DocumentManager.update_local_db(parsed, team_name=team_name)
    return simplejson.dumps({
        "cases": _get_cases_dict(casenum, parsed),
        "documents": _get_documents_dict(court, casenum),
        "message": "DktRpt successfully parsed.",
    })
def process_case(casenum):
    """Merge one local case directory into its Internet Archive bucket.

    Parses the local docket, merges it with the existing IA docket (if
    any), uploads each local PDF whose hash differs from IA's copy, and
    pushes the merged docket XML back to IA.  Returns True on success and
    False when the case was skipped or queued for retry.

    NOTE(review): relies on module-level state -- court, dirarg,
    bucket_made, and the lock/retry/failed helpers -- presumably set up
    by the surrounding script; confirm before reusing elsewhere.
    """
    # Setup: Grab the lock.
    got_lock, nonce_or_message = lock(court, casenum)
    if got_lock:
        print "got the lock: %s" % (nonce_or_message)
        nonce = nonce_or_message
    else:
        print "could not get lock: %s" % (nonce_or_message)
        add_to_retry(casenum)
        return False
    casedir = "%s/%s" % (dirarg, casenum)
    # Step 1: Parse the docket.html file.
    try:
        docketpath = "%s/docket.html" % casedir
        docketfile = open(docketpath)
        docketbits = docketfile.read()
        docketfile.close()
    except IOError:
        reason = "could not open local docket"
        print "***Skipping %s.%s: %s... " % (court, casenum, reason),
        print_unlock_message(unlock(court, casenum, False))
        del_from_retry(casenum)
        add_to_failed(casenum, reason)
        return False
    else:
        docket = ParsePacer.parse_histdocqry(docketbits, court, casenum)
        if not docket:
            reason = "could not parse local docket"
            print "***Skipping %s.%s: %s... " % (court, casenum, reason),
            print_unlock_message(unlock(court, casenum, False))
            del_from_retry(casenum)
            add_to_failed(casenum, reason)
            return False
    # Step 1a: Try to fetch the existing IA docket.
    ia_docket = None
    ia_docket_orig_string = ""
    ia_casemeta_orig_hash = ""
    ia_docketstring, fetcherror = IADirect.get_docket_string(court, casenum)
    if ia_docketstring:
        # Got the existing docket-- parse it.
        ia_docket, parseerror = DocketXML.parse_xml_string(ia_docketstring)
        if not ia_docket:
            reason = "could not parse IA docket: %s" % (parseerror)
            print "***Skipping %s.%s: %s... " % (court, casenum, reason),
            print_unlock_message(unlock(court, casenum, False))
            del_from_retry(casenum)
            add_to_failed(casenum, reason)
            return False
        else:
            # Save the original docket string and a hash of its case
            # metadata so changes can be detected after the merge below.
            ia_docket_orig_string = ia_docketstring
            ia_casemeta_orig_hash = hash(pickle.dumps(ia_docket.casemeta))
    elif fetcherror is IADirect.FETCH_NO_FILE:
        # Bucket exists but no docket-- ok.
        pass
    elif fetcherror is IADirect.FETCH_NO_BUCKET:
        # Bucket doesn't exist-- either make_bucket failed or not yet ready.
        if casenum not in bucket_made:
            # If make_bucket failed, try make_bucket again.
            print " make bucket...",
            make_bucket(casenum)
    elif fetcherror is IADirect.FETCH_TIMEOUT:
        # Couldn't contact IA, skip.
        print "***Skipping %s.%s: IA is down... " % (court, casenum),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False
    elif not ia_docketstring:
        # Unknown fetch error, skip.
        print "***Skipping %s.%s: unknown docket fetch error: %s..." % \
            (court, casenum, fetcherror),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False
    # Step 1b: If necessary, merge the two dockets.
    if ia_docket:
        ia_docket.merge_docket(docket)
    else:
        ia_docket = docket
    # Sort the case directory's contents into index pages and PDFs.
    casedir_ls = os.listdir(casedir)
    index_ls = []
    pdf_ls = []
    for casedocname in casedir_ls:
        if casedocname.endswith("index.html"):
            index_ls.append(casedocname)
        elif casedocname.endswith(".pdf"):
            pdf_ls.append(casedocname)
    # Step 2: Parse each index file
    for indexname in index_ls:
        try:
            indexpath = "%s/%s" % (casedir, indexname)
            indexfile = open(indexpath)
            indexbits = indexfile.read()
            indexfile.close()
        except IOError:
            print "***Could not open file '%s'" % indexpath
            continue
        # NOTE(review): str.strip removes *characters*, not a suffix --
        # this works only because docnums are digits, none of which
        # occur in "-index.html".
        docnum = indexname.strip("-index.html")
        index_docket = ParsePacer.parse_doc1(indexbits, court, casenum,
                                             docnum)
        # Merge this docket into the IA docket
        ia_docket.merge_docket(index_docket)
    # Set initial flag for retrying this case.
    need_to_retry = 0
    # Step 3: Wait for the bucket to be ready (up to 20 tries, 5s apart).
    bucketready = False
    for checkcount in xrange(20):
        bucketready, code = IADirect.check_bucket_ready(court, casenum)
        if bucketready:
            break
        else:
            # Wait 5 seconds and try again.
            time.sleep(5)
    if not bucketready:
        print "***Skipping %s.%s: bucket is not ready... " \
            % (court, casenum),
        print_unlock_message(unlock(court, casenum, False))
        add_to_retry(casenum)
        return False
    # Step 4: Upload each pdf file.
    doccount = 0
    for pdfname in pdf_ls:
        doccount += 1
        print " uploading document %d/%d..." \
            % (doccount, len(pdf_ls)),
        try:
            pdfpath = "%s/%s" % (casedir, pdfname)
            pdffile = open(pdfpath)
            pdfbits = pdffile.read()
            pdffile.close()
        except IOError:
            print "***Could not open file '%s'" % pdfpath
            continue
        # NOTE(review): same str.strip character-set caveat as above.
        pdfname = pdfname.strip(".pdf")
        split = pdfname.split("-")
        try:
            docnum = unicode(int(split[0]))
        except ValueError:
            # Not an integer.
            print "***Docnum not an integer '%s'" % pdfpath
            continue
        try:
            # converting v3->v4 subdocnums
            subdocnum = unicode(int(split[1]) - 1)
        except IndexError:
            # No "-N" suffix: this is the main document.
            subdocnum = "0"
        doc_docket = DocketXML.make_docket_for_pdf(pdfbits, court, casenum,
                                                   docnum, subdocnum)
        doc_meta = doc_docket.get_document_metadict(docnum, subdocnum)
        # Only upload the PDF if the hash doesn't match the one in IA.
        ia_pdfhash = ia_docket.get_document_sha1(docnum, subdocnum)
        pdfhash = doc_docket.get_document_sha1(docnum, subdocnum)
        if ia_pdfhash != pdfhash:
            pdfstatus, pdferror = \
                IADirect.put_pdf(pdfbits, court, casenum,
                                 docnum, subdocnum, doc_meta)
            if not pdfstatus:
                # PUT failed, mark document as unavailable
                doc_docket.set_document_available(docnum, subdocnum, "0")
                print " fail: %s" % pdferror
                need_to_retry = True
                continue
            else:
                print "done."
                # Add this document's metadata into the ia_docket
                ia_docket.merge_docket(doc_docket)
        else:
            print "same."
    # Step 5: Push the docket to IA, if things have changed.
    print " docket upload...",
    docket_modified = 0
    ignore_nonce = 0
    ia_docket_merged_string = ia_docket.to_xml()
    if ia_docket_orig_string != ia_docket_merged_string:
        # Assign the docket the new nonce from the lock
        ia_docket.nonce = nonce
        # Only flag casemeta_diff when the case metadata itself changed,
        # not merely the document list.
        ia_casemeta_merged_hash = hash(pickle.dumps(ia_docket.casemeta))
        casemeta_diff = ia_casemeta_orig_hash != ia_casemeta_merged_hash
        putstatus, puterror = \
            IADirect.put_docket(ia_docket, court, casenum,
                                casemeta_diff=casemeta_diff)
        if putstatus:
            docket_modified = 1
            print "done."
        else:
            need_to_retry = 1
            print "fail: %s" % puterror
    else:
        ignore_nonce = 1
        print "same."
    # Teardown: release the lock, reporting whether the docket changed.
    if ignore_nonce:
        print_unlock_message(unlock(court, casenum, ignore_nonce=1))
    else:
        print_unlock_message(unlock(court, casenum,
                                    modified=docket_modified))
    if need_to_retry:
        add_to_retry(casenum)
        return False
    else:
        return True