def test_pdf_factory_bulk(caplog):
    """
    Test parsing a whole directory of dockets.

    Every file listed in `tests/data/dockets` is parsed with `Docket.from_pdf`.
    The test fails when any docket returns a non-empty error list, or when
    any docket raises while being parsed.

    N.B. The original author noted that a failure to parse a _section_ of a
    docket is not necessarily reported as a test failure. Whether such
    failures appear in the error list returned by `Docket.from_pdf` is not
    visible from this test -- TODO confirm.

    Run w/ `pytest tests/test_docket.py -k bulk -v -o log_cli=true` to show
    logging, even when test doesn't fail. Useful because sections can fail
    w/out causing the test to fail.
    """
    caplog.set_level(logging.INFO)
    docket_dir = "tests/data/dockets"
    files = os.listdir(docket_dir)
    total_dockets = len(files)
    successes = 0
    error_list = []
    for f in files:
        try:
            logging.info(f"Parsing {f}")
            _, errs = Docket.from_pdf(os.path.join(docket_dir, f), tempdir="tests/data/tmp")
            if len(errs) > 0:
                error_list.append((f, errs))
            # A docket with non-fatal errors still counts as parsed.
            successes += 1
            logging.info(f" {f} parsed.")
        except Exception:
            # logging.exception records the traceback, so the reason for the
            # failure is visible in the log instead of being discarded.
            logging.exception(f" {f} failed to parse.")
    if len(error_list) > 0:
        logging.error(f"{len(error_list)} cases had non-fatal parsing errors.")
        pytest.fail(f"{len(error_list)} cases had non-fatal parsing errors.")
    if successes < total_dockets:
        logging.error(f"Only {successes}/{total_dockets} parsed.")
        pytest.fail(f"Only {successes}/{total_dockets} parsed.")
def triage(directory, tempdir, output):
    """
    Read through a set of directories each containing records for a single person.

    Screen each person for obviously disqualifying elements in their record,
    and write the screening results to a csv file.

    Args:
        directory: path whose immediate subdirectories are each searched for
            `*_Summary.pdf` files belonging to one person.
        tempdir: passed through to the pdf parsers as `tempdir`.
        output: path of the csv file to write; one row per record that was
            constructed without error.

    Returns:
        None. If `directory` does not exist, nothing is written.
    """
    logging.basicConfig(level=logging.ERROR)
    if not os.path.exists(directory):
        logging.info(f"{directory} does not exist.")
        return
    subdirs = os.listdir(directory)
    recs = []
    logging.info(f"Constructing {len(subdirs)} records.")
    for sd in subdirs:
        rec = CRecord()
        pdfs = glob.glob(os.path.join(directory, sd, "*_Summary.pdf"))
        try:
            for pdf in pdfs:
                try:
                    rec.add_summary(parse_pdf_summary(pdf, tempdir=tempdir))
                except Exception:
                    # The file could not be handled as a summary; try it as a
                    # docket. A failure here propagates to the outer handler,
                    # which skips this subdirectory. `except Exception` (not a
                    # bare except) lets KeyboardInterrupt/SystemExit through.
                    d, _ = Docket.from_pdf(pdf, tempdir=tempdir)
                    rec.add_docket(d)
            logging.info(f"Constructed a record for {rec.person.full_name()}, with {len(rec.cases)} cases.")
            recs.append((sd, rec))
        except Exception as e:
            logging.error(f"Error for {sd}: {str(e)}")
    logging.info(f"Now analyzing {len(recs)} records.")
    results = []
    for sd, rec in recs:
        res = {
            "dir": sd,
            "name": rec.person.full_name(),
            "cases": len(rec.cases),
            "felony_5_yrs": bool(any_felony_convictions_n_years(rec, 5)),
            "2plus_m1s_15yrs": bool(more_than_x_convictions_y_grade_z_years(rec, 2, "M1", 15)),
            "4plus_m2s_20yrs": bool(more_than_x_convictions_y_grade_z_years(rec, 4, "M2", 20)),
            "any_f1_convictions": not no_f1_convictions(rec),
        }
        res["any_disqualifiers"] = any([
            res["felony_5_yrs"],
            res["2plus_m1s_15yrs"],
            res["4plus_m2s_20yrs"],
            res["any_f1_convictions"],
        ])
        results.append(res)
    fieldnames = [
        "dir",
        "name",
        "cases",
        "felony_5_yrs",
        "2plus_m1s_15yrs",
        "4plus_m2s_20yrs",
        "any_f1_convictions",
        "any_disqualifiers",
    ]
    # newline="" is what the csv module requires of files it writes to;
    # without it, platforms that translate "\n" emit blank rows.
    with open(output, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    logging.info("Complete.")
def test_pdf_factory_one():
    """
    Parse the first file that `os.listdir` reports in `tests/data/dockets`
    and check that the resulting Docket carries a Case and a Person with the
    expected attributes populated.
    """
    try:
        filename = os.listdir("tests/data/dockets")[0]
        dk, _ = Docket.from_pdf(os.path.join("tests/data/dockets", filename), tempdir="tests/data/tmp")
    except Exception as err:
        # Include the cause so the failure is diagnosable; `except Exception`
        # (not a bare except) avoids trapping KeyboardInterrupt/SystemExit.
        pytest.fail(f"Cannot create Docket object: {err!r}")
    assert isinstance(dk._case, Case)
    assert isinstance(dk._defendant, Person)
    assert dk._case.affiant is not None
    assert dk._defendant.aliases is not None
    assert dk._case.arresting_agency is not None
def parse(path, doctype, tempdir):
    """
    Parse a pdf file and print what was extracted. Probably only useful for testing.

    For `doctype == "docket"` the file is parsed with `Docket.from_pdf` and
    the returned errors, the defendant and the case are printed (the latter
    two as json). For `doctype == "summary"` a "not implemented" notice is
    printed. Any other doctype prints nothing extra. "Done." is always
    printed last.
    """
    if doctype == "docket":
        docket, parse_errors = Docket.from_pdf(path, tempdir)
        print("---Errors---")
        print(parse_errors)
        # Each heading is printed before its attribute is read and serialized,
        # so output order is the same even if serialization raises.
        for heading, attr_name in (("---Person---", "_defendant"), ("---Case---", "_case")):
            print(heading)
            print(json.dumps(getattr(docket, attr_name), default=to_serializable))
    elif doctype == "summary":
        print("Not implemented yet")
    print("Done.")
def put(self, request, *args, **kwargs):
    """
    Accept a CRecord and a set of SourceRecords.

    Incorporate the information that the SourceRecords contain into the CRecord.

    Responses:
        200 with `{"crecord": ...}` holding the serialized, updated CRecord.
        400 with `{"errors": ...}` holding the serializer's errors when the
            request data does not validate.
        500 with `{"errors": [message]}` when anything raises while
            processing.

    TODO this should replace FileUpload view.
    """
    try:
        serializer = IntegrateSourcesSerializer(data=request.data)
        if not serializer.is_valid():
            return Response({"errors": serializer.errors},
                            status=status.HTTP_400_BAD_REQUEST)
        crecord = CRecord.from_dict(serializer.validated_data["crecord"])
        for source_record_data in serializer.validated_data["source_records"]:
            source_record = SourceRecord.objects.get(id=source_record_data["id"])
            if source_record.record_type == SourceRecord.RecTypes.SUMMARY_PDF:
                summary = parse_pdf(source_record.file.path)
                crecord.add_summary(
                    summary,
                    case_merge_strategy="overwrite_old",
                    override_person=True)
            elif source_record.record_type == SourceRecord.RecTypes.DOCKET_PDF:
                # The error list returned alongside the docket is not used here.
                docket, _ = Docket.from_pdf(source_record.file.path)
                crecord.add_docket(docket)
            else:
                # Unknown record types are logged and skipped; the request
                # still succeeds with whatever the other records contributed.
                logger.error(
                    f"Cannot parse a source record with type {source_record.record_type}"
                )
        return Response({'crecord': CRecordSerializer(crecord).data},
                        status=status.HTTP_200_OK)
    except Exception as err:
        # Record the traceback server-side; otherwise the only trace of the
        # failure is the message sent back to the client.
        logger.exception("Failed to integrate source records into the CRecord.")
        return Response({"errors": [str(err)]},
                        status=status.HTTP_500_INTERNAL_SERVER_ERROR)
def example_docket():
    """
    Build a Docket from the first file that `os.listdir` reports in
    `tests/data/dockets`, using `tests/data/tmp` as the temp directory.

    Only the Docket is returned; the error list that `Docket.from_pdf`
    returns alongside it is dropped.
    """
    first_file = os.listdir("tests/data/dockets")[0]
    pdf_path = os.path.join("tests", "data", "dockets", first_file)
    parsed = Docket.from_pdf(pdf_path, tempdir="tests/data/tmp")
    docket, _ = parsed
    return docket
def test_init():
    """Check that a Docket can be constructed with no arguments."""
    try:
        Docket()
    except Exception as err:
        # Include the cause so the failure is diagnosable; `except Exception`
        # (not a bare except) avoids trapping KeyboardInterrupt/SystemExit.
        pytest.fail(f"Cannot create Docket object: {err!r}")