Exemple #1
0
def test_pdf_factory_bulk(caplog):
    """
    Test parsing a whole directory of dockets.

    The test fails if any docket raises while being parsed, and also if any docket parses
    but reports non-fatal (section-level) errors. Both kinds of problem are reported together.

    Run w/ `pytest tests/test_docket.py -k bulk -v -o log_cli=true` to show logging, even when test
    doesn't fail.
    """
    caplog.set_level(logging.INFO)
    docket_dir = "tests/data/dockets"
    files = os.listdir(docket_dir)
    total_dockets = len(files)
    successes = 0
    error_list = []
    for f in files:
        logging.info(f"Parsing {f}")
        try:
            _, errs = Docket.from_pdf(os.path.join(docket_dir, f),
                                      tempdir="tests/data/tmp")
        except Exception:
            # logging.exception keeps the traceback, so the log shows why the docket failed.
            logging.exception(f"    {f} failed to parse.")
            continue
        if errs:
            error_list.append((f, errs))
        successes += 1
        logging.info(f"    {f} parsed.")

    # Gather every problem before failing, so one kind of failure cannot hide the other.
    problems = []
    if error_list:
        problems.append(f"{len(error_list)} cases had non-fatal parsing errors.")
    if successes < total_dockets:
        problems.append(f"Only {successes}/{total_dockets} parsed.")
    for problem in problems:
        logging.error(problem)
    if problems:
        pytest.fail(" ".join(problems))
Exemple #2
0
def triage(directory, tempdir, output):
    """
    Read through a set of directories each containing records for a single person. Screen each
    person for obviously disqualifying elements in their record.

    Args:
        directory: Path whose immediate subdirectories each hold one person's `*_Summary.pdf` files.
        tempdir: Scratch directory passed through to the pdf parsers.
        output: Path of the csv file to write; one row per record that could be constructed.

    Returns:
        None. If `directory` does not exist, an error is logged and no file is written.
    """
    fieldnames = [
        "dir", "name", "cases", "felony_5_yrs", "2plus_m1s_15yrs",
        "4plus_m2s_20yrs", "any_f1_convictions", "any_disqualifiers",
    ]
    logging.basicConfig(level=logging.ERROR)
    if not os.path.exists(directory):
        # Logged at error level: the configured level is ERROR, so an info message
        # here would never be shown and the function would return silently.
        logging.error(f"{directory} does not exist.")
        return
    subdirs = os.listdir(directory)
    recs = []
    logging.info(f"Constructing {len(subdirs)} records.")
    for sd in subdirs:
        rec = CRecord()
        pdfs = glob.glob(os.path.join(directory, sd, "*_Summary.pdf"))
        try:
            for pdf in pdfs:
                try:
                    rec.add_summary(parse_pdf_summary(pdf, tempdir=tempdir))
                except Exception:
                    # The file could not be read as a summary; try it as a docket instead.
                    # If that also fails, the outer handler skips this person.
                    d, _ = Docket.from_pdf(pdf, tempdir=tempdir)
                    rec.add_docket(d)
            logging.info(f"Constructed a record for {rec.person.full_name()}, with {len(rec.cases)} cases.")
            recs.append((sd, rec))
        except Exception as e:
            logging.error(f"Error for {sd}: {str(e)}")
    logging.info(f"Now analyzing {len(recs)} records.")
    results = []
    for sd, rec in recs:
        res = {
            "dir": sd,
            "name": rec.person.full_name(),
            "cases": len(rec.cases),
            "felony_5_yrs": bool(any_felony_convictions_n_years(rec, 5)),
            "2plus_m1s_15yrs": bool(more_than_x_convictions_y_grade_z_years(rec, 2, "M1", 15)),
            "4plus_m2s_20yrs": bool(more_than_x_convictions_y_grade_z_years(rec, 4, "M2", 20)),
            "any_f1_convictions": not no_f1_convictions(rec),
        }
        res["any_disqualifiers"] = any([
            res["felony_5_yrs"],
            res["2plus_m1s_15yrs"],
            res["4plus_m2s_20yrs"],
            res["any_f1_convictions"],
        ])
        results.append(res)
    # newline='' lets the csv module control line endings; without it, platforms that
    # translate "\n" on write produce a blank line after every row.
    with open(output, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    logging.info("Complete.")
Exemple #3
0
def test_pdf_factory_one():
    """
    Parse a single docket pdf and check that the resulting Docket has a Case and a
    defendant Person with the expected attributes populated.
    """
    docket_dir = "tests/data/dockets"
    try:
        # os.listdir returns entries in arbitrary order; sort so every run uses the same file.
        filename = sorted(os.listdir(docket_dir))[0]
        dk, _ = Docket.from_pdf(os.path.join(docket_dir, filename),
                                tempdir="tests/data/tmp")
    except Exception as err:
        # Include the cause so a failure here can be diagnosed from the test report.
        pytest.fail(f"Cannot create Docket object: {err!r}")
    assert isinstance(dk._case, Case)
    assert isinstance(dk._defendant, Person)
    assert dk._case.affiant is not None
    assert dk._defendant.aliases is not None
    assert dk._case.arresting_agency is not None
Exemple #4
0
def parse(path, doctype, tempdir):
    """
    Parse a single pdf file and print what was found. Mostly a debugging/testing aid.

    For a "docket" the parse errors, the defendant and the case are printed (the latter
    two as json). For a "summary" only a not-implemented notice is printed. Any other
    doctype prints nothing but the closing "Done." line.
    """
    if doctype == "docket":
        docket, parse_errors = Docket.from_pdf(path, tempdir)
        print("---Errors---")
        print(parse_errors)
        print("---Person---")
        person_json = json.dumps(docket._defendant, default=to_serializable)
        print(person_json)
        print("---Case---")
        case_json = json.dumps(docket._case, default=to_serializable)
        print(case_json)
    elif doctype == "summary":
        print("Not implemented yet")
    print("Done.")
Exemple #5
0
    def put(self, request, *args, **kwargs):
        """
        Accept a CRecord and a set of SourceRecords. Incorporate the information that the
        SourceRecords contain into the CRecord.

        Responses:
            200 with the serialized, updated crecord under the "crecord" key.
            400 with the serializer's errors when the request data is invalid.
            500 with the error message when anything raises; the exception is also logged.

        Source records whose type is neither a summary pdf nor a docket pdf are logged
        and skipped.

        TODO this should replace FileUpload view.
        """
        try:
            serializer = IntegrateSourcesSerializer(data=request.data)
            if not serializer.is_valid():
                return Response({"errors": serializer.errors},
                                status=status.HTTP_400_BAD_REQUEST)
            crecord = CRecord.from_dict(serializer.validated_data["crecord"])
            for source_record_data in serializer.validated_data["source_records"]:
                source_record = SourceRecord.objects.get(
                    id=source_record_data["id"])
                if source_record.record_type == SourceRecord.RecTypes.SUMMARY_PDF:
                    summary = parse_pdf(source_record.file.path)
                    crecord.add_summary(
                        summary,
                        case_merge_strategy="overwrite_old",
                        override_person=True)
                elif source_record.record_type == SourceRecord.RecTypes.DOCKET_PDF:
                    # The parse errors returned alongside the docket are not used here.
                    docket, _ = Docket.from_pdf(source_record.file.path)
                    crecord.add_docket(docket)
                else:
                    logger.error(
                        f"Cannot parse a source record with type {source_record.record_type}"
                    )
            return Response({'crecord': CRecordSerializer(crecord).data},
                            status=status.HTTP_200_OK)
        except Exception as err:
            # Record the traceback server-side; the client only receives the message.
            logger.exception("Failed to integrate source records into the crecord.")
            return Response({"errors": [str(err)]},
                            status=status.HTTP_500_INTERNAL_SERVER_ERROR)
Exemple #6
0
def example_docket():
    """
    Return a Docket parsed from the first pdf (by sorted filename) in tests/data/dockets.

    Any parse errors reported alongside the docket are discarded.
    """
    docket_dir = os.path.join("tests", "data", "dockets")
    # os.listdir returns entries in arbitrary order; sort so the same docket is used every run.
    docket_name = sorted(os.listdir(docket_dir))[0]
    d, _ = Docket.from_pdf(os.path.join(docket_dir, docket_name), tempdir="tests/data/tmp")
    return d