def dir(input_dir, output_json, output_html, email, log_level):
    """
    Analyze a record given a directory of dockets relating to a single person,
    and write a plain-English explanation of the analysis.
    """
    if not os.path.exists(input_dir):
        raise ValueError(f"Directory {input_dir} doesn't exist.")
    logger.setLevel(log_level)
    # Build a SourceRecord from each docket sheet pdf in the directory.
    docket_files = [f for f in os.listdir(input_dir) if "docket_sheet" in f]
    source_records = []
    for df in docket_files:
        parser = pick_pdf_parser(df)
        if parser is None:
            continue
        source_records.append(
            SourceRecord(get_text_from_pdf(os.path.join(input_dir, df)), parser)
        )
    # Combine the source records into a single criminal record.
    crecord = CRecord()
    for source_rec in source_records:
        crecord.add_sourcerecord(source_rec, override_person=True)
    # Apply each expungement/sealing rule to the record.
    analysis = (
        Analysis(crecord)
        .rule(rd.expunge_deceased)
        .rule(rd.expunge_over_70)
        .rule(rd.expunge_nonconvictions)
        .rule(rd.expunge_summary_convictions)
        .rule(rd.seal_convictions)
    )
    # Write and/or email the results.
    communicate_results(source_records, analysis, output_json, output_html, email)
    click.echo("Finished.")
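# Usage sketch (not part of the library): a direct call to `dir`, which is
# presumably also wired up as a CLI command elsewhere. The paths and
# filenames below are hypothetical placeholders.
def _demo_dir_command():
    dir(
        input_dir="tests/data/dockets",  # directory holding *docket_sheet* pdfs
        output_json="analysis.json",
        output_html="analysis.html",
        email=None,  # skip emailing the results
        log_level="INFO",
    )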
def parse_pdf(pdf: Union[BinaryIO, str]) -> Tuple[Person, List[Case], List[str]]:
    """
    PEGParser-based parser method that can take a CP or MD source and return
    a Summary used to build a CRecord.
    """
    text = get_text_from_pdf(pdf)
    inputs_dictionary = get_processors(text)
    summary_page_grammar = inputs_dictionary["summary_page_grammar"]
    errors = []
    try:
        parsed_pages = summary_page_grammar.parse(text)
    except Exception as e:
        # Without a successful parse there is nothing to build from, so
        # return early instead of referencing an undefined `parsed_pages`.
        errors.append(f"Grammar cannot parse summary: {str(e)}")
        return None, None, errors
    parse_summary = inputs_dictionary["parse_summary"]
    pages_xml_tree, summary_body_xml_tree = parse_summary(parsed_pages)
    # Assemble a single Summary xml element from the parsed pieces.
    summary_xml = etree.Element("Summary")
    summary_xml.append(pages_xml_tree.xpath("//header")[0])
    summary_xml.append(pages_xml_tree.xpath("//caption")[0])
    summary_xml.append(summary_body_xml_tree)
    defendant = get_defendant(summary_xml)
    get_cases = inputs_dictionary["get_cases"]
    cases = get_cases(summary_xml)
    return defendant, cases, errors
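# Usage sketch (an assumption, not from the library): parse_pdf accepts a
# path string or a binary reader. The path below is hypothetical, and the
# attribute access assumes Case exposes a docket_number attribute.
def _demo_parse_pdf():
    defendant, cases, errors = parse_pdf("tests/data/summary.pdf")
    if errors:
        # The grammar could not parse the summary; defendant and cases
        # are None in that case.
        print("\n".join(errors))
    else:
        print(defendant, [case.docket_number for case in cases])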
def re_parse_pdf(path: str) -> Tuple[Person, List[Case], List[str]]:
    """
    Parse a pdf of a docket using regex parsers.

    This function doesn't need to know which court the docket relates to;
    the text parser will figure that out.
    """
    # pdf to raw text
    txt = get_text_from_pdf(path)
    if txt == "":
        return None, None, ["could not extract text from pdf"]
    return re_parse_pdf_text(txt)
def parse_mdj_pdf(path: str) -> Tuple[Person, List[Case], List[str]]:
    """
    Parse an MDJ docket, given the path to the docket pdf.

    This function uses the original Expungement Generator's technique:
    regexes and nested loops. See
    https://github.com/NateV/Expungement-Generator/blob/master/Expungement-Generator/Record.php:64
    """
    # pdf to raw text
    txt = get_text_from_pdf(path)
    if txt == "":
        return None, None, ["could not extract text from pdf"]
    return parse_mdj_pdf_text(txt)
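# The regex parsers above share one contract: (Person, [Case, ...], errors)
# on success, and (None, None, [message]) when no text could be extracted.
# A sketch of dispatching on that contract; the path is hypothetical.
def _demo_re_parse_pdf():
    person, cases, errors = re_parse_pdf("tests/data/docket.pdf")
    if person is None:
        # Text extraction failed; errors explains why.
        raise RuntimeError("; ".join(errors))
    return person, cases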
def parse_cp_pdf(
    pdf: Union[BinaryIO, str]
) -> Tuple[Person, List[Case], List[str]]:
    """
    Regex-based parser for Common Pleas dockets, covering both MC and CP
    docket numbers.

    This parser is essentially a Python re-implementation of the original
    Expungement Generator's parsing methods. The only differences are that
    this function only handles CP/MC dockets (not MDJ dockets); it only
    parses, so it doesn't include any of Arrest.php's logic related to
    generating petitions; it abstracts its components into smaller functions;
    and it reports errors that came up during the parsing process.

    This function takes the pdf file as a binary reader or a path, and
    extracts the text. It sends the text to a more specialized function that
    does the actual parsing.
    """
    # pdf to raw text
    txt = get_text_from_pdf(pdf)
    if txt == "":
        return None, None, ["could not extract text from pdf"]
    return parse_cp_pdf_text(txt)
def parse_cp_pdf(
    pdf: Union[BinaryIO, str], tempdir=None
) -> Tuple[Person, List[Case], List[str], etree.Element]:
    """
    Parse a pdf of a Common Pleas criminal record docket.

    The 'see' references are to the DocketParse library, which also parses
    pdf dockets.

    Args:
        pdf: a binary reader or a string path to a pdf file.
        tempdir: The pdf must be written to txt with pdftotext, so we need a
            temporary directory for it.

    Returns:
        The Person to whom the docket relates, the Cases the docket
        describes, a list of parsing errors, and the parsed xml element.
    """
    # a list of strings describing any errors encountered while parsing
    errors = []
    # pdf to raw text
    txt = get_text_from_pdf(pdf)
    if txt == "":
        return None, None, ["could not extract text from pdf"], None
    return parse_cp_pdf_text(txt, errors)
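# Usage sketch (an assumption) for the four-tuple parse_cp_pdf variant above,
# which also returns the docket's parsed lxml element. The path is
# hypothetical.
def _demo_parse_cp_pdf():
    person, cases, errors, docket_xml = parse_cp_pdf("tests/data/cp_docket.pdf")
    if docket_xml is not None:
        # Pretty-print the parsed xml for debugging.
        print(etree.tostring(docket_xml, pretty_print=True).decode())
    return person, cases, errors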
def name(
    first_name,
    last_name,
    dob,
    date_format,
    output_json,
    output_dir,
    output_html,
    email,
    log_level,
):
    """
    Screen a person's public criminal record for charges that can be
    expunged or sealed.
    """
    if output_dir is not None and not os.path.exists(output_dir):
        raise ValueError(f"Directory {output_dir} does not exist.")
    logger.setLevel(log_level)
    click.echo(f"Screening {last_name}, {first_name}")
    starttime = datetime.now()
    dob = datetime.strptime(dob, date_format).date()
    # Search UJS for the person's name to collect source records.
    search_results = search_by_name(first_name, last_name, dob)
    search_results = search_results["MDJ"] + search_results["CP"]
    logger.info(f" Found {len(search_results)} cases in the Portal.")
    # Download the source records and extract text from them.
    with tempfile.TemporaryDirectory() as td:
        for case in search_results:
            for source_type in ["docket_sheet", "summary"]:
                try:
                    resp = requests.get(
                        case[f"{source_type}_url"],
                        headers={"User-Agent": "CleanSlateScreener"},
                    )
                except requests.exceptions.MissingSchema:
                    # The case search result is missing a url. This happens
                    # when a docket doesn't have a summary, and is fairly
                    # common.
                    case[f"{source_type}_text"] = ""
                    continue
                if resp.status_code != 200:
                    case[f"{source_type}_text"] = ""
                    continue
                filename = os.path.join(
                    td, f"{case['docket_number']}_{source_type}")
                with open(filename, "wb") as fp:
                    fp.write(resp.content)
                case[f"{source_type}_text"] = get_text_from_pdf(filename)
        if output_dir is not None:
            for doc in os.listdir(td):
                shutil.copy(os.path.join(td, doc), os.path.join(output_dir, doc))
    logger.info(" Collected texts from cases.")
    logger.info(f" -time so far: {(datetime.now() - starttime).seconds}")
    # At this point, search_results is a list of search_result dicts, where
    # each dict also has keys containing the extracted text of the docket
    # and the summary:
    # [
    #   {
    #     "caption": "", "docket_number": "",
    #     "docket_sheet_text": "lots of \ntext",
    #     "summary_text": "lots of text", and other keys.
    #   }
    # ]
    # Next, read through each summary and find any docket numbers mentioned.
    # If any dockets are _not_ already among the source records, collect them
    # from UJS, download them, and extract their text.
    docket_nums = {case["docket_number"] for case in search_results}
    summary_docket_numbers = set()
    for case in search_results:
        summary_text = case["summary_text"]
        other_docket_nums_in_summary = set(
            re.findall(r"(?:MC|CP)\-\d{2}\-\D{2}\-\d*\-\d{4}", summary_text)
            + re.findall(r"MJ-\d{5}-\D{2}-\d+-\d{4}", summary_text)
        )
        summary_docket_numbers.update(other_docket_nums_in_summary)
    new_docket_numbers = summary_docket_numbers.difference(docket_nums)
    logger.info(
        f" Searched summaries and found {len(new_docket_numbers)} cases "
        "not found through the portal."
    )
    logger.info(f" -time so far: {(datetime.now() - starttime).seconds}")
    for dn in new_docket_numbers:
        cases = search_by_docket(dn)
        if len(cases) > 0:
            case = cases[0]
        else:
            logger.error(f"Did not find case for docket {dn}")
            continue
        search_results.append(case)
        with tempfile.TemporaryDirectory() as td:
            for source_type in ["docket_sheet"]:
                resp = requests.get(
                    case[f"{source_type}_url"],
                    headers={"User-Agent": "CleanSlateScreener"},
                )
                if resp.status_code != 200:
                    continue
                filename = os.path.join(td, case["docket_number"])
                with open(filename, "wb") as fp:
                    fp.write(resp.content)
                case[f"{source_type}_text"] = get_text_from_pdf(filename)
            if output_dir is not None:
                for doc in os.listdir(td):
                    shutil.copy(os.path.join(td, doc),
                                os.path.join(output_dir, doc))
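# The two docket-number regexes in `name` match Common Pleas/Municipal Court
# numbers and magisterial district numbers respectively. A quick check with
# made-up (but correctly formatted) docket numbers:
def _demo_docket_number_patterns():
    sample = "See CP-51-CR-0000001-2015 and MJ-05101-CR-0000001-2015."
    print(re.findall(r"(?:MC|CP)\-\d{2}\-\D{2}\-\d*\-\d{4}", sample))
    # ['CP-51-CR-0000001-2015']
    print(re.findall(r"MJ-\d{5}-\D{2}-\d+-\d{4}", sample))
    # ['MJ-05101-CR-0000001-2015']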