def dir(input_dir, output_json, output_html, email, log_level):
    """
    Analyze a record given a directory of dockets relating to a single person,
    and write a plain-English explanation of the analysis.
    """
    if not os.path.exists(input_dir):
        raise ValueError(f"Directory {input_dir} doesn't exist.")
    logger.setLevel(log_level)
    # Build a SourceRecord from each docket sheet pdf in the directory.
    docket_files = [f for f in os.listdir(input_dir) if "docket_sheet" in f]
    source_records = []
    for df in docket_files:
        parser = pick_pdf_parser(df)
        if parser is None:
            continue
        source_records.append(
            SourceRecord(get_text_from_pdf(os.path.join(input_dir, df)), parser)
        )
    # Combine the source records into a single criminal record.
    crecord = CRecord()
    for source_rec in source_records:
        crecord.add_sourcerecord(source_rec, override_person=True)
    # Apply each expungement/sealing rule to the record.
    analysis = (
        Analysis(crecord)
        .rule(rd.expunge_deceased)
        .rule(rd.expunge_over_70)
        .rule(rd.expunge_nonconvictions)
        .rule(rd.expunge_summary_convictions)
        .rule(rd.seal_convictions)
    )
    # Write and/or email the results.
    communicate_results(source_records, analysis, output_json, output_html, email)
    click.echo("Finished.")
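# Usage sketch (not part of the library): a direct call to `dir`, which is
# presumably also wired up as a CLI command elsewhere. The paths and
# filenames below are hypothetical placeholders.
def _demo_dir_command():
    dir(
        input_dir="tests/data/dockets",  # directory holding *docket_sheet* pdfs
        output_json="analysis.json",
        output_html="analysis.html",
        email=None,  # skip emailing the results
        log_level="INFO",
    )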
def parse_pdf(pdf: Union[BinaryIO, str]) -> Tuple[Person, List[Case], List[str]]:
    """
    PEGParser-based parser method that can take a CP or MD source and return
    a Summary used to build a CRecord.
    """
    text = get_text_from_pdf(pdf)
    inputs_dictionary = get_processors(text)
    summary_page_grammar = inputs_dictionary["summary_page_grammar"]
    errors = []
    try:
        parsed_pages = summary_page_grammar.parse(text)
    except Exception as e:
        # Without a successful parse there is nothing to build from, so
        # return early instead of referencing an undefined `parsed_pages`.
        errors.append(f"Grammar cannot parse summary: {str(e)}")
        return None, None, errors
    parse_summary = inputs_dictionary["parse_summary"]
    pages_xml_tree, summary_body_xml_tree = parse_summary(parsed_pages)
    # Assemble a single Summary xml element from the parsed pieces.
    summary_xml = etree.Element("Summary")
    summary_xml.append(pages_xml_tree.xpath("//header")[0])
    summary_xml.append(pages_xml_tree.xpath("//caption")[0])
    summary_xml.append(summary_body_xml_tree)
    defendant = get_defendant(summary_xml)
    get_cases = inputs_dictionary["get_cases"]
    cases = get_cases(summary_xml)
    return defendant, cases, errors
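# Usage sketch (an assumption, not from the library): parse_pdf accepts a
# path string or a binary reader. The path below is hypothetical, and the
# attribute access assumes Case exposes a docket_number attribute.
def _demo_parse_pdf():
    defendant, cases, errors = parse_pdf("tests/data/summary.pdf")
    if errors:
        # The grammar could not parse the summary; defendant and cases
        # are None in that case.
        print("\n".join(errors))
    else:
        print(defendant, [case.docket_number for case in cases])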
def re_parse_pdf(path: str) -> Tuple[Person, List[Case], List[str]]:
    """
    Parse a pdf of a docket using regex parsers.

    This function doesn't need to know which court the docket relates to;
    the text parser will figure that out.
    """
    # pdf to raw text
    txt = get_text_from_pdf(path)
    if txt == "":
        return None, None, ["could not extract text from pdf"]
    return re_parse_pdf_text(txt)
def parse_mdj_pdf(path: str) -> Tuple[Person, List[Case], List[str]]:
    """
    Parse an MDJ docket, given the path to the docket pdf.

    This function uses the original Expungement Generator's technique:
    regexes and nested loops. See
    https://github.com/NateV/Expungement-Generator/blob/master/Expungement-Generator/Record.php:64
    """
    # pdf to raw text
    txt = get_text_from_pdf(path)
    if txt == "":
        return None, None, ["could not extract text from pdf"]
    return parse_mdj_pdf_text(txt)
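# The regex parsers above share one contract: (Person, [Case, ...], errors)
# on success, and (None, None, [message]) when no text could be extracted.
# A sketch of dispatching on that contract; the path is hypothetical.
def _demo_re_parse_pdf():
    person, cases, errors = re_parse_pdf("tests/data/docket.pdf")
    if person is None:
        # Text extraction failed; errors explains why.
        raise RuntimeError("; ".join(errors))
    return person, cases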
def parse_cp_pdf(
    pdf: Union[BinaryIO, str]
) -> Tuple[Person, List[Case], List[str]]:
    """
    Regex-based parser for Common Pleas dockets, covering both MC and CP
    docket numbers.

    This parser is essentially a Python re-implementation of the original
    Expungement Generator's parsing methods. The only differences are that
    this function only handles CP/MC dockets (not MDJ dockets); it only
    parses, so it doesn't include any of Arrest.php's logic related to
    generating petitions; it abstracts its components into smaller functions;
    and it reports errors that came up during the parsing process.

    This function takes the pdf file as a binary reader or a path, and
    extracts the text. It sends the text to a more specialized function that
    does the actual parsing.
    """
    # pdf to raw text
    txt = get_text_from_pdf(pdf)
    if txt == "":
        return None, None, ["could not extract text from pdf"]
    return parse_cp_pdf_text(txt)
def parse_cp_pdf(
    pdf: Union[BinaryIO, str], tempdir=None
) -> Tuple[Person, List[Case], List[str], etree.Element]:
    """
    Parse a pdf of a Common Pleas criminal record docket.

    The 'see' references are to the DocketParse library, which also parses
    pdf dockets.

    Args:
        pdf: a binary reader or a string path to a pdf file.
        tempdir: The pdf must be written to txt with pdftotext, so we need a
            temporary directory for it.

    Returns:
        The Person to whom the docket relates, the Cases the docket
        describes, a list of parsing errors, and the parsed xml element.
    """
    # a list of strings describing any errors encountered while parsing
    errors = []
    # pdf to raw text
    txt = get_text_from_pdf(pdf)
    if txt == "":
        return None, None, ["could not extract text from pdf"], None
    return parse_cp_pdf_text(txt, errors)
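# Usage sketch (an assumption) for the four-tuple parse_cp_pdf variant above,
# which also returns the docket's parsed lxml element. The path is
# hypothetical.
def _demo_parse_cp_pdf():
    person, cases, errors, docket_xml = parse_cp_pdf("tests/data/cp_docket.pdf")
    if docket_xml is not None:
        # Pretty-print the parsed xml for debugging.
        print(etree.tostring(docket_xml, pretty_print=True).decode())
    return person, cases, errors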
def name(
    first_name,
    last_name,
    dob,
    date_format,
    output_json,
    output_dir,
    output_html,
    email,
    log_level,
):
    """
    Screen a person's public criminal record for charges that can be
    expunged or sealed.
    """
    if output_dir is not None and not os.path.exists(output_dir):
        raise ValueError(f"Directory {output_dir} does not exist.")
    logger.setLevel(log_level)
    click.echo(f"Screening {last_name}, {first_name}")
    starttime = datetime.now()
    dob = datetime.strptime(dob, date_format).date()
    # Search UJS for the person's name to collect source records.
    search_results = search_by_name(first_name, last_name, dob)
    search_results = search_results["MDJ"] + search_results["CP"]
    logger.info(f" Found {len(search_results)} cases in the Portal.")
    # Download the source records and extract text from them.
    with tempfile.TemporaryDirectory() as td:
        for case in search_results:
            for source_type in ["docket_sheet", "summary"]:
                try:
                    resp = requests.get(
                        case[f"{source_type}_url"],
                        headers={"User-Agent": "CleanSlateScreener"},
                    )
                except requests.exceptions.MissingSchema:
                    # The case search result is missing a url. This happens
                    # when a docket doesn't have a summary, and is fairly
                    # common.
                    case[f"{source_type}_text"] = ""
                    continue
                if resp.status_code != 200:
                    case[f"{source_type}_text"] = ""
                    continue
                filename = os.path.join(
                    td, f"{case['docket_number']}_{source_type}")
                with open(filename, "wb") as fp:
                    fp.write(resp.content)
                case[f"{source_type}_text"] = get_text_from_pdf(filename)
        if output_dir is not None:
            for doc in os.listdir(td):
                shutil.copy(os.path.join(td, doc), os.path.join(output_dir, doc))
    logger.info(" Collected texts from cases.")
    logger.info(f" -time so far: {(datetime.now() - starttime).seconds}")
    # At this point, search_results is a list of search_result dicts, where
    # each dict also has keys containing the extracted text of the docket
    # and the summary:
    # [
    #   {
    #     "caption": "", "docket_number": "",
    #     "docket_sheet_text": "lots of \ntext",
    #     "summary_text": "lots of text", and other keys.
    #   }
    # ]
    # Next, read through each summary and find any docket numbers mentioned.
    # If any dockets are _not_ already among the source records, collect them
    # from UJS, download them, and extract their text.
    docket_nums = {case["docket_number"] for case in search_results}
    summary_docket_numbers = set()
    for case in search_results:
        summary_text = case["summary_text"]
        other_docket_nums_in_summary = set(
            re.findall(r"(?:MC|CP)\-\d{2}\-\D{2}\-\d*\-\d{4}", summary_text)
            + re.findall(r"MJ-\d{5}-\D{2}-\d+-\d{4}", summary_text)
        )
        summary_docket_numbers.update(other_docket_nums_in_summary)
    new_docket_numbers = summary_docket_numbers.difference(docket_nums)
    logger.info(
        f" Searched summaries and found {len(new_docket_numbers)} cases "
        "not found through the portal."
    )
    logger.info(f" -time so far: {(datetime.now() - starttime).seconds}")
    for dn in new_docket_numbers:
        cases = search_by_docket(dn)
        if len(cases) > 0:
            case = cases[0]
        else:
            logger.error(f"Did not find case for docket {dn}")
            continue
        search_results.append(case)
        with tempfile.TemporaryDirectory() as td:
            for source_type in ["docket_sheet"]:
                resp = requests.get(
                    case[f"{source_type}_url"],
                    headers={"User-Agent": "CleanSlateScreener"},
                )
                if resp.status_code != 200:
                    continue
                filename = os.path.join(td, case["docket_number"])
                with open(filename, "wb") as fp:
                    fp.write(resp.content)
                case[f"{source_type}_text"] = get_text_from_pdf(filename)
            if output_dir is not None:
                for doc in os.listdir(td):
                    shutil.copy(os.path.join(td, doc),
                                os.path.join(output_dir, doc))
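# The two docket-number regexes in `name` match Common Pleas/Municipal Court
# numbers and magisterial district numbers respectively. A quick check with
# made-up (but correctly formatted) docket numbers:
def _demo_docket_number_patterns():
    sample = "See CP-51-CR-0000001-2015 and MJ-05101-CR-0000001-2015."
    print(re.findall(r"(?:MC|CP)\-\d{2}\-\D{2}\-\d*\-\d{4}", sample))
    # ['CP-51-CR-0000001-2015']
    print(re.findall(r"MJ-\d{5}-\D{2}-\d+-\d{4}", sample))
    # ['MJ-05101-CR-0000001-2015']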