Exemple #1
1
def test_ml_completion():
    """Smoke test: the ML-based parse completes and yields output."""
    result = pdftotree.parse(
        "tests/input/paleo.pdf",
        model_type="ml",
        model_path="tests/input/paleo_model.pkl",
    )
    assert result is not None
Exemple #2
0
def test_LTChar_under_LTFigure(tmp_path):
    """Test on a PDF where LTChar(s) are children of LTFigure."""
    html_path = os.path.join(tmp_path, "paleo.html")
    pdftotree.parse("tests/input/CentralSemiconductorCorp_2N4013.pdf", html_path)
    with open(html_path) as f:
        soup = BeautifulSoup(f, "lxml")

    first_line: Tag = soup.find(class_="ocrx_line")
    words = first_line.find_all(class_="ocrx_word")
    assert [w.text for w in words] == ["Small", "Signal", "Transistors"]

    # The table on the first page is expected to have 18 columns.
    page = soup.find(class_="ocr_page")
    table = page.find(class_="ocr_table")
    first_row_cells = table.find("tr").find_all("td")
    assert len(first_row_cells) == 18
    assert get_bbox(table) is not None

    # A cell holding one or more ocrx_word must also carry a bbox.
    cell = table.find(class_="ocrx_word").parent.parent
    assert get_bbox(cell) is not None

    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
        checks = [ln.decode("utf-8").startswith("ok") for ln in proc.stderr]
        assert all(checks)
Exemple #3
0
def test_output_should_conform_to_hocr(tmp_path):
    """Run hocr-check over an exported file and require every check to pass."""
    html_path = os.path.join(tmp_path, "md.html")
    pdftotree.parse("tests/input/md.pdf", html_path)
    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
        results = [ln.decode("utf-8").startswith("ok") for ln in proc.stderr]
        assert all(results)
Exemple #4
0
def test_no_out_of_order(caplog):
    """Parsing either sample must not log an "Out of order" warning."""
    for pdf in ("tests/input/md.pdf", "tests/input/paleo.pdf"):
        pdftotree.parse(pdf)
        assert "Out of order" not in caplog.text
Exemple #5
0
def converter(ifile: str, ofile: str):
    """Convert a PDF to HTML using pdftotree.

    :params ifile: input pdf file
    :params ofile: output html file
    """
    pdftotree.parse(
        ifile,
        html_path=ofile,
        model_type=None,
        model_path=None,
        favor_figures=True,
        visualize=False,
    )
Exemple #6
0
def test_looks_scanned():
    """Test on a PDF that looks like a scanned one but not.

    CaseStudy_ACS.pdf contains a transparent image overlaying the entire page.
    This overlaying transparent image fools TreeExtractor into thinking it is scanned.
    """
    pdf = "tests/input/CaseStudy_ACS.pdf"
    # Favoring figures: the lone hit is the "ocr-capabilities" meta entry.
    html = pdftotree.parse(pdf, favor_figures="True")
    assert html.count("ocrx_word") == 1
    # Not favoring figures: the actual words are extracted.
    html = pdftotree.parse(pdf, favor_figures="False")
    assert html.count("ocrx_word") >= 1000
    def convert_pdf_to_html(self, document_path):
        """Convert the PDF at *document_path* to an HTML file under MEDIA_ROOT.

        Returns the path of the written HTML file.
        """
        name, _extension = self.get_document_name_and_extension(document_path)
        target = os.path.join(MEDIA_ROOT, name + '.html')

        pdftotree.parse(
            document_path,
            html_path=target,
            model_type=None,
            model_path=None,
            visualize=False,
        )
        return target
Exemple #8
0
def test_tabula_warning_suppressed(caplog):
    """Test if tabula warnings are suppressed."""
    # By default the noisy pdfbox logging is filtered out.
    pdftotree.parse("tests/input/112823.pdf")
    assert "org.apache.pdfbox" not in caplog.text

    # At DEBUG level the warnings are let through.
    logging.getLogger("pdftotree").setLevel(logging.DEBUG)
    pdftotree.parse("tests/input/112823.pdf")
    assert "org.apache.pdfbox" in caplog.text
Exemple #9
0
def main():
    """Convert every PDF found under ./Split into HTML under ./HTML/."""
    pdf_paths = [
        "./Split/" + os.fsdecode(entry)
        for entry in os.listdir("./Split")
        if os.fsdecode(entry).endswith(".pdf")
    ]

    for path in pdf_paths:
        pdftotree.parse(path, html_path="./HTML/")
        print(path + " Converted to HTML \n")
Exemple #10
0
def test_text_is_escaped():
    """Test if text is properly escaped."""

    def nth_word_markup(html_text, index):
        # str() keeps the escaped markup; .text would give unescaped text.
        tags = BeautifulSoup(html_text, "lxml").find_all(class_="ocrx_word")
        return re.search(r">(.+?)<", str(tags[index]))[1]

    assert nth_word_markup(pdftotree.parse("tests/input/md.pdf"), 66) == "'bar';."
    assert nth_word_markup(pdftotree.parse("tests/input/112823.pdf"), 152) == "&amp;"
Exemple #11
0
def test_figures():
    """Figures are exported as inline <img> elements with data URIs."""
    soup = BeautifulSoup(pdftotree.parse("tests/input/md.pdf"), "lxml")
    assert len(soup.find_all("img")) == 1

    soup = BeautifulSoup(pdftotree.parse("tests/input/CaseStudy_ACS.pdf"), "lxml")
    imgs = soup.find_all("img")
    # 3 jpg, 2 bmp, 5 total images
    assert len(imgs) == 5
    jpegs = [i for i in imgs if i["src"].startswith("data:image/jpeg")]
    bmps = [i for i in imgs if i["src"].startswith("data:image/bmp")]
    assert len(jpegs) == 3
    assert len(bmps) == 2
Exemple #12
0
def visual_parsing():
    """Run the visual-model parse and return pdftotree's output.

    NOTE(review): relies on module-level ``input_pdf_file`` and
    ``output_html_file`` being defined elsewhere in the script.
    """
    return pdftotree.parse(
        input_pdf_file,
        html_path=output_html_file,
        model_type="visual",
        model_path="input/paleo_visual_model.h5",
    )
Exemple #13
0
def test_vision_completion():
    """Smoke test: the vision-based parse completes and yields output."""
    result = pdftotree.parse(
        "tests/input/paleo.pdf",
        model_type="vision",
        model_path="tests/input/paleo_visual_model.h5",
    )
    assert result is not None
Exemple #14
0
def test_vision_completion():
    """Vision-based parse should detect both tables in paleo.pdf."""
    html = pdftotree.parse(
        "tests/input/paleo.pdf",
        model_type="vision",
        model_path="tests/input/paleo_visual_model.h5",
    )
    tables = BeautifulSoup(html, "lxml").find_all("table")
    assert len(tables) == 2
Exemple #15
0
def heuristic_parsing():
    """Run the default (heuristic) parse and return pdftotree's output.

    NOTE(review): relies on module-level ``input_pdf_file`` and
    ``output_html_file`` being defined elsewhere in the script.
    """
    return pdftotree.parse(
        input_pdf_file,
        html_path=output_html_file,
        model_type=None,
        model_path=None,
        favor_figures=True,
        visualize=False,
    )
def test_cell_values_not_missing():
    """The fourth table row keeps every one of its cell values."""
    soup = BeautifulSoup(pdftotree.parse("tests/input/md.pdf"), "lxml")
    fourth_row = soup.find(class_="ocr_table").find_all("tr")[3]
    expected = ["Erin", "lamb", "madras", "HOT", "$5"]
    assert list(fourth_row.stripped_strings) == expected
Exemple #17
0
def test_output_should_conform_to_hocr(tmp_path):
    """Test if an exported file conform to hOCR."""
    html_path = os.path.join(tmp_path, "md.html")
    pdftotree.parse("tests/input/md.pdf", html_path)
    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
        assert all(ln.decode("utf-8").startswith("ok") for ln in proc.stderr)

    # hocr-check only verifies that "ocr-capabilities" exists;
    # additionally pin its exact content here.
    with open(html_path) as fp:
        soup = BeautifulSoup(fp, "lxml")
    meta = soup.find("meta", attrs={"name": "ocr-capabilities"})
    expected = ["ocr_page", "ocr_table", "ocrx_block", "ocrx_line", "ocrx_word"]
    assert meta["content"].split() == expected
Exemple #18
0
def main():
    """Parse PDFwork.pdf and write the extracted data to final_data.csv."""
    # NOTE(review): `pdf` is presumably an alias for pdftotree — confirm import.
    html = pdf.parse('PDFwork.pdf', html_path=None)
    df = format_df(html)
    cor_df = text_coordinate(html)
    index_list = check_index(cor_df)
    final_df = create_df(index_list, df)
    final_df.to_csv('final_data.csv')
Exemple #19
0
def test_looks_scanned():
    """Test on a PDF that looks like a scanned one but not.

    CaseStudy_ACS.pdf contains a transparent image overlaying the entire page.
    This overlaying transparent image fools TreeExtractor into thinking it is scanned.
    """
    soup = BeautifulSoup(pdftotree.parse("tests/input/CaseStudy_ACS.pdf"), "lxml")
    assert len(soup.find_all(class_="ocrx_word")) >= 1000
    assert len(soup.find_all("figure")) == 3

    # Words overlapped by a figure must still be extracted (#77);
    # checking only the first page is good enough.
    page = soup.find(class_="ocr_page")
    word_boxes = [get_bbox(w) for w in page.find_all(class_="ocrx_word")]
    figure_box = get_bbox(page.find("figure"))
    assert all(figure_box.contains(box) for box in word_boxes)
Exemple #20
0
def getSectionHeaders(casefile):
    """Extract section headers from an uploaded case PDF.

    Returns a tuple ``(main_title, footer, final_headers, remove_titles)``:
    the most frequent header text (stripped), the text of the last header,
    the headers that look like real section titles, and the candidates that
    were filtered out.

    Raises IndexError when the document has no ``section_header`` elements
    and ValueError when no titles survive filtering (pre-existing behavior).
    """
    tree = pdftotree.parse('/code/uploads/' + casefile,
                           html_path=None,
                           model_type=None,
                           model_path=None,
                           favor_figures=True,
                           visualize=False)
    parsed_html = BeautifulSoup(tree, features='html.parser')
    headers = parsed_html.find_all('section_header')
    titles = []
    remove_titles = []
    footer = headers[-1].text
    for header in headers:
        title = header.text
        # Guard `title` first so an empty header text cannot raise
        # IndexError on title[0] (bug fix).
        if len(title) < 75 and not title.isnumeric() and title and title[0].istitle():
            titles.append(title)
        else:
            remove_titles.append(title)
    # Drop everything from the first "Exhibit"-like header onwards. If no
    # such header exists, keep all titles — previously `exhibit_start` was
    # left unbound and the slice below raised NameError (bug fix).
    exhibit_start = len(titles)
    for i, title in enumerate(titles):
        if re.search('xhi', title):
            exhibit_start = i
            break
    titles = titles[:exhibit_start]
    main_title = max(set(titles), key=titles.count)
    titles = [x.strip() for x in titles if x != main_title]
    final_headers = []
    for title in titles:
        # A title qualifies when every word is either short (< 4 chars) or
        # starts title-cased without containing a colon.
        word_checks = []
        for word in title.split(' '):
            if len(word) < 4:
                word_checks.append(True)
            elif word[0].istitle() and not re.search(':', word):
                word_checks.append(True)
            else:
                word_checks.append(False)
        if all(word_checks):
            final_headers.append(title)
    return main_title.strip(), footer.strip(), final_headers, remove_titles
Exemple #21
0
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

        text = fake_file_handle.getvalue()

    # close open handles
    converter.close()
    fake_file_handle.close()

    if text:
        return text


# Script entry point: print the text extracted from a local PDF.
if __name__ == '__main__':
    print(
        extract_text_from_pdf(
            'C:/Users/gjave/Desktop/Biergarten2018-BASISMENU-DRIELUIK-JUNI-FOOD.pdf'
        ))

# NOTE(review): imports placed mid-file, after the entry-point guard, and the
# parse() return value below is discarded — presumably leftover scratch code.
from pdftablr.table_extractor import Extractor
import pdftotree
pdftotree.parse(
    'C:/Users/gjave/Desktop/Biergarten2018-BASISMENU-DRIELUIK-JUNI-FOOD.pdf',
    html_path=None,
    model_type=None,
    model_path=None,
    favor_figures=True,
    visualize=False)
Exemple #22
0
def test_visualize_output(tmp_path):
    """Test that an output can be visualized (visualize=True runs cleanly)."""
    target = os.path.join(tmp_path, "md.html")
    pdftotree.parse("tests/input/md.pdf", target, visualize=True)
Exemple #23
0
# Batch-convert the "Unclean PDF" folder to HTML files, tracking failures.
import pdftotree
import tkinter
from pathlib import Path
import os

# Source directory of raw PDFs and destination directory for the HTML output.
input_path = Path(
    'C:\\Users\\Dan\'s Laptop\\OneDrive\\Documents\\Coding\\Python\\USF\\COVID-19 Project\\Data\\Unclean PDF'
)
output_path = Path(
    'C:\\Users\\Dan\'s Laptop\\OneDrive\\Documents\\Coding\\Python\\USF\\COVID-19 Project\\Data\\html'
)
file_n = len(os.listdir(input_path))  # total file count, for the progress message
file_ct = 0  # files processed so far
error_list = []  # files whose HTML output could not be written
for files in os.listdir(input_path):
    file_ct += 1
    print(files)
    new_file_name = files.replace('.pdf', '')
    html = pdftotree.parse('Data\\Unclean PDF\\' + files)
    try:
        with open(f"{output_path}/{new_file_name}.html", 'w') as f:
            f.write(html)
    except:  # NOTE(review): bare except hides the actual write error
        error_list.append(files)
    print(f'{round((file_ct/file_n),3)*100}% Complete')
    break  # NOTE(review): stops after the first file — likely a debugging leftover
print(error_list)
Exemple #24
0
def test_cli_should_output_at_given_path(tmp_path):
    """Test if CLI produces an HTML at a given path."""
    target = os.path.join(tmp_path, "paleo.html")
    pdftotree.parse("tests/input/paleo.pdf", target)
    assert os.path.isfile(target)
Exemple #25
0
# Parse a sample PDF from the mail-server working directory and print the HTML.
import pdftotree
filename = "sample.pdf"
outputDir = "/home/moshe/workspace/projects/Automation/mail_server/"
# `a` receives the HTML string since html_path is None.
a = pdftotree.parse(outputDir + filename,
                    html_path=None,
                    model_type=None,
                    model_path=None,
                    visualize=False)
print(a)
Exemple #26
0
def test_heuristic_completion():
    """Simply test that parse runs to completion without errors."""
    assert pdftotree.parse("tests/input/paleo.pdf") is not None
Exemple #27
0
def parse_pdf_file(file):
    """Parse a quiz PDF into structured question data.

    Walks the hOCR output of pdftotree line by line, detecting question
    blocks, their five lettered propositions (A-E), the correct answers
    (inferred from correct/wrong/should .png image names found via Tika),
    and an optional "Commentaire:" explanation per question.

    Returns a dict mapping question index to a dict with keys
    ``title``, ``itemA``..``itemE``, ``correctA``..``correctE`` and
    ``explanation``.
    """
    pdf_to_html = pdftotree.parse(file,
                                  html_path=None,
                                  model_type=None,
                                  model_path=None,
                                  visualize=False)
    soup = BeautifulSoup(pdf_to_html, 'lxml')

    #Tika to get image links
    parsed = parser.from_file(file)
    content = parsed["content"]
    regex = r"(correct|wrong|should)\.png"
    urls = re.findall(regex, content)

    #Initial variables
    letters = ["A", "B", "C", "D", "E"]
    items = [[]]
    iteration = 0
    question = -1
    propositions = [[]]
    titles = []
    explanations = []

    question_block = False
    question_blocks = 0

    #Parsing the pdf file
    for line in soup.find_all("span", attrs={'class': 'ocrx_line'}):
        for word in line.find("span",
                              attrs={'class':
                                     'ocrx_word'}):  #Get the first occurence
            if (word.string in letters):

                # A complete question has exactly 5 propositions; seeing a
                # letter after that means the block has ended.
                if len(propositions[question]) == 5:
                    if not question_block:
                        print("Not in question block")
                    else:
                        print("--QUESTION {} END--".format(question))
                        print("\n")
                        question_block = False

                try:
                    #Check if a new question block begins
                    c = line.parent.find_previous_sibling("div").find(
                        "span", attrs={'class': 'ocrx_word'})
                    if c.string == "QUESTION":
                        #Checking for question integrity
                        if len(propositions[question]) == 5:
                            question_block = True
                            question_blocks += 1
                            items.append([])
                            propositions.append([])
                            question += 1
                            print("--QUESTION {} START--".format(question))

                        #Brand new question
                        elif len(propositions[question]
                                 ) == 0 and question == -1:
                            question_block = True
                            question_blocks += 1
                            items.append([])
                            propositions.append([])
                            question += 1
                            print("--QUESTION {} START--".format(question))

                        #If there aren't all the 5 propositions and this is not the first
                        else:
                            items[question].clear()
                            propositions[question].clear()
                            titles.pop()
                            question_block = True
                            print("--uncompleted question detected--")
                            print("\n")
                            print("--QUESTION {} START--".format(question))

                    #Checking for explanation section
                    if c.string == "Commentaire:":
                        question_block = False

                # NOTE(review): bare except silently recovers from any
                # navigation failure (e.g. missing previous sibling).
                except:
                    question_block = False
                    print("An exception was detected!")

                #Checked supposed length of answers
                right_letter = False
                for letter, length in zip(letters, range(5)):
                    if word.string == letter:
                        right_letter = len(propositions[question]) == length
                        # print("{} is on position {} and should be on {}".format(letter, len(propositions[question]), length))

                #Get question title
                if (word.string == "A") and question_block and right_letter:
                    first = True
                    title = ""
                    for w in line.parent.find_previous_sibling("div").find_all(
                            "span", attrs={'class': 'ocrx_word'}):
                        title += "" if first else " "
                        title += w.string
                        if first:
                            first = False
                    title = re.sub(
                        r'QUESTION\sN°\s(\d)+\s', '',
                        title)  #Remove question number from the title
                    title = html.unescape(title)
                    titles.append(title)
                    print("-title detected")

                if right_letter:
                    print(word.string)

                #Match the correct answers
                # A preceding <figure> marks this proposition as flagged by an
                # answer image; urls[] then decides whether it counts as correct.
                if question_block and right_letter:
                    text = "t" if line.parent.find_previous_sibling(
                    ).name == "figure" else ""

                    #Append only correct and should answers
                    if text == "t":
                        try:
                            if urls[iteration] == "wrong":
                                text = ""
                        except:
                            text = ""
                        iteration += 1
                    items[question].append(word.string + text)

                #Get the item text
                if question_block and right_letter:
                    string = ""
                    first = True
                    for w in line.parent.find_all("span",
                                                  attrs={'class':
                                                         'ocrx_word'}):
                        string += "" if first else " "  #Adding space between words
                        string += w.string
                        if first:
                            first = False

                    reg1 = word.string + " - "  #Remove "A - "
                    reg2 = word.string + ". "  #Remove "A. "
                    string = re.sub(reg1, '', string)
                    string = re.sub(reg2, '', string)
                    string = html.unescape(string)
                    propositions[question].append(string)
                    # print("The proposition was appended!")

                #Get the explanation
                if question_block and right_letter:
                    if word.string == "E":
                        explanation = ""
                        first = True

                        #Adding extra check when accessing explanations
                        try:
                            if line.parent.find_next_sibling("div").find(
                                    "span", attrs={
                                        "class": "ocrx_word"
                                    }).string == "Commentaire:":
                                for w in line.parent.find_next_sibling(
                                        "div").find_all(
                                            "span",
                                            attrs={"class": "ocrx_word"}):
                                    explanation += "" if first else " "
                                    explanation += w.string
                                    if first:
                                        first = False
                        except:
                            explanation = ""

                        explanations.append(explanation)
                        print("-explanation detected")

    #Account for errors in the last question (not detected by the loop)
    if len(propositions[-1]) != 5 and len(titles) != 0:
        question_blocks -= 1
        propositions.pop()
        items.pop()
        titles.pop()

    #Get correct answers in a list that contains the letters
    correct_items = []
    for index in range(len(items)):
        correct_items.append(
            [item[0] for item in items[index] if len(item) > 1])

    print("\n")
    print("Results:")
    print("\tQuestions: {}".format(question_blocks))
    print("\tTitles: {}".format(len(titles)))
    print("\tExplanations: {}".format(len(explanations)))

    # Assemble the final per-question payload.
    question_data = {}
    for idx in range(len(titles)):
        question_data[idx] = {
            "title": titles[idx],
            "itemA": propositions[idx][0],
            "itemB": propositions[idx][1],
            "itemC": propositions[idx][2],
            "itemD": propositions[idx][3],
            "itemE": propositions[idx][4],
            "correctA": True if "A" in correct_items[idx] else False,
            "correctB": True if "B" in correct_items[idx] else False,
            "correctC": True if "C" in correct_items[idx] else False,
            "correctD": True if "D" in correct_items[idx] else False,
            "correctE": True if "E" in correct_items[idx] else False,
            "explanation": explanations[idx]
        }

    return question_data
Exemple #28
0
# Convert one PDF to an HTML file on disk.
import pdftotree

filename = 'CH0382903356e.pdf'
outputfile = 'CH0382903356e.html'
# Fixed: the original call ended with a stray ':' which made this a SyntaxError.
pdftotree.parse(filename, outputfile, model_type=None, model_path=None, favor_figures=True, visualize=False)
Exemple #29
0
def test_overflowerror_should_not_happen():
    """Test if OverflowError does not happen (#104)."""
    pdf = "tests/input/UACompanionSpecificationforIEC611313Model_p41.pdf"
    assert pdftotree.parse(pdf) is not None
import pdftotree

# Parse with all defaults; the returned HTML string is discarded.
pdftotree.parse(pdf_file="doc.pdf")