def test_ml_completion():
    """Smoke test: the ML-based parse runs to completion without raising."""
    result = pdftotree.parse(
        "tests/input/paleo.pdf",
        model_type="ml",
        model_path="tests/input/paleo_model.pkl",
    )
    assert result is not None
def test_LTChar_under_LTFigure(tmp_path):
    """Check parsing of a PDF whose LTChar objects sit under an LTFigure."""
    html_path = os.path.join(tmp_path, "paleo.html")
    pdftotree.parse("tests/input/CentralSemiconductorCorp_2N4013.pdf", html_path)
    with open(html_path) as f:
        soup = BeautifulSoup(f, "lxml")

    line: Tag = soup.find(class_="ocrx_line")
    words = [w.text for w in line.find_all(class_="ocrx_word")]
    assert words == ["Small", "Signal", "Transistors"]

    # The table in the 1st page should contain 18 columns
    page = soup.find(class_="ocr_page")
    table = page.find(class_="ocr_table")
    first_row_cells = table.find("tr").find_all("td")
    assert len(first_row_cells) == 18
    assert get_bbox(table) is not None

    # A cell wrapping one or more ocrx_word elements must also carry a bbox.
    cell = table.find(class_="ocrx_word").parent.parent
    assert get_bbox(cell) is not None

    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
        assert all(ln.decode("utf-8").startswith("ok") for ln in proc.stderr)
def test_output_should_conform_to_hocr(tmp_path):
    """An exported file must satisfy the hocr-check validator."""
    html_path = os.path.join(tmp_path, "md.html")
    pdftotree.parse("tests/input/md.pdf", html_path)
    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
        checks = [ln.decode("utf-8").startswith("ok") for ln in proc.stderr]
    assert all(checks)
def test_no_out_of_order(caplog):
    """Parsing must not emit any 'Out of order' warning."""
    for pdf in ("tests/input/md.pdf", "tests/input/paleo.pdf"):
        pdftotree.parse(pdf)
        assert "Out of order" not in caplog.text
def converter(ifile: str, ofile: str):
    """Convert a PDF into HTML using pdftotree.

    :params ifile: input pdf file
    :params ofile: output html file
    """
    pdftotree.parse(
        ifile,
        html_path=ofile,
        model_type=None,
        model_path=None,
        favor_figures=True,
        visualize=False,
    )
def test_looks_scanned():
    """Test on a PDF that looks like a scanned one but not.

    CaseStudy_ACS.pdf contains a transparent image overlaying the entire page.
    This overlaying transparent image fools TreeExtractor into thinking it is
    scanned.
    """
    # NOTE(review): favor_figures is passed as the *string* "True"/"False",
    # not a bool; both strings are truthy in Python, so this only behaves as
    # intended if pdftotree compares against the literal strings — confirm.
    output = pdftotree.parse("tests/input/CaseStudy_ACS.pdf", favor_figures="True")
    assert output.count("ocrx_word") == 1  # single appearance in ocr-capabilities
    output = pdftotree.parse("tests/input/CaseStudy_ACS.pdf", favor_figures="False")
    assert output.count("ocrx_word") >= 1000
def convert_pdf_to_html(self, document_path):
    """Render the document at *document_path* to an HTML file under
    MEDIA_ROOT and return the output path."""
    name, _extension = self.get_document_name_and_extension(document_path)
    location_path = os.path.join(MEDIA_ROOT, name + ".html")
    pdftotree.parse(
        document_path,
        html_path=location_path,
        model_type=None,
        model_path=None,
        visualize=False,
    )
    return location_path
def test_tabula_warning_suppressed(caplog):
    """Tabula/PDFBox warnings are hidden unless DEBUG logging is enabled."""
    # Default log level: pdfbox chatter must not reach the captured log.
    pdftotree.parse("tests/input/112823.pdf")
    assert "org.apache.pdfbox" not in caplog.text

    # With DEBUG enabled on the pdftotree logger, the warnings come through.
    logging.getLogger("pdftotree").setLevel(logging.DEBUG)
    pdftotree.parse("tests/input/112823.pdf")
    assert "org.apache.pdfbox" in caplog.text
def main():
    """Convert every PDF found under ./Split into an HTML file under ./HTML."""
    pdf_paths = [
        os.path.join("./Split", os.fsdecode(f))
        for f in os.listdir("./Split")
        if os.fsdecode(f).endswith(".pdf")
    ]
    for pdf_path in pdf_paths:
        # Write each document to its own .html file. The original passed the
        # bare directory "./HTML/" as html_path, which cannot be opened for
        # writing and would have conflated every document's output anyway.
        base = os.path.splitext(os.path.basename(pdf_path))[0]
        html_path = os.path.join("./HTML", base + ".html")
        pdftotree.parse(pdf_path, html_path=html_path)
        print(pdf_path + " Converted to HTML \n")
def test_text_is_escaped():
    """Words in the output HTML must keep their entities escaped."""

    def nth_word_markup(pdf, index):
        # Use str() instead of .text as the latter gives unescaped text.
        soup = BeautifulSoup(pdftotree.parse(pdf), "lxml")
        word = soup.find_all(class_="ocrx_word")[index]
        return re.search(r">(.+?)<", str(word))[1]

    assert nth_word_markup("tests/input/md.pdf", 66) == "'bar';."
    assert nth_word_markup("tests/input/112823.pdf", 152) == "&"
def test_figures():
    """Embedded figures are emitted as data-URI <img> tags."""
    soup = BeautifulSoup(pdftotree.parse("tests/input/md.pdf"), "lxml")
    assert len(soup.find_all("img")) == 1

    soup = BeautifulSoup(pdftotree.parse("tests/input/CaseStudy_ACS.pdf"), "lxml")
    imgs = soup.find_all("img")
    # 3 jpg, 2 bmp, 5 total images
    assert len(imgs) == 5
    jpegs = [i for i in imgs if i["src"].startswith("data:image/jpeg")]
    bmps = [i for i in imgs if i["src"].startswith("data:image/bmp")]
    assert len(jpegs) == 3
    assert len(bmps) == 2
def visual_parsing():
    """Run the visual-model-based parse and return its HTML output."""
    return pdftotree.parse(
        input_pdf_file,
        html_path=output_html_file,
        model_type="visual",
        model_path="input/paleo_visual_model.h5",
    )
def test_vision_completion():
    """Smoke test: the vision-based parse runs without errors."""
    html = pdftotree.parse(
        "tests/input/paleo.pdf",
        model_type="vision",
        model_path="tests/input/paleo_visual_model.h5",
    )
    assert html is not None
def test_vision_completion():
    """Vision-based parse runs and finds the expected two tables."""
    html = pdftotree.parse(
        "tests/input/paleo.pdf",
        model_type="vision",
        model_path="tests/input/paleo_visual_model.h5",
    )
    tables = BeautifulSoup(html, "lxml").find_all("table")
    assert len(tables) == 2
def heuristic_parsing():
    """Run the default heuristic parse to completion and return its output."""
    return pdftotree.parse(
        input_pdf_file,
        html_path=output_html_file,
        model_type=None,
        model_path=None,
        favor_figures=True,
        visualize=False,
    )
def test_cell_values_not_missing():
    """Row 3 of the md.pdf table must keep all of its cell values."""
    soup = BeautifulSoup(pdftotree.parse("tests/input/md.pdf"), "lxml")
    rows = soup.find(class_="ocr_table").find_all("tr")
    expected = ["Erin", "lamb", "madras", "HOT", "$5"]
    assert list(rows[3].stripped_strings) == expected
def test_output_should_conform_to_hocr(tmp_path):
    """Exported HTML must pass hocr-check and list the exact capabilities."""
    html_path = os.path.join(tmp_path, "md.html")
    pdftotree.parse("tests/input/md.pdf", html_path)

    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
        assert all(ln.decode("utf-8").startswith("ok") for ln in proc.stderr)

    # hocr-check only verifies that "ocr-capabilities" exists; additionally
    # pin down the exact capability list.
    with open(html_path) as fp:
        soup = BeautifulSoup(fp, "lxml")
    meta = soup.find("meta", attrs={"name": "ocr-capabilities"})
    assert meta["content"].split() == [
        "ocr_page",
        "ocr_table",
        "ocrx_block",
        "ocrx_line",
        "ocrx_word",
    ]
def main():
    """Parse PDFwork.pdf, assemble the final dataframe, and write it as CSV."""
    html = pdf.parse('PDFwork.pdf', html_path=None)
    base_df = format_df(html)
    coordinates = text_coordinate(html)
    indices = check_index(coordinates)
    create_df(indices, base_df).to_csv('final_data.csv')
def test_looks_scanned():
    """Test on a PDF that looks like a scanned one but not.

    CaseStudy_ACS.pdf contains a transparent image overlaying the entire
    page, which fools TreeExtractor into thinking the page is scanned.
    """
    soup = BeautifulSoup(pdftotree.parse("tests/input/CaseStudy_ACS.pdf"), "lxml")
    assert len(soup.find_all(class_="ocrx_word")) >= 1000
    assert len(soup.find_all("figure")) == 3

    # Words overlapped by a figure must still be extracted (#77).
    page = soup.find(class_="ocr_page")  # checking only 1st page is good enough
    figure_bbox = get_bbox(page.find("figure"))
    word_bboxes = [get_bbox(w) for w in page.find_all(class_="ocrx_word")]
    assert all(figure_bbox.contains(b) for b in word_bboxes)
def getSectionHeaders(casefile):
    """Extract the main title, footer, and section headers from a case PDF.

    :param casefile: file name of the PDF under /code/uploads/
    :returns: tuple ``(main_title, footer, final_headers, remove_titles)``
    """
    tree = pdftotree.parse('/code/uploads/' + casefile,
                           html_path=None,
                           model_type=None,
                           model_path=None,
                           favor_figures=True,
                           visualize=False)
    parsed_html = BeautifulSoup(tree, features='html.parser')
    headers = parsed_html.find_all('section_header')
    titles = []
    remove_titles = []
    footer = headers[-1].text
    for header in headers:
        title = header.text
        # Keep short, non-numeric, title-cased candidates. The truthiness
        # guard prevents an IndexError (title[0]) on blank header text.
        if title and len(title) < 75 and not title.isnumeric() and title[0].istitle():
            titles.append(title)
        else:
            remove_titles.append(title)
    # Truncate at the first "Exhibit" header. Default to keeping every
    # title: the original left exhibit_start unbound (NameError) when no
    # title matched 'xhi'.
    exhibit_start = len(titles)
    for i, title in enumerate(titles):
        if re.search('xhi', title):
            exhibit_start = i
            break
    titles = titles[:exhibit_start]
    # The most frequently repeated title is taken as the document's main title.
    main_title = max(set(titles), key=titles.count)
    titles = [x.strip() for x in titles if x != main_title]
    final_headers = []
    for title in titles:
        # A header qualifies when every word is short (<4 chars), or is
        # title-cased and contains no colon.
        word_checks = []
        for word in title.split(' '):
            if len(word) < 4:
                word_checks.append(True)
            elif word[0].istitle() and not re.search(':', word):
                word_checks.append(True)
            else:
                word_checks.append(False)
        if all(word_checks):
            final_headers.append(title)
    return main_title.strip(), footer.strip(), final_headers, remove_titles
# NOTE(review): this fragment is the tail of extract_text_from_pdf; its
# `def` line is outside this view, so the indentation below assumes one
# enclosing function level for the first section.
with open(pdf_path, 'rb') as fh:
    # Feed every page of the PDF through the pdfminer interpreter; the
    # extracted text accumulates in fake_file_handle.
    for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
        page_interpreter.process_page(page)
    text = fake_file_handle.getvalue()

# close open handles
converter.close()
fake_file_handle.close()

if text:
    return text


if __name__ == '__main__':
    print(
        extract_text_from_pdf(
            'C:/Users/gjave/Desktop/Biergarten2018-BASISMENU-DRIELUIK-JUNI-FOOD.pdf'
        ))

# Demonstration: run pdftotree over the same document, discarding the result.
from pdftablr.table_extractor import Extractor
import pdftotree

pdftotree.parse(
    'C:/Users/gjave/Desktop/Biergarten2018-BASISMENU-DRIELUIK-JUNI-FOOD.pdf',
    html_path=None,
    model_type=None,
    model_path=None,
    favor_figures=True,
    visualize=False)
def test_visualize_output(tmp_path):
    """Parsing with visualize=True must complete without errors."""
    out = os.path.join(tmp_path, "md.html")
    pdftotree.parse("tests/input/md.pdf", out, visualize=True)
import pdftotree
import tkinter
from pathlib import Path
import os

# Batch-convert every PDF in the "Unclean PDF" folder to HTML, tracking
# progress and collecting the names of files that failed to write.
input_path = Path(
    'C:\\Users\\Dan\'s Laptop\\OneDrive\\Documents\\Coding\\Python\\USF\\COVID-19 Project\\Data\\Unclean PDF'
)
output_path = Path(
    'C:\\Users\\Dan\'s Laptop\\OneDrive\\Documents\\Coding\\Python\\USF\\COVID-19 Project\\Data\\html'
)

file_n = len(os.listdir(input_path))
file_ct = 0
error_list = []

for files in os.listdir(input_path):
    file_ct += 1
    print(files)
    new_file_name = files.replace('.pdf', '')
    html = pdftotree.parse('Data\\Unclean PDF\\' + files)
    try:
        with open(f"{output_path}/{new_file_name}.html", 'w') as f:
            f.write(html)
    # Narrowed from a bare `except` that swallowed every error,
    # including KeyboardInterrupt.
    except OSError:
        error_list.append(files)
    print(f'{round((file_ct/file_n),3)*100}% Complete')
    # The original ended the loop body with an unconditional `break`, so
    # only the first PDF was ever converted despite the progress counter
    # and error list; removed so the whole directory is processed.

print(error_list)
def test_cli_should_output_at_given_path(tmp_path):
    """The CLI writes an HTML file at the requested location."""
    target = os.path.join(tmp_path, "paleo.html")
    pdftotree.parse("tests/input/paleo.pdf", target)
    assert os.path.isfile(target)
import pdftotree

# Parse the sample PDF from the mail-server workspace and print the HTML.
filename = "sample.pdf"
outputDir = "/home/moshe/workspace/projects/Automation/mail_server/"
a = pdftotree.parse(
    outputDir + filename,
    html_path=None,
    model_type=None,
    model_path=None,
    visualize=False,
)
print(a)
def test_heuristic_completion():
    """Smoke test: the default heuristic parse runs to completion."""
    html = pdftotree.parse("tests/input/paleo.pdf")
    assert html is not None
def parse_pdf_file(file):
    """Parse a quiz PDF into a dict of per-question data.

    Walks the hOCR output line by line, detecting QUESTION blocks, the five
    lettered propositions (A-E), which answers are marked via the
    correct/wrong/should marker images, and the "Commentaire:" explanations.

    :param file: path to the quiz PDF
    :returns: dict mapping question index -> {title, itemA..itemE,
        correctA..correctE, explanation}
    """
    pdf_to_html = pdftotree.parse(file, html_path=None, model_type=None, model_path=None, visualize=False)
    soup = BeautifulSoup(pdf_to_html, 'lxml')
    # Tika to get image links (the correct/wrong/should answer markers)
    parsed = parser.from_file(file)
    content = parsed["content"]
    regex = r"(correct|wrong|should)\.png"
    urls = re.findall(regex, content)
    # Initial variables
    letters = ["A", "B", "C", "D", "E"]
    items = [[]]  # per-question list of "letter(+marker)" strings
    iteration = 0  # index into the marker-image urls
    question = -1  # current question index
    propositions = [[]]  # per-question list of proposition texts
    titles = []
    explanations = []
    question_block = False
    question_blocks = 0
    # Parsing the pdf file
    for line in soup.find_all("span", attrs={'class': 'ocrx_line'}):
        # NOTE(review): line.find(...) returns a single span; iterating it
        # walks that span's *children* — confirm this is intended.
        for word in line.find("span", attrs={'class': 'ocrx_word'}):
            # Get the first occurence
            if (word.string in letters):
                if len(propositions[question]) == 5:
                    if not question_block:
                        print("Not in question block")
                    else:
                        print("--QUESTION {} END--".format(question))
                        print("\n")
                        question_block = False
                try:
                    # Check if a new question block begins
                    c = line.parent.find_previous_sibling("div").find(
                        "span", attrs={'class': 'ocrx_word'})
                    if c.string == "QUESTION":
                        # Checking for question integrity
                        if len(propositions[question]) == 5:
                            question_block = True
                            question_blocks += 1
                            items.append([])
                            propositions.append([])
                            question += 1
                            print("--QUESTION {} START--".format(question))
                        # Brand new question
                        elif len(propositions[question]
                                 ) == 0 and question == -1:
                            question_block = True
                            question_blocks += 1
                            items.append([])
                            propositions.append([])
                            question += 1
                            print("--QUESTION {} START--".format(question))
                        # If there aren't all the 5 propositions and this is not the first
                        else:
                            items[question].clear()
                            propositions[question].clear()
                            titles.pop()
                            question_block = True
                            print("--uncompleted question detected--")
                            print("\n")
                            print("--QUESTION {} START--".format(question))
                    # Checking for explanation section
                    if c.string == "Commentaire:":
                        question_block = False
                except:
                    # NOTE(review): bare except hides real failures; narrow
                    # it once the expected failure modes are known.
                    question_block = False
                    print("An exception was detected!")
                # Checked supposed length of answers
                right_letter = False
                for letter, length in zip(letters, range(5)):
                    if word.string == letter:
                        right_letter = len(propositions[question]) == length
                        # print("{} is on position {} and should be on {}".format(letter, len(propositions[question]), length))
                # Get question title
                if (word.string == "A") and question_block and right_letter:
                    first = True
                    title = ""
                    for w in line.parent.find_previous_sibling("div").find_all(
                            "span", attrs={'class': 'ocrx_word'}):
                        title += "" if first else " "
                        title += w.string
                        if first:
                            first = False
                    # NOTE(review): "NĀ°" looks like mojibake for "N°" —
                    # verify against the source PDFs before changing.
                    title = re.sub(
                        r'QUESTION\sNĀ°\s(\d)+\s', '',
                        title)  # Remove question number from the title
                    title = html.unescape(title)
                    titles.append(title)
                    print("-title detected")
                if right_letter:
                    print(word.string)
                # Match the correct answers
                if question_block and right_letter:
                    text = "t" if line.parent.find_previous_sibling(
                    ).name == "figure" else ""
                    # Append only correct and should answers
                    if text == "t":
                        try:
                            if urls[iteration] == "wrong":
                                text = ""
                        except:
                            text = ""
                        iteration += 1
                    items[question].append(word.string + text)
                # Get the item text
                if question_block and right_letter:
                    string = ""
                    first = True
                    for w in line.parent.find_all("span",
                                                  attrs={'class': 'ocrx_word'}):
                        string += "" if first else " "  # Adding space between words
                        string += w.string
                        if first:
                            first = False
                    reg1 = word.string + " - "  # Remove "A - "
                    reg2 = word.string + ". "  # Remove "A. "
                    string = re.sub(reg1, '', string)
                    string = re.sub(reg2, '', string)
                    string = html.unescape(string)
                    propositions[question].append(string)
                    # print("The proposition was appended!")
                # Get the explanation
                if question_block and right_letter:
                    if word.string == "E":
                        explanation = ""
                        first = True
                        # Adding extra check when accessing explanations
                        try:
                            if line.parent.find_next_sibling("div").find(
                                    "span", attrs={
                                        "class": "ocrx_word"
                                    }).string == "Commentaire:":
                                for w in line.parent.find_next_sibling(
                                        "div").find_all(
                                            "span",
                                            attrs={"class": "ocrx_word"}):
                                    explanation += "" if first else " "
                                    explanation += w.string
                                    if first:
                                        first = False
                        except:
                            explanation = ""
                        explanations.append(explanation)
                        print("-explanation detected")
    # Account for errors in the last question (not detected by the loop)
    if len(propositions[-1]) != 5 and len(titles) != 0:
        question_blocks -= 1
        propositions.pop()
        items.pop()
        titles.pop()
    # Get correct answers in a list that contains the letters
    correct_items = []
    for index in range(len(items)):
        correct_items.append(
            [item[0] for item in items[index] if len(item) > 1])
    print("\n")
    print("Results:")
    print("\tQuestions: {}".format(question_blocks))
    print("\tTitles: {}".format(len(titles)))
    print("\tExplanations: {}".format(len(explanations)))
    question_data = {}
    for idx in range(len(titles)):
        question_data[idx] = {
            "title": titles[idx],
            "itemA": propositions[idx][0],
            "itemB": propositions[idx][1],
            "itemC": propositions[idx][2],
            "itemD": propositions[idx][3],
            "itemE": propositions[idx][4],
            "correctA": True if "A" in correct_items[idx] else False,
            "correctB": True if "B" in correct_items[idx] else False,
            "correctC": True if "C" in correct_items[idx] else False,
            "correctD": True if "D" in correct_items[idx] else False,
            "correctE": True if "E" in correct_items[idx] else False,
            "explanation": explanations[idx]
        }
    return question_data
import pdftotree  # was commented out, leaving pdftotree undefined below

filename = 'CH0382903356e.pdf'
outputfile = 'CH0382903356e.html'

# Convert the PDF to HTML. The original call ended with a stray ":" which
# made the statement a SyntaxError.
pdftotree.parse(filename,
                outputfile,
                model_type=None,
                model_path=None,
                favor_figures=True,
                visualize=False)
def test_overflowerror_should_not_happen():
    """Regression test for #104: parsing must not raise OverflowError."""
    result = pdftotree.parse(
        "tests/input/UACompanionSpecificationforIEC611313Model_p41.pdf")
    assert result is not None
import pdftotree

# Minimal usage example: parse a PDF, discarding the returned HTML string.
pdftotree.parse(pdf_file="doc.pdf")