def printLine(self, line):
    """Print ``line`` at the current insert point, wrapping at spaces.

    The horizontal budget is ``width - margin - self.indent -
    self.insertPoint.x`` (``width`` and ``margin`` are module-level names).
    A line narrower than the budget is printed as a single segment.
    Otherwise the longest prefix ending at a space that is not wider than
    the budget is printed, ``self.cr(chr(0xAC))`` is called (presumably a
    line break with a continuation marker -- confirm against ``cr``), and
    the remainder is passed to ``printLine`` again because it may still be
    too long.

    If no space yields a fitting prefix, ``self.cr(chr(0xAC))`` is called
    and the whole line is printed unwrapped.
    """
    sty = style.currentStyle()
    fontName = sty.font.name
    fontSize = sty.fontsize
    lineWidth = fitz.getTextlength(line, fontName, fontSize)
    budget = width - margin - self.indent - self.insertPoint.x

    if lineWidth < budget:
        self.printSegment(line)
        return

    # Search for break candidates from right to left; ``searchEnd`` is the
    # exclusive upper bound for the next space lookup.
    searchEnd = len(line)
    while True:
        cut = line.rfind(" ", 0, searchEnd)
        if cut == -1:
            # No (further) space to break at: emit everything after a break.
            self.cr(chr(0xAC))
            self.printSegment(line)
            return

        head = line[:cut]
        headWidth = fitz.getTextlength(head, fontname=fontName, fontsize=fontSize)
        if headWidth > budget:
            # Still too wide: try the previous space.
            searchEnd = cut
            continue

        self.printSegment(head)
        # The tail may itself need wrapping, so recurse after the break.
        self.cr(chr(0xAC))
        self.printLine(line[cut + 1:])
        return
def _countWrappedLines(lines, fontSize, boxWidth):
    """Estimate how many rows ``lines`` need in a box ``boxWidth`` wide.

    Each entry counts as one row plus one more row for every full
    ``boxWidth`` by which its Times-Roman width at ``fontSize`` exceeds the
    box.  A non-positive ``boxWidth`` cannot be wrapped against, so each
    entry then counts as exactly one row instead of looping forever.
    """
    if boxWidth <= 0:
        return len(lines)
    rows = 0
    for text in lines:
        rows += 1
        remaining = fitz.getTextlength(text, "Times-Roman", fontSize)
        while remaining > boxWidth:
            rows += 1
            remaining -= boxWidth
    return rows


def createTextbox(textfile, textboxRect, page, path):
    """Draw a filled rectangle on ``page`` and write the text of ``textfile`` into it.

    Blank lines of the file are dropped.  Starting at font size 12 (assumed
    capacity: 10 rows), the font size is reduced in steps of 2 while the
    estimated row count exceeds the capacity; the capacity grows by the
    factor returned by ``calculatePercentIncrease`` for each step.

    Args:
        textfile: Path of the text file to read.
        textboxRect: Rectangle (with a ``width`` attribute) to draw and fill.
        page: Page object that receives the shape and the text.
        path: Unused; kept for interface compatibility.

    Returns:
        None.
    """
    fontSize = 12
    fontStep = 2

    # Keep only the non-blank lines (line endings are preserved).
    with open(textfile) as input_txt:
        lineArray = [line for line in input_txt if not line.isspace()]
    lineString = "".join(lineArray)

    linecounter = _countWrappedLines(lineArray, fontSize, textboxRect.width)

    # At font size 12 the textbox is assumed to hold up to 10 rows of text.
    lineCapacity = 10
    # Never shrink to a font size of zero or below: the original loop had no
    # lower bound and would end up calling calculatePercentIncrease(2, 0).
    while linecounter > lineCapacity and fontSize - fontStep > 0:
        # Capacity grows by the relative increase gained from the decrement.
        percentIncrease = calculatePercentIncrease(fontSize, fontSize - fontStep)
        lineCapacity = lineCapacity + (lineCapacity * percentIncrease)
        fontSize = fontSize - fontStep
        # A smaller font may wrap less, so the row estimate is recomputed.
        linecounter = _countWrappedLines(lineArray, fontSize, textboxRect.width)

    # Draw the textbox background, then place the text on top of it.
    textboxShape = page.new_shape()
    textboxShape.draw_rect(textboxRect)
    textboxShape.finish(color=getColor("dodgerblue4"), fill_opacity=1, closePath=False)
    textboxShape.commit()
    page.insertTextbox(textboxRect, lineString, fontsize=fontSize,
                       fontname="Times-Roman", align=0)
    return
def insert_fields(self, fields, form_type, PdfProcess):
    """Write each field's upper-cased value next to its anchor text.

    For every field, ``PdfProcess.find_text(document_id, index=...)`` gives
    the anchor rectangle; fields whose anchor is not found are skipped.
    ``location`` selects the placement: ``"below"`` the anchor, ``"before"``
    it (shifted left by the text length at font size 9), or otherwise just
    right of it.  The textbox is 200 units high and as wide as the
    per-form-type maximum (0 for unknown form types).
    """
    widths_by_form = {"con29r": 200, "llc1": 200, "con29o": 175}
    max_width = widths_by_form.get(form_type, 0)

    for field in fields:
        document_id = field["document_id"]
        value = field["value"].upper()
        location = field["location"]
        index = field["index"] if "index" in field else 0

        anchor = PdfProcess.find_text(document_id, index=index)
        if anchor is None:
            continue

        if location == "below":
            left, top = anchor.x0, anchor.y0 + anchor.height
        elif location == "before":
            text_length = fitz.getTextlength(value, fontsize=9)
            left, top = anchor.x1 - anchor.width - text_length, anchor.y0 - 1
        else:
            # Default placement: just to the right of the anchor text.
            left, top = anchor.x1 + 2, anchor.y0

        box = fitz.Rect(left, top, left + max_width, top + 200)
        PdfProcess.insert_textbox(value, box, fontsize=9)
def append_TOC(existingDoc, newDoc, entryname, filename, startPoint, path):
    """Append ``newDoc`` to ``existingDoc`` and add its table-of-contents entry.

    The entry text is written on page 0 at ``startPoint``, followed by a row
    of dots up to x = 475 and the target page number.  A "goto" link over
    the entry jumps to the appended page, the page number is printed at the
    bottom of the last page, and the document is saved to
    ``path + filename``.

    Args:
        existingDoc: Document whose first page is the table of contents.
        newDoc: Document to append.
        entryname: Text of the table-of-contents entry.
        filename: File name the document is saved under.
        startPoint: Point at which the entry text starts.
        path: Directory prefix that is concatenated with ``filename``.

    Returns:
        The start point for the next entry (x = 100, 35 units further down).
    """
    # Append the new page(s) to the document holding the table of contents.
    existingDoc.insert_pdf(newDoc)
    TOC_page = existingDoc.load_page(page_id=0)

    # Write the entry text.
    TOC_page.insertText(startPoint, entryname, fontname="helv", fontsize=16, rotate=0)
    x_distance = fitz.getTextlength(entryname, fontname="helv", fontsize=16) + 105
    targetPageNumber = existingDoc.page_count
    entrynumber = " %i" % targetPageNumber

    # Bind dotLocation up front: for a long entry (x_distance >= 475) the
    # loop below never runs and the original code failed with
    # UnboundLocalError when placing the page number.
    dotLocation = fitz.Point(x_distance, startPoint.y)
    while x_distance < 475:
        dotLocation = fitz.Point(x_distance, startPoint.y)
        TOC_page.insertText(dotLocation, ".", fontname="helv", fontsize=16, rotate=0)
        x_distance = x_distance + 5
    # The page number starts at the last dot position (its text begins with
    # a space).
    TOC_page.insertText(dotLocation, entrynumber, fontname="helv", fontsize=16, rotate=0)

    # Link the entry to the appended page.  fitz.Rect is used (instead of a
    # bare Rect) to match the qualified fitz names used everywhere else here.
    linkRect = fitz.Rect(100, startPoint.y - 20, x_distance + 25, startPoint.y + 15)
    TOC_page.insert_link({'kind': 1, 'from': linkRect, 'type': 'goto',
                          'page': targetPageNumber - 1, 'to': fitz.Point(0, 0),
                          'zoom': 0.0})

    # Print the page number at the bottom of the last page of the document.
    insertedPage = existingDoc.load_page(page_id=-1)
    pageNumberPoint = fitz.Point(294, 830)
    insertPageNumber = "%i" % targetPageNumber
    insertedPage.insertText(pageNumberPoint, insertPageNumber, fontname="Times-Roman",
                            fontsize=14, rotate=0)

    # Compute the start point of the next entry and save the PDF.
    newStartPoint = fitz.Point(100, startPoint.y + 35)
    existingDoc.save(path + filename)
    return newStartPoint
def printSegment(self, line):
    """Insert ``line`` at the insert point and advance the point past it.

    The text is drawn on ``self.currentPage`` in the current style's font.
    While ``self.linkDestination`` is truthy, the rectangle covered by the
    text (one module-level ``lineheight`` tall, ending at the insert point's
    y) is appended to ``self.linkRects``.  Finally ``self.insertPoint.x`` is
    moved right by the width of the text.
    """
    sty = style.currentStyle()
    fontName, fontSize = sty.font.name, sty.fontsize
    lineWidth = fitz.getTextlength(line, fontname=fontName, fontsize=fontSize)

    log.debug(f"printSegment: {line}")
    self.currentPage.insertText(
        self.insertPoint, line, fontname=fontName, fontsize=fontSize
    )

    if self.linkDestination:
        # Record the area this segment occupies.
        left = self.insertPoint.x
        bottom = self.insertPoint.y
        covered = fitz.Rect(left, bottom - lineheight, left + lineWidth, bottom)
        self.linkRects.append(covered)

    self.insertPoint.x += lineWidth
def printRight(self, line):
    """Insert ``line`` so that it ends at the right margin.

    ``self.insertPoint.x`` is set to ``width - margin`` (module-level names)
    minus the width of the text in the current style, then the text is
    inserted there on ``self.currentPage``.
    """
    sty = style.currentStyle()
    fontName = sty.font.name
    fontSize = sty.fontsize
    textWidth = fitz.getTextlength(line, fontname=fontName, fontsize=fontSize)

    # Shift the insert point left so the text is flush with the margin.
    self.insertPoint.x = width - margin - textWidth
    self.currentPage.insertText(
        self.insertPoint, line, fontname=fontName, fontsize=fontSize
    )
def softbreak(self, node=None, entering=None):
    """Print a single space if one still fits before the right margin.

    ``node`` and ``entering`` are accepted but not used.  When the space
    does not fit nothing is printed; the original author notes that a hard
    break is about to happen anyway in that case.
    """
    sty = style.currentStyle()
    spaceWidth = fitz.getTextlength(" ", fontname=sty.font.name, fontsize=sty.fontsize)
    remaining = width - margin - self.insertPoint.x

    if not spaceWidth < remaining:
        # No room left on this line: skip the space.
        return
    self.printSegment(" ")
def apply(original, tokens: List[PDFToken], outfile, highlight=False):
    """Build a corrected PDF from the images of ``original`` plus token text.

    For every page of ``original`` a page of the same size is created in a
    new document, and each image listed for that page is inserted across the
    full page rectangle.  Every non-discarded token is then written (gold
    text if set, otherwise the original text) with ``render_mode=3`` into a
    rectangle widened to fit the word.  The result is saved to ``outfile``.

    Both documents are now closed when the function exits, including on
    error; previously they were never closed.

    Args:
        original: Path of the source PDF.
        tokens: Tokens to insert; ``token.ordering[0]`` selects the page.
        outfile: Path the corrected PDF is saved to.
        highlight: If true, draw each token rectangle (red when the textbox
            insertion reported a negative result, blue otherwise).
    """
    pdf_original = fitz.open(str(original))
    try:
        pdf_corrected = fitz.open()
        try:
            PDFTokenizer.log.info('Copying images from original to corrected PDF')
            for page in pdf_original:
                PDFTokenizer.log.debug(f'(page {page.number})')
                newpage = pdf_corrected.newPage(-1, page.rect.width, page.rect.height)
                for image_info in page.getImageList():
                    xref = image_info[0]
                    stream = pdf_original.extractImage(xref)['image']
                    newpage.insertImage(page.rect, stream=stream)

            blue = fitz.utils.getColor('blue')
            red = fitz.utils.getColor('red')

            PDFTokenizer.log.info('Inserting tokens in corrected PDF')
            for token in sorted(tokens, key=lambda x: x.ordering):
                if token.is_discarded:
                    continue
                page = pdf_corrected[token.ordering[0]]
                word = token.gold or token.original

                # Adjust rectangle to fit word: widen it to the measured text
                # width if needed and extend it downwards by one token height.
                fontfactor = 0.70
                size = token.rect.height * fontfactor
                textwidth = fitz.getTextlength(word, fontsize=size)
                rect = fitz.Rect(
                    token.rect.x0,
                    token.rect.y0,
                    max(token.rect.x1, token.rect.x0 + textwidth + 1.0),
                    token.rect.y1 + token.rect.height,
                )
                res = page.insertTextbox(rect, f'{word} ', fontsize=size, render_mode=3)
                failed = res < 0
                if failed:
                    PDFTokenizer.log.warning(
                        f'Token was not inserted properly: {word}\n'
                        f' -- token.rect: {token.rect}\n'
                        f' -- rect: {rect}\n'
                        f' -- font size: {size}\n'
                        f' -- calc.width: {textwidth} rect.width: {rect.width}\n'
                        f' -- rect.height: {rect.height} result: {res}\n')
                if highlight:
                    page.drawRect(rect, color=red if failed else blue)

            PDFTokenizer.log.info(f'Saving corrected PDF to {outfile}')
            pdf_corrected.save(str(outfile))
        finally:
            pdf_corrected.close()
    finally:
        pdf_original.close()
def generateTOC(existingDoc, filename, path):
    """Insert a table-of-contents page at the front of ``existingDoc``.

    The new page gets a horizontally centred title and the page number "1"
    at the bottom; the document is then saved to ``path + filename``.

    The title width is measured with the font and size it is drawn in
    (Times-Roman, 24) and centred on the page's actual width.  The previous
    code measured with the default font and size and subtracted the full
    length from a hard-coded midpoint, which centred the title only roughly.

    Args:
        existingDoc: Document that receives the new first page.
        filename: File name the document is saved under.
        path: Directory prefix that is concatenated with ``filename``.

    Returns:
        ``existingDoc``, now containing the new first page.
    """
    generatedPage = existingDoc.newPage(pno=0)

    # Generate the table of contents title.
    tableOfContentsText = "-- Table of Contents --"
    titleFont = "Times-Roman"
    titleFontSize = 24
    TOC_textLength = fitz.getTextlength(tableOfContentsText, fontname=titleFont,
                                        fontsize=titleFontSize)
    pageWidth = generatedPage.bound().width
    TOC_startPoint_X = (pageWidth - TOC_textLength) / 2
    TOC_startPoint_Y = 85
    TOC_startPoint = fitz.Point(TOC_startPoint_X, TOC_startPoint_Y)
    generatedPage.insertText(TOC_startPoint, tableOfContentsText, fontname=titleFont,
                             color=(0, 0.35, 0.8), fontsize=titleFontSize, rotate=0)

    # Insert the page number at the bottom; the table of contents is page 1.
    pageNumberPoint = fitz.Point(294, 815)
    generatedPage.insertText(pageNumberPoint, "1", fontname="Times-Roman",
                             fontsize=14, rotate=0)

    existingDoc.save(path + filename)
    return existingDoc
def apply(original, tokens: List[HOCRToken], outfile: Path, highlight=False):
    """Create a PDF from the image ``original`` with token text drawn on it.

    A single page the size of the image is created and the image is
    inserted across it.  Every non-discarded token is written (gold text if
    set, otherwise the original text) in red into a rectangle widened to fit
    the word, on the page selected by ``token.page``.  The result is saved
    next to ``outfile`` with a ``.pdf`` suffix.

    The output document is now closed when the function exits, including on
    error; previously it was never closed.

    Args:
        original: Path of the source image.
        tokens: Tokens to insert.
        outfile: Output path; its suffix is replaced by ``.pdf``.
        highlight: Accepted but not used by this implementation.
    """
    pdf = fitz.open()
    try:
        pix = fitz.Pixmap(str(original))
        page = pdf.newPage(-1, width=pix.width, height=pix.height)
        page.insertImage(page.rect, pixmap=pix)

        for token in progressbar.progressbar(tokens):
            if token.is_discarded:
                continue
            page = pdf[token.page]
            word = token.gold or token.original

            # Adjust rectangle to fit word: widen it to the measured text
            # width if needed and extend it downwards by two token heights.
            fontfactor = 0.70
            size = token.rect.height * fontfactor
            textwidth = fitz.getTextlength(word, fontsize=size)
            rect = fitz.Rect(
                token.rect.x0,
                token.rect.y0,
                max(token.rect.x1, token.rect.x0 + textwidth + 1.0),
                token.rect.y1 + token.rect.height * 2,
            )
            res = page.insertTextbox(rect, f'{word} ', fontsize=size, color=(1, 0, 0))
            if res < 0:
                HOCRTokenizer.log.warning(
                    f'Token was not inserted properly: {word}\n'
                    f' -- token.rect: {token.rect}\n'
                    f' -- rect: {rect}\n'
                    f' -- font size: {size}\n'
                    f' -- calc.width: {textwidth} rect.width: {rect.width}\n'
                    f' -- rect.height: {rect.height} result: {res}\n')

        pdf.save(str(outfile.with_suffix('.pdf')))
    finally:
        pdf.close()
def insert_extra_info(extra, exam, test_mode=False, test_folder=None):
    """Draw the student id/name box and a signature box on the first page.

    Arguments:
        extra {dict} -- Must provide the keys ``"id"`` and ``"name"``.
        exam {fitz.Document} -- The exam; only its first page is modified.

    Keyword Arguments:
        test_mode {bool} -- Not used by this function. (default: {False})
        test_folder {Str} -- Not used by this function. (default: {None})

    Raises:
        ValueError: If the id/name text can be encoded neither as Latin-1
            nor as gb2312.
        AssertionError: If either textbox insertion reports a result that
            is not positive.

    Returns:
        fitz.Document -- The same ``exam`` object, with the boxes added to
        its first page.
    """
    first_page = exam[0]
    bounds = first_page.bound()
    page_width = bounds.width
    page_height = first_page.bound().height

    student_id = extra["id"]
    student_name = extra["name"]

    YSHIFT = 0.4  # vertical centre of the box on the page: 0=top, 1=bottom

    txt = "{}\n{}".format(student_id, student_name)
    sign_here = "Please sign here"

    def widest_text(measure):
        # Width of the widest of the three texts, each at its own font size.
        return max(
            measure(student_id, fontsize=36, fontname="Helvetica"),
            measure(student_name, fontsize=36, fontname="Helvetica"),
            measure(sign_here, fontsize=48, fontname="Helvetica"),
        )

    # Half-width of the boxes: widest text plus 10 percent, halved.
    try:
        student_id_width = widest_text(fitz.get_text_length) * 1.1 * 0.5
    except AttributeError:
        # workaround https://github.com/pymupdf/PyMuPDF/issues/1085
        # TODO: drop this code when we depend on PyMuPDF>=1.18.14
        student_id_width = widest_text(fitz.getTextlength) * 1.1 * 0.5
    student_id_height = 36 * 1.3

    # Upper box (id and name), centred at (page_width // 2, page_height * YSHIFT).
    centre_x = page_width // 2
    centre_y = page_height * YSHIFT
    student_id_rect_1 = fitz.Rect(
        centre_x - student_id_width,
        centre_y - student_id_height,
        centre_x + student_id_width,
        centre_y + student_id_height,
    )
    # Lower box (signature prompt), directly below the upper one.
    student_id_rect_2 = fitz.Rect(
        student_id_rect_1.x0,
        student_id_rect_1.y1,
        student_id_rect_1.x1,
        student_id_rect_1.y1 + 48 * 1.3,
    )
    for box in (student_id_rect_1, student_id_rect_2):
        first_page.draw_rect(box, color=[0, 0, 0], fill=[1, 1, 1], width=2)

    # Pick a font able to represent the text.
    # TODO: This could be put into one function
    if is_possible_to_encode_as(txt, "Latin-1"):
        fontname = "Helvetica"
    elif is_possible_to_encode_as(txt, "gb2312"):
        # TODO: Double-check encoding name. Add other CJK (how does Big5
        # vs GB work?). Check printers can handle these or do we need to
        # embed a font? (Adobe Acrobat users need to download something)
        fontname = "china-ss"
    else:
        # TODO: or warn use Helvetica, get "?" chars
        raise ValueError(
            "Don't know how to write name {} into PDF".format(txt))

    # Student id and name go into the upper box.
    insertion_confirmed = first_page.insert_textbox(
        student_id_rect_1,
        txt,
        fontsize=36,
        color=[0, 0, 0],
        fontname=fontname,
        fontfile=None,
        align=1,
    )
    assert (insertion_confirmed > 0), "Text didn't fit: shortname too long? or font issue/bug?"

    # The light grey signature prompt goes into the lower box.
    insertion_confirmed = first_page.insert_textbox(
        student_id_rect_2,
        sign_here,
        fontsize=48,
        color=[0.9, 0.9, 0.9],
        fontname="Helvetica",
        fontfile=None,
        align=1,
    )
    assert (insertion_confirmed > 0), "Text didn't fit: shortname too long? or font issue/bug?"
    return exam
# Report the conversion statistics.
print("PDF conversion results for file '%s':" % (ifn,))
print(out_ctr, "lines read,", total_ctr, "lines written,", nlines, "lines per page.")
print(ofn, "contains", len(doc), "pages.")

# Add a header (the input filename) and a right-aligned footer
# ("page (total)") to every created page, each separated by a thin rule.
hdr_fontsz = 16   # header fontsize
ftr_fontsz = 8    # footer fontsize
blue = (0, 0, 1)  # header / footer color
pspace = 500      # available line width
for page in doc:
    footer = "%i (%i)" % (page.number + 1, len(doc))
    plen_ftr = fitz.getTextlength(footer, fontname="Helvetica", fontsize=ftr_fontsz)

    # Header text with a rule below it.
    header_origin = fitz.Point(50, 50)
    page.insertText(header_origin, ifn, color=blue, fontsize=hdr_fontsz)
    page.drawLine(fitz.Point(50, 60), fitz.Point(50 + pspace, 60),
                  color=blue, width=0.5)

    # Rule above the footer.
    footer_rule_start = fitz.Point(50, height - 33)
    footer_rule_end = fitz.Point(50 + pspace, height - 33)
    page.drawLine(footer_rule_start, footer_rule_end, color=blue, width=0.5)

    # Footer text ends at the right end of the rule.
    footer_origin = fitz.Point(50 + pspace - plen_ftr,
                               height - 33 + ftr_fontsz * 1.2)
    page.insertText(footer_origin, footer, fontsize=ftr_fontsz, color=blue)
def generateDoc(title, data, dataStart, dataEnd, bio1, bio2, bio3, bio4, vid_filename, textfile, filename, pageNum, path):
    """Build a one-page report PDF and return the open document.

    The page contains a centred title, three EEG heatmaps with captions,
    four biometric plots, a textbox filled from ``textfile`` and the image
    ``vid_filename``.  The plot and heatmap image files are generated under
    ``path`` with a ``page<pageNum>_`` prefix before being inserted.

    Each heatmap caption is now centred using its own text width; the
    "Beta Band" and "Theta Band" captions were previously positioned with
    the width of "Alpha Band".

    Args:
        title: Title text drawn at the top of the page.
        data, dataStart, dataEnd: Passed through to the plotting helpers.
        bio1, bio2, bio3, bio4: Biometric names; each is passed to the
            plotter and used in the plot's file name.
        vid_filename: Image file inserted in the video-frame slot.
        textfile: Text file rendered into the textbox.
        filename: Unused; the document is no longer saved here.
        pageNum: Page number used in the generated image file names.
        path: Directory prefix for the generated image files.

    Returns:
        The new document (not saved).
    """
    # Create a new blank PDF with a single page.
    doc = fitz.open()
    generatedPage = doc.newPage()

    font = "Times-Roman"
    fontSize = 24
    titleLength = fitz.getTextlength(title, font, fontSize)

    # Centre the title horizontally whatever its length.
    pageRect = generatedPage.bound()
    pageMidpoint_X = (pageRect.x1 - pageRect.x0) / 2
    titleStartPoint = fitz.Point(pageMidpoint_X - (titleLength / 2), fontSize + 11)
    generatedPage.insertText(titleStartPoint, title, fontname=font, fontsize=fontSize, rotate=0)

    # Generate the four biometric plots (bio1..bio4, in that order).
    bioFilenames = []
    for bio in (bio1, bio2, bio3, bio4):
        bioFilename = path + "page%i_" % pageNum + bio + ".png"
        bioPlotter.plotBiometric(data, dataStart, dataEnd, bio, bioFilename)
        bioFilenames.append(bioFilename)
    bio1_filename, bio2_filename, bio3_filename, bio4_filename = bioFilenames

    # Generate the EEG heatmaps.
    eeg.eeg_viz(data, dataStart, dataEnd, path + "page%i_eeg_" % pageNum)

    fontSize = 14

    # Insert the heatmaps, each with a caption centred 94 units right of the
    # image's left edge.
    heatmaps = (
        (fitz.Rect(10, 50, 198, 238), "page%i_eeg_alpha.png", "Alpha Band"),
        (fitz.Rect(203, 50, 391, 238), "page%i_eeg_beta.png", "Beta Band"),
        (fitz.Rect(396, 50, 585, 238), "page%i_eeg_theta.png", "Theta Band"),
    )
    for heatmapLocation, imageName, caption in heatmaps:
        generatedPage.insertImage(heatmapLocation, filename=path + imageName % pageNum,
                                  keep_proportion=False)
        # Measure this caption itself, not the first one.
        textLength = fitz.getTextlength(caption, font, fontSize)
        startPoint = fitz.Point((heatmapLocation.x0 + 94) - textLength / 2, 240)
        generatedPage.insertText(startPoint, caption, fontname=font, fontsize=fontSize, rotate=0)

    # Insert the biometric plots.  The lower row (bio3, bio4) is inserted
    # before the upper row (bio1, bio2), whose rectangles overlap it by
    # 20 units, exactly as in the original insertion order.
    plotSlots = (
        (fitz.Rect(10, 443, 300, 653), bio3_filename),
        (fitz.Rect(305, 443, 595, 653), bio4_filename),
        (fitz.Rect(10, 253, 300, 463), bio1_filename),
        (fitz.Rect(305, 253, 595, 463), bio2_filename),
    )
    for plotLocation, plotFilename in plotSlots:
        generatedPage.insertImage(plotLocation, filename=plotFilename, keep_proportion=False)

    # Generate the textbox.
    textboxBack_Location = fitz.Rect(250, 650, 585, 815)
    textHandler.createTextbox(textfile, textboxBack_Location, generatedPage, path)

    # Insert the video frame image.
    vidFrame_Location = fitz.Rect(25, 675, 245, 799)
    generatedPage.insertImage(vidFrame_Location, filename=vid_filename, keep_proportion=False)

    # The document is returned unsaved; saving is left to the caller.
    return doc
for block in image_blocks: outpage.insertImage(block["bbox"], stream=block["image"]) print("Inserted an image on page", page.number) for block in text_blocks: # read text blocks shape.drawRect(block["bbox"]) # draw all text on white background, # because images may cover same area for line in block["lines"]: # for each line in the block ... for span in line["spans"]: # for each span in the line ... fontname = get_font(span["font"]) # get replacing fontname fontsize = span["size"] text = span["text"] bbox = fitz.Rect(span["bbox"]) # text rectangle on input text_size = fitz.getTextlength( # measure text length on output text, fontname=fontname, fontsize=fontsize ) # adjust fontsize if text is too long with new the font if text_size > bbox.width: fontsize *= bbox.width / text_size try: shape.insertText( # copy text to output page bbox.bl, # insertion point on output page text, # the text to insert fontsize=fontsize, # fontsize # decide on output font here: the place for sophistication! fontname=fontname, color=pdf_color(span["color"]), ) except ValueError: