def analyze_pdf(path):
    """
    Extract basic bibliographic metadata from a PDF file.

    Reads the document-information dictionary of the PDF at ``path``,
    strips every non-letter character from its keys (so a key such as
    ``'/Author'`` is looked up as ``'Author'``) and picks out the author,
    the title and the creation date. The creation date is passed to
    ``get_year_from_date_string`` to obtain the year.

    This is a best-effort helper: any ordinary error (missing or unreadable
    file, malformed PDF, absent metadata, ...) yields ``None`` instead of
    an exception. Interpreter-level signals such as ``KeyboardInterrupt``
    and ``SystemExit`` are deliberately NOT swallowed.

    :param path: File path of the input pdf
    :return: A tuple of the form (author, title, year, path, 'pdf'), or
             None when the metadata could not be read
    """
    try:
        with open(path, 'rb') as f:
            raw_info = PdfFileReader(f).getDocumentInfo()
            # Normalise the keys while the file is still open, in case the
            # reader resolves any values lazily from the stream.
            metadata = {
                re.sub(r'[^A-Za-z]+', '', key): value
                for key, value in raw_info.items()
            }
        author = metadata.get('Author')
        title = metadata.get('Title')
        date = metadata.get('CreationDate')
        year = get_year_from_date_string(date)
        return (author, title, year, path, 'pdf')
    except Exception:
        # Best effort: callers treat None as "no usable metadata".
        # `Exception` (rather than a bare `except:`) lets Ctrl-C and
        # sys.exit() propagate instead of being turned into None.
        return None
def _get_pdf_page_dimensions(pdf_file_path, page_no): """ Gets the height and width of the pdf at the given page no after the rotation is applied. The default height and width are swapped when the pdf has a rotation of 90/270(vertical). :param pdf_file_path: File path of the input pdf :param page_no: Page no whose dimensions are returned :return: A tuple of the form (width, height) """ with open(pdf_file_path, 'rb') as file: pdf_file = PdfFileReader(file).getPage(page_no) media_box = pdf_file.mediaBox rotation = pdf_file.get('/Rotate') if utils.is_horizontal_orientation(rotation): w, h = media_box.getWidth(), media_box.getHeight() else: w, h = media_box.getHeight(), media_box.getWidth() return w, h