def extract_images(doc):
    """Extract the images embedded in a document's PDF file.

    If the given document (proposal.models.Document) has been copied to
    the local filesystem, run ``pdfimages`` on it, writing the results to
    an ``images`` subdirectory next to the document file
    (docs/<doc id>/images). Files in that directory that fail
    ``is_interesting`` are deleted; an Image record is created for each of
    the others.

    :param doc: proposal.models.Document object with a corresponding PDF
        file that has been copied to the local filesystem
    :returns: A list of Image objects. The list is empty when the document
        has no local file, the file is missing, or ``pdfimages`` exits
        with a non-zero status.
    """
    # TODO: Break this into smaller subtasks
    docfile = doc.document
    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return []

    path = docfile.path
    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s",
                     doc.pk, path)
        return []

    images_dir = os.path.join(os.path.dirname(path), "images")
    os.makedirs(images_dir, exist_ok=True)
    # Passed to pdfimages as its final argument (the output file prefix).
    images_pattern = os.path.join(images_dir, "image")

    logger.info("Extracting images to '%s'", images_dir)
    status = subprocess.call(
        ["pdfimages", "-png", "-tiff", "-j", "-jp2", path, images_pattern])

    images = []
    if status:
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning("pdfimages failed with exit code %i", status)
        return images

    # Every file currently in the directory is considered, not only the
    # ones written by this particular pdfimages run.
    for image_name in os.listdir(images_dir):
        image_path = os.path.join(images_dir, image_name)

        if not is_interesting(image_path):
            # Delete 'uninteresting' images
            os.unlink(image_path)
            continue

        image = Image(proposal=doc.proposal, document=doc)
        image.image = image_path
        # NOTE(review): the image is added to the result before it is
        # saved, so an entry whose save hit IntegrityError is still
        # returned (unsaved) -- confirm callers expect that.
        images.append(image)

        try:
            image.save()
        except IntegrityError:
            # This can occur if the image has already been fetched
            # and associated with the Proposal.
            pass

    return images
def extract_images(doc_id):
    """Extract the images embedded in a document's PDF file.

    Looks up the Document with primary key ``doc_id``. If its file has
    been copied to the local filesystem, the images are extracted with
    ``pdf.extract_images`` into an ``images`` subdirectory next to the
    document file (docs/<doc id>/images). Extracted files that fail
    ``is_interesting`` are deleted; an Image record is saved for each of
    the others.

    :param doc_id: primary key of a proposal.models.Document with a
        corresponding PDF file that has been copied to the local
        filesystem
    :returns: A list with the primary keys of the Image objects saved by
        this call. The list is empty when the document has no local file
        or the file is missing.
    """
    doc = Document.objects.get(pk=doc_id)
    docfile = doc.document
    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return []

    path = docfile.path
    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s",
                     doc.pk, path)
        return []

    images_dir = os.path.join(os.path.dirname(path), "images")
    os.makedirs(images_dir, exist_ok=True)

    logger.info("Extracting images to '%s'", images_dir)
    image_paths = pdf.extract_images(path, dirname=images_dir)

    images = []
    for image_name in image_paths:
        # os.path.join returns image_name unchanged when it is already an
        # absolute path, so this works for bare names and full paths.
        image_path = os.path.join(images_dir, image_name)

        if not is_interesting(image_path):
            # Delete 'uninteresting' images
            os.unlink(image_path)
            continue

        image = Image(proposal=doc.proposal, document=doc)
        image.image = image_path

        try:
            image.save()
        except IntegrityError:
            # This can occur if the image has already been fetched
            # and associated with the Proposal. The unsaved object has no
            # primary key, so it is left out of the result instead of
            # contributing a None entry.
            continue

        images.append(image)

    logger.info("Extracted %i image(s) from %s.", len(images), path)
    return [image.pk for image in images]
def extract_content(doc, encoding="ISO-8859-9"):
    """Extract the images and the text of a document's PDF file.

    If the given document (proposal.models.Document) has been copied to
    the local filesystem, extract its images to a subdirectory of the
    document's directory (docs/<doc id>/images) with ``pdfimages``, then
    extract the text content to docs/<doc id>/text.txt with ``pdftotext``.

    Text extraction is attempted even when image extraction fails. On
    success, ``doc.fulltext`` is set to the path of the text file,
    ``doc.encoding`` to ``encoding``, and the document is saved.

    :param doc: proposal.models.Document object with a corresponding PDF
        file that has been copied to the local filesystem
    :param encoding: value passed to ``pdftotext -enc`` and recorded on
        the document
    :returns: None
    """
    docfile = doc.document
    logger = extract_content.get_logger()

    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return

    try:
        path = docfile.path
    except Exception:
        # Fall back to the stored name when no filesystem path is
        # available. Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        path = docfile.name

    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s",
                     doc.pk, path)
        return

    doc_dir = os.path.dirname(path)
    images_dir = os.path.join(doc_dir, "images")
    os.makedirs(images_dir, exist_ok=True)
    # Passed to pdfimages as its final argument (the output file prefix).
    images_pattern = os.path.join(images_dir, "image")

    logger.info("Extracting images to '%s'", images_dir)
    status = subprocess.call(
        ["pdfimages", "-png", "-tiff", "-j", "-jp2", path, images_pattern])

    if status:
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning("pdfimages failed with exit code %i", status)
    else:
        # Do stuff with the images in the directory
        for image_name in os.listdir(images_dir):
            image_path = os.path.join(images_dir, image_name)

            if not images.is_interesting(image_path):
                # Delete 'uninteresting' images
                os.unlink(image_path)
                continue

            image = Image(proposal=doc.proposal, document=doc)
            image.image = image_path

            try:
                image.save()
            except IntegrityError:
                # This can occur if the image has already been fetched
                # and associated with the Proposal.
                pass

    # Could consider storing the full extracted text of the document in
    # the database and indexing it, rather than extracting it to a file.
    text_path = os.path.join(doc_dir, "text.txt")

    # TODO: It may be practical to sniff pdfinfo, determine the PDF
    # producer used, and make a best guess at encoding based on that
    # information. We should be able to get away with using ISO-8859-9
    # for now.
    status = subprocess.call(["pdftotext", "-enc", encoding, path, text_path])

    if status:
        logger.error("Failed to extract text from %s", path)
        return

    # Do stuff with the contents of the file.
    # Possibly perform some rudimentary scraping?
    doc.fulltext = text_path
    doc.encoding = encoding
    doc.save()
def extract_images(doc):
    """Extract the images embedded in a document's PDF file.

    If the given document (proposal.models.Document) has been copied to
    the local filesystem, run ``pdfimages`` on it, writing the results to
    an ``images`` subdirectory next to the document file
    (docs/<doc id>/images). Files in that directory that fail
    ``images.is_interesting`` are deleted; an Image record is created for
    each of the others.

    :param doc: proposal.models.Document object with a corresponding PDF
        file that has been copied to the local filesystem
    :returns: A list of Image objects. The list is empty when the document
        has no local file, the file is missing, or ``pdfimages`` exits
        with a non-zero status.
    """
    # TODO: Break this into smaller subtasks
    docfile = doc.document
    logger = extract_images.get_logger()

    if not docfile:
        logger.error("Document has not been copied to the local filesystem.")
        return []

    path = docfile.path
    if not os.path.exists(path):
        logger.error("Document %s is not where it says it is: %s",
                     doc.pk, path)
        return []

    images_dir = os.path.join(os.path.dirname(path), "images")
    os.makedirs(images_dir, exist_ok=True)
    # Passed to pdfimages as its final argument (the output file prefix).
    images_pattern = os.path.join(images_dir, "image")

    logger.info("Extracting images to '%s'", images_dir)
    status = subprocess.call(
        ["pdfimages", "-png", "-tiff", "-j", "-jp2", path, images_pattern])

    # The result list must not be called ``images``: a local of that name
    # would shadow the ``images`` name used for ``images.is_interesting``
    # below and turn that call into an AttributeError on a list.
    extracted = []
    if status:
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning("pdfimages failed with exit code %i", status)
        return extracted

    # Every file currently in the directory is considered, not only the
    # ones written by this particular pdfimages run.
    for image_name in os.listdir(images_dir):
        image_path = os.path.join(images_dir, image_name)

        if not images.is_interesting(image_path):
            # Delete 'uninteresting' images
            os.unlink(image_path)
            continue

        image = Image(proposal=doc.proposal, document=doc)
        image.image = image_path
        # NOTE(review): the image is added to the result before it is
        # saved, so an entry whose save hit IntegrityError is still
        # returned (unsaved) -- confirm callers expect that.
        extracted.append(image)

        try:
            image.save()
        except IntegrityError:
            # This can occur if the image has already been fetched
            # and associated with the Proposal.
            pass

    return extracted