Example #1
0
def convert_pdf(filename, output_path, resolution=150):
    """ Convert a PDF into images, one PNG file per page.

        The first page is written as ``{pdf_filename}.png``; every
        following page is written as ``{pdf_filename}-{page_index}.png``
        where ``page_index`` is the zero-based position of the page (the
        second page is therefore ``{pdf_filename}-1.png``).

        The function removes the alpha channel from each page and
        replaces it with a white background.

        :param filename: path of the PDF file to read.
        :param output_path: directory the PNG files are written into.
        :param resolution: resolution handed to Wand when reading the PDF.
    """
    base_name = os.path.splitext(os.path.basename(filename))[0]

    # Open the document in a ``with`` block so that its ImageMagick
    # resources are released even when a page fails to convert.
    with Image(filename=filename, resolution=resolution) as all_pages:
        for i, page in enumerate(all_pages.sequence):
            with Image(page) as img:
                img.format = 'png'
                # Background first: removing the alpha channel flattens the
                # page onto whatever background colour is set at that time.
                img.background_color = Color('white')
                img.alpha_channel = 'remove'

                if i == 0:
                    image_filename = '{}.png'.format(base_name)
                else:
                    image_filename = '{}-{}.png'.format(base_name, i)

                img.save(filename=os.path.join(output_path, image_filename))
Example #2
0
    def image_to_jpeg_wand(self, jpeg, preview_dims=None):
        '''
        Build a resized JPEG preview of an image using wand.

        Meant for jpeg, gif and bmp inputs.

        :param jpeg: file object handed to wand through ``file=``.
        :param preview_dims: object exposing ``width`` and ``height``;
            both attributes are read unconditionally, so the ``None``
            default cannot actually be used.
        :return: a ``BytesIO`` rewound to offset 0 holding the JPEG data.
        '''
        logging.info('Converting image to jpeg using wand')

        with WImage(file=jpeg, background=Color('white')) as image:
            # Requested preview size, normalised into an ImgDims.
            target_dims = ImgDims(width=preview_dims.width,
                                  height=preview_dims.height)

            # Current size of the decoded image.
            source_dims = ImgDims(width=image.size[0], height=image.size[1])

            resize_dim = compute_resize_dims(dims_in=source_dims,
                                             dims_out=target_dims)
            image.resize(resize_dim.width, resize_dim.height)

            output = BytesIO()
            output.write(image.make_blob('jpeg'))
            output.seek(0, 0)
            return output
Example #3
0
 # ----get_model----
 # Build the network without pretrained weights, load the checkpoint on the
 # CPU and switch to inference mode.
 net = PixelAnchornet(pretrained=False)  # .to(device)
 model_checkpoint = torch.load(pre_model_weight,
                               map_location=torch.device('cpu'))
 net.load_state_dict(model_checkpoint)
 net.eval()
 for i, sample in enumerate(sample_list):
     images = []
     img_path = os.path.join(sample_dir, sample)
     if sample[-4:] == '.pdf':
         # PDFs are rendered page by page; every page becomes its own image.
         with Image(filename=img_path, resolution=(200, 200)) as imgs:
             # A dedicated page index keeps the sample index ``i`` intact;
             # ``i`` is used below to build the output file name.
             for page_idx, page in enumerate(imgs.sequence):
                 # Close each per-page Wand image once it has been copied
                 # into a numpy array.
                 with Image(image=page) as page_img:
                     page_img.alpha_channel = False
                     page_img.background_color = Color(
                         'white')  # Set the background color
                     img = PI.fromarray(np.array(page_img), 'RGB')
                 im_name = sample[:-4] + '_p' + str(page_idx)
                 images.append((img, im_name))
     else:
         im_name = sample[:-4]
         img = PI.open(img_path).convert('RGB')
         images = [(img, im_name)]
     for img, im_name in images:
         print('-----------------传入图片的img.size:', img.size)
         pixle_anchor_detect(img,
                             net,
                             NMS_choice='ssd',
                             img_save_pths=os.path.join(
                                 image_save_path, f'{im_name}_{i}.jpg'),
                             img_size=2048)
Example #4
0
def pattern_tile(import_png, format, dir_path):
    """Cut a full-size pattern PNG into printable tiles and export PDFs.

    The function works in two passes:

    1. ``import_png`` is opened, optionally overlaid with the cut-guide
       image of the chosen format, and cropped into tiles of the format's
       size at 300 ppi. Tiles are saved as
       ``dir_path + <import_png without extension> + "-NN.png"``.
    2. Every file matching ``dir_path + '*-*.png'`` gets a white 150 px
       border, registration lines, a footer text and a page number, and
       is then saved next to the PNG as a PDF.

    :param import_png: path of the PNG to tile.
    :param format: ``'letter'``, ``'tabloid'`` or ``'a0'`` (any case).
    :param dir_path: string prefix prepended to the tile names and to the
        glob pattern. It is concatenated as-is, so it has to end with a
        path separator to denote a directory.

    Prints a message and exits the interpreter when ``format`` is not one
    of the accepted values.
    """
    # formats accepted
    format = format.lower()  # make case-insensitive

    if format == 'letter':
        tile_width = 7.5
        tile_height = 10
        guides_png = 'guides/cut_guides_letter.png'
    elif format == 'tabloid':
        tile_width = 15
        tile_height = 10
        guides_png = 'guides/cut_guides_tabloid.png'
    elif format == 'a0':
        tile_width = 45
        tile_height = 30
        guides_png = None
    else:
        print("Format must be letter, tabloid, or a0 (case-insensitive).")
        exit()

    # constants
    ppi = 300
    border_px = 150  # 0.5 in = 150/300
    ppi_width = round(tile_width * ppi)
    ppi_height = round(tile_height * ppi)

    #### Full size Exported PNG operations ####

    # open the import_png
    with Image(filename=import_png) as img:
        img.units = 'pixelsperinch'
        img.resolution = ppi
        print(import_png + ' is ' + str(img.width / ppi) + ' by ' +
              str(img.height / ppi) + ' at ' + str(img.resolution[0]) + ' ' +
              img.units)

        #### Alignment Guides ####
        if guides_png:
            with Image(filename=guides_png) as guides:
                img.composite(guides, left=0, top=0)

        #### Tile Img ####
        # Walk the image row by row, left to right, numbering tiles from 1.
        tile_count = 0
        for top in range(0, img.height, ppi_height):
            for left in range(0, img.width, ppi_width):
                tile_count += 1
                with img.clone() as cloned:
                    cloned.crop(left=left,
                                top=top,
                                width=ppi_width,
                                height=ppi_height)
                    tile_id = dir_path + os.path.splitext(
                        import_png)[0] + "-" + str(tile_count).zfill(
                            2) + ".png"
                    cloned.save(filename=tile_id)
        print("Tiled", tile_count, format, "pattern sheets")

    #### Single Tile Operations ####

    # Collect the tiled pngs. The list is always bound (possibly empty) so
    # the loop below is simply skipped when nothing matches.
    png_list = sorted(glob.glob(dir_path + '*-*.png'))

    for pg_num, png_path in enumerate(png_list, start=1):

        with Image(filename=png_path) as img:
            img.units = 'pixelsperinch'
            img.resolution = ppi

            # remove alpha channel and fill white
            if img.alpha_channel:
                print("Removing alpha channel from ", png_path)
                # Background first: removing the alpha channel flattens the
                # image onto the background colour set at that moment.
                img.background_color = Color('white')
                img.alpha_channel = 'remove'

            # Add border 0.5 in = 150/300
            img.border('white', border_px, border_px)

            # Add lines and texts
            with Drawing() as draw:
                # cut registration lines
                draw.push()
                draw.stroke_color = Color('grey50')
                draw.stroke_width = 3
                draw.fill_opacity = 0
                draw.path_start()
                draw.path_move(to=(img.width, border_px))  # top
                draw.path_horizontal_line(1)
                draw.path_move(to=(border_px, img.height))  # left
                draw.path_vertical_line(1)
                draw.path_move(to=(img.width,
                                   img.height - border_px))  # bottom
                draw.path_horizontal_line(1)
                draw.path_move(to=(img.width - border_px,
                                   img.height))  # right
                draw.path_vertical_line(1)
                draw.path_close()
                draw.path_finish()
                draw.pop()

                # texts
                draw.push()
                draw.font_size = 32
                draw.text_alignment = 'center'
                draw.fill_color = Color('grey50')
                draw.font_family = "Arial"
                draw.text(int(img.width / 2), int(img.height - 100),
                          "2021 © LearnMYOG.com")
                draw.text(int(img.width - 100), int(img.height - 100),
                          str(pg_num))
                draw(img)
                draw.pop()

            # Convert to PDF
            with img.convert('pdf') as converted:
                fileout = os.path.splitext(png_path)[0] + '.pdf'
                converted.save(filename=fileout)
                print("Converted", png_path, "from", img.format, "to",
                      fileout, converted.format)
Example #5
0
def rotate(img, max_deg=360):
    """Rotate ``img`` in place by a random angle, then crop around its centre.

    The angle is drawn uniformly between 0 and ``max_deg`` and truncated to
    an integer. Areas uncovered by the rotation are filled with
    ``rgb(132,132,132)``. The rotated image is then cropped, centred, to
    60 % of its width and 60 % of its height.
    """
    angle = int(np.random.uniform(0, max_deg))
    fill = Color('rgb(132,132,132)')
    img.rotate(angle, background=fill)

    # Size is read after the rotation, so the crop is relative to the
    # rotated canvas.
    width, height = img.size
    crop_width = int(0.6 * width)
    crop_height = int(0.6 * height)
    img.crop(width=crop_width, height=crop_height, gravity='center')
Example #6
0
def find_names(filename, rot_fl=0, blur=0):
    """Render a document to per-page images and preprocess them.

    Every page of ``filename`` is rendered at 300 dpi, flattened onto a
    white background and encoded as JPEG. Each page is then decoded,
    converted to grayscale, passed through ``preprocess_for_image`` and
    rotated.

    The OCR / NLP analysis that used to fill the search terms is disabled
    in this version, so the returned ``search_terms`` list is always empty.

    :param filename: path of the document to read with Wand.
    :param rot_fl: when ``1`` the rotation angle is estimated with
        ``rotation_spacing``; otherwise the pages are left unrotated.
    :param blur: forwarded to ``preprocess_for_image``.
    :return: ``(search_terms, conv_img_list, gray_list)`` where the two
        image lists hold one rotated array per page (colour and gray).
    """
    page_blobs = []
    conv_img_list = []
    gray_list = []
    search_terms = []

    with Image(filename=filename, resolution=300) as source:
        source.compression_quality = 99
        # convert() returns a new image; give it its own ``with`` block so
        # it is released even when processing a page raises.
        with source.convert('jpeg') as image_jpeg:
            for page in image_jpeg.sequence:
                with Image(image=page) as img_page:
                    img_page.background_color = Color('white')
                    img_page.alpha_channel = 'remove'
                    page_blobs.append(img_page.make_blob('jpeg'))

    for blob in page_blobs:
        conv_img = PI.open(io.BytesIO(blob))
        conv_img = np.asarray(conv_img, dtype=np.uint8)

        if len(conv_img.shape) == 3:
            # NOTE(review): the array comes from PIL, which presumably
            # yields RGB channel order, while the conversion code below
            # assumes BGR - confirm whether COLOR_RGB2GRAY was intended.
            gray = cv2.cvtColor(conv_img, cv2.COLOR_BGR2GRAY)
        else:
            # Already single-channel.
            gray = conv_img
        gray = preprocess_for_image(gray, blur)

        # Rotate images
        if rot_fl == 1:
            rot = rotation_spacing(gray)
        else:
            # 90.0 makes the rotation angle below (90 - rot) equal to zero.
            rot = 90.0

        rows, cols = gray.shape
        M = cv2.getRotationMatrix2D((cols / 2, rows / 2), 90 - rot, 1)
        gray = cv2.warpAffine(gray, M, (cols, rows))
        conv_img = cv2.warpAffine(conv_img, M, (cols, rows))

        conv_img_list.append(conv_img)
        gray_list.append(gray)

    print('SEARCH TERMS:', search_terms)

    return search_terms, conv_img_list, gray_list
Example #7
0
    def create_snapshot(self):
        """Generate the PNG snapshot of the attached file, when possible.

        Nothing is done when the document has no attachment path or when
        the attached file does not exist on disk. Otherwise the mimetype is
        re-guessed from the file name and:

        - image documents (mimetype ``image/*`` or type IMAGE / PHOTO) are
          handed to ``helpers.generate_snapshot``; any dict it returns is
          merged into ``self.data``;
        - PDF documents are rendered with Wand at 150 dpi, using the page
          given by ``thumbnail_page`` in the JSON ``self.contents``
          (page 0 when absent or unreadable), flattened on white.

        On success ``self.snapshot`` is set and ``self._dirty`` is raised;
        the method itself does not save the model. Errors while rendering
        a PDF are logged and printed, not raised.
        """
        logger.debug('document {pk:%s, mimetype:%s, type:%s} init snapshot' %
                     (self.pk, self.mimetype, self.type))

        if not self.attachment or not getattr(self.attachment, 'path', None):
            logger.debug('document {pk:%s} snapshot cannot be generated.' %
                         self.pk)
            return

        if not os.path.exists(self.attachment.path):
            logger.debug(
                'document {pk:%s} snapshot cannot be generated, attached file does not exist.'
                % self.pk)
            return

        # reconsider mimetype
        mimetype, encoding = mimetypes.guess_type(self.attachment.path,
                                                  strict=True)
        if mimetype:
            self.mimetype = mimetype

        logger.debug(
            'document {pk:%s, mimetype:%s, type:%s} snapshot can be generated'
            % (self.pk, self.mimetype, self.type))

        filename = '%s.snapshot.png' % self.short_url
        outfile = os.path.join(settings.MEDIA_ROOT,
                               snapshot_attachment_file_name(self, filename))

        # generate dir if there is none
        snapshot_dir = os.path.dirname(outfile)
        if not os.path.isdir(snapshot_dir):
            logger.debug(
                'document {pk:%s, mimetype:%s, type:%s} creating folder for snapshot'
                % (self.pk, self.mimetype, self.type))
            try:
                os.makedirs(snapshot_dir)
            except OSError as e:
                # Best effort, as before: a folder created concurrently is
                # fine, and any real problem surfaces when the snapshot is
                # written below.
                logger.debug(
                    'document {pk:%s} snapshot folder not created: %s' %
                    (self.pk, e))

        # A missing mimetype must not crash the type dispatch below.
        main_type = (self.mimetype or '').split('/')[0]

        # generate thumbnail
        if (main_type == 'image' or self.type == Document.IMAGE
                or self.type == Document.PHOTO):
            logger.debug(
                'document {pk:%s, mimetype:%s, type:%s} generating IMAGE thumbnail...'
                % (self.pk, self.mimetype, self.type))

            # generate snapshot
            d = helpers.generate_snapshot(
                filename=self.attachment.path,
                output=outfile,
                width=settings.MILLER_SNAPSHOT_WIDTH,
                height=settings.MILLER_SNAPSHOT_HEIGHT)
            if d:
                self.data.update(d)

            self.snapshot = snapshot_attachment_file_name(self, filename)
            self._dirty = True
            logger.debug(
                'document {pk:%s, mimetype:%s, type:%s} IMAGE thumbnail done.'
                % (self.pk, self.mimetype, self.type))

        elif self.mimetype == 'application/pdf':
            logger.debug(
                'document {pk:%s, mimetype:%s, type:%s} generating PDF snapshot...'
                % (self.pk, self.mimetype, self.type))

            pdffile = self.attachment.path
            # NOTE(review): the reader object is never used afterwards;
            # presumably kept as an early validity check of the PDF, since
            # it runs outside the try block below - confirm before removing.
            pdf_im = PyPDF2.PdfFileReader(pdffile)

            # Page to render: 'thumbnail_page' from the JSON contents,
            # falling back to the first page when missing or unreadable.
            page = 0
            try:
                metadata = json.loads(self.contents)
                if 'thumbnail_page' in metadata:
                    page = int(metadata['thumbnail_page'])
            except Exception as e:
                logger.exception(e)

            try:
                # Render the selected page as a PNG on a white background.
                with Image(filename='%s[%s]' % (pdffile, page),
                           resolution=150) as img:
                    img.format = 'png'
                    img.background_color = Color(
                        'white')  # Set white background.
                    img.alpha_channel = 'remove'
                    img.save(filename=outfile)

                self.snapshot = snapshot_attachment_file_name(self, filename)
                self._dirty = True

            except Exception as e:
                logger.exception(e)
                # Single pre-formatted argument: prints the same text under
                # Python 2 and Python 3.
                print('could not save snapshot of the required resource %s' %
                      (self.pk, ))
            else:
                logger.debug(
                    'snapshot generated for document {pk:%s}, page %s' %
                    (self.pk, page))
Example #8
0
def process_images(identifier,
                   downloaded_image_list,
                   post_gif,
                   use_wand=True,
                   use_imageio=True):
    """Convert/resize all images to png and optionally build a GIF.

    Steps (the Wand steps only run when ``use_wand`` is true):

    1. The first frame/page of every downloaded file is flattened on
       white, trimmed and saved as png under a new name
       (``<name>_.<ext>``, with ``.pdf`` replaced by ``.png``). Files that
       cannot be converted are reported on stdout and skipped.
    2. Images larger than the (capped) average dimension are scaled down
       in place.
    3. When ``post_gif`` is true, every image is centred on a white canvas
       sized to the largest image plus a 3 % margin and the canvases are
       combined into ``{identifier}/{identifier}.gif``, dropping trailing
       images until the file is no larger than ``MAX_IMG_SIZE``.

    :param identifier: name used for the GIF directory and file.
    :param downloaded_image_list: paths of the files to process.
    :param post_gif: whether to build the animated GIF.
    :param use_wand: process the images with Wand; when false no image is
        converted and the png list stays empty.
    :param use_imageio: build the GIF with imageio instead of the
        ``convert`` command line.
    :return: sorted list of converted png paths, or a one-element list
        holding the GIF path when ``post_gif`` is true.
    """
    logger.info("Processing %d images." % len(downloaded_image_list))
    logger.debug(
        "process_images(): identifier = {}, downloaded_image_list = {},\
                  use_wand = {}, use_imageio = {}".format(
            identifier, downloaded_image_list, use_wand, use_imageio))
    image_list = []
    images_for_gif = []
    new_image_format = "png"
    # also calculate average dimensions to scale down very large images
    dim_list_x = []
    dim_list_y = []

    # first loop: convert every input and record its dimensions
    for image_file in downloaded_image_list:
        if use_wand:
            # , resolution=300
            try:
                with Image(filename="{}[0]".format(image_file)) as img:
                    # process pdfs here only, others seem to be far too big
                    img.format = new_image_format
                    img.background_color = Color("white")
                    img.compression_quality = 85  # was 75
                    img.alpha_channel = "remove"
                    img.trim(fuzz=0.01)
                    img.reset_coords()  # equivalent of repage
                    # give the file a different name
                    filesplit = image_file.rsplit(".", 1)
                    filename = filesplit[0] + "_." + filesplit[1]
                    if filename.endswith("pdf"):
                        filename = filename.replace(".pdf",
                                                    ".%s" % new_image_format)
                    img.save(filename=filename)
                    # Register the file only once it has really been
                    # written; the later loops reopen every listed file.
                    image_list.append(filename)
                    dim_list_x.append(img.size[0])
                    dim_list_y.append(img.size[1])
            except CorruptImageError as corrupt_except:
                print(corrupt_except)
                print("Ignoring", image_file)
            except Exception as general_exception:  # pylint: disable=broad-except
                print(general_exception)
    # rescale images
    average_dims = (
        float(sum(dim_list_x)) / max(len(dim_list_x), 1),
        float(sum(dim_list_y)) / max(len(dim_list_y), 1),
    )
    # Largest of the two average dimensions, each capped at MAX_IMG_DIM.
    dim_xy = int(
        max(min(MAX_IMG_DIM, average_dims[0]), min(MAX_IMG_DIM,
                                                   average_dims[1])))

    # Largest width/height after scaling; needed for the gif canvas.
    max_dim = [0, 0]
    # scale individual images
    for image_file in image_list:
        if use_wand:
            filename = image_file
            with Image(filename=filename) as img:
                if (img.size[0] > dim_xy) or (img.size[1] > dim_xy):
                    scale_factor = dim_xy / float(max(img.size[0],
                                                      img.size[1]))
                    img.resize(int(img.size[0] * scale_factor),
                               int(img.size[1] * scale_factor))
                for i, _ in enumerate(max_dim):
                    if img.size[i] > max_dim[i]:
                        max_dim[i] = img.size[i]
                img.save(filename=filename)

    # bring list in order again
    image_list = sorted(image_list)
    if post_gif:
        gif_path = "{id}/{id}.gif".format(id=identifier)
        # now we need another loop to create the gif canvas
        for image_file in image_list:
            with Image(filename=image_file) as foreground:
                foreground.format = "gif"
                image_file = image_file.replace(".%s" % new_image_format,
                                                ".gif")
                add_margin = 1.03
                with Image(
                        width=int(max_dim[0] * add_margin),
                        height=int(max_dim[1] * add_margin),
                        background=Color("white"),
                ) as out:
                    # Centre the image on the canvas.
                    left = int(
                        (max_dim[0] * add_margin - foreground.size[0]) / 2)
                    top = int(
                        (max_dim[1] * add_margin - foreground.size[1]) / 2)
                    out.composite(foreground, left=left, top=top)
                    out.save(filename=image_file)
            if use_imageio:
                images_for_gif.append(imageio.imread(image_file))
            else:
                images_for_gif.append(image_file)
        img_size = MAX_IMG_SIZE + 1
        # the gif can only have a certain size, so we loop until it's small enough
        while img_size > MAX_IMG_SIZE:
            if use_imageio:
                imageio.mimsave(
                    gif_path,
                    images_for_gif,
                    format="GIF-FI",
                    duration=2,
                    quantizer="nq",
                    palettesize=256,
                )
            else:
                command = "convert -delay 200 -loop 0 "
                command += " ".join(images_for_gif)
                command += " {id}/{id}.gif".format(id=identifier)
                execute_command(command)
            img_size = os.path.getsize(gif_path)
            if img_size > MAX_IMG_SIZE:
                images_for_gif = images_for_gif[:-1]
                logger.info(
                    "Image to big ({} bytes), dropping last figure, {} images in GIF"
                    .format(img_size, len(images_for_gif)))
        # replace image list by GIF only
        image_list = [gif_path]
    return image_list
Example #9
0
def find_names(filename, rot_fl=0, blur=0):
    """Extract candidate names and identifiers from a scanned document.

    Every page of ``filename`` is rendered at 300 dpi, flattened onto a
    white background, converted to grayscale, preprocessed, rotated and
    run through Tesseract. The recognised text is analysed twice:

    - ``ner_extraction``: tokens tagged PERSON / LOCATION / ORGANIZATION /
      MISC are kept when they are longer than one character, not all
      lower-case, not a known financial term and not seen before; any
      token accepted by ``templates`` is kept as well;
    - ``nlp_sp``: words of PERSON / ORG / GPE / LOC / FAC entities are
      kept when the first analysis tagged them ``O`` on the same page.

    :param filename: path of the document to read with Wand.
    :param rot_fl: when ``1`` the rotation angle is estimated with
        ``rotation_spacing``; otherwise the pages are left unrotated.
    :param blur: forwarded to ``preprocess_for_image``.
    :return: ``(search_terms, conv_img_list, gray_list)``: the collected
        terms plus one rotated colour and one gray array per page.
    """
    req_image = []
    conv_img_list = []
    gray_list = []
    search_terms = []
    search_terms_sp = []
    # Common financial vocabulary that must not be reported as a name.
    # Only membership is tested, so a frozenset is used.
    fin_terms = frozenset([
        'Address', 'Administration', 'Age', 'Agree', 'Agreement', 'Allowance',
        'Analysis', 'Annual', 'Approx', 'Assurance', 'Authority',
        'Authorisation', 'Balanced', 'Bank', 'Benefit', 'Birth', 'Budget',
        'Business', 'Capita', 'Capital', 'Capitalised', 'Cash', 'Centre',
        'Charge', 'Choice', 'Civil', 'Commencement', 'Comparison',
        'Conclusion', 'Condition', 'Confident', 'Confidential', 'Confirmation',
        'Consumer', 'Contribution', 'Control', 'Critical', 'Customs', 'Data',
        'Date', 'Death', 'Deed', 'Definition', 'Department', 'Detail',
        'Direct', 'Disagree', 'Discretionary', 'Discuss', 'Employment',
        'Emerging', 'Entitlement', 'Equity', 'European', 'Fact', 'FAQ',
        'Feature', 'Fee', 'File', 'Final', 'Financial', 'Flexibility',
        'Forename', 'Free', 'Full', 'Fund', 'General', 'Government', 'Growth',
        'Guide', 'Health', 'Income', 'Increase', 'Identified', 'Index',
        'Industry', 'Information', 'Insignificant', 'Insurance', 'Interest',
        'International', 'Investment', 'Investor', 'Legal', 'Life', 'Lifetime',
        'Limited', 'Lower', 'Lump', 'Marital', 'Member', 'Membership',
        'Mobile', 'Money', 'Mutual', 'National', 'Nominated', 'Normal', 'Note',
        'Number', 'Offer', 'Office', 'Ongoing', 'Option', 'Outcome',
        'Partnership', 'Paying', 'Pension', 'Percentage', 'Period', 'Personal',
        'Phone', 'Please', 'Portfolio', 'Post', 'Price', 'Profile',
        'Protection', 'Purchase', 'Rate', 'Reason', 'Recommendation', 'Reduce',
        'Reduction', 'Reference', 'Register', 'Registered', 'Regulation',
        'Regulator', 'Report', 'Research', 'Request', 'Result', 'Retail',
        'Retirement', 'Revenue', 'Risk', 'Salary', 'Saving', 'Scheme',
        'Section', 'Service', 'Solution', 'Spouse', 'Stakeholder', 'State',
        'Statement', 'Statistics', 'Status', 'Subject', 'Sum', 'Summary',
        'Support', 'Surname', 'Tax', 'Taxation', 'Tel', 'Telephone', 'Total',
        'Transfer', 'Trust', 'Trustee', 'Type', 'Typical', 'Typically',
        'Unauthorised', 'Unit', 'Value', 'Version', 'Wealth', 'Yield', 'Your',
        'Yours'
    ])
    ner_labels = ('PERSON', 'LOCATION', 'ORGANIZATION', 'MISC')
    in_labels = ['PERSON', 'ORG', 'GPE', 'LOC', 'FAC']

    # Render every page to a JPEG blob. All Wand images are opened in
    # ``with`` blocks so their resources are released, also on errors.
    with Image(filename=filename, resolution=300) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for page in image_jpeg.sequence:
                with Image(image=page) as img_page:
                    img_page.background_color = Color('white')
                    img_page.alpha_channel = 'remove'
                    req_image.append(img_page.make_blob('jpeg'))

    for blob in req_image:
        conv_img = PI.open(io.BytesIO(blob))
        conv_img = np.asarray(conv_img, dtype=np.uint8)

        if len(conv_img.shape) == 3:
            # NOTE(review): the array comes from PIL, which presumably
            # yields RGB channel order, while the conversion code below
            # assumes BGR - confirm whether COLOR_RGB2GRAY was intended.
            gray = cv2.cvtColor(conv_img, cv2.COLOR_BGR2GRAY)
        else:
            # Already single-channel.
            gray = conv_img
        gray = preprocess_for_image(gray, blur)

        # Rotate images
        if rot_fl == 1:
            rot = rotation_spacing(gray)
        else:
            # 90.0 makes the rotation angle below (90 - rot) equal to zero.
            rot = 90.0

        rows, cols = gray.shape
        M = cv2.getRotationMatrix2D((cols / 2, rows / 2), 90 - rot, 1)
        gray = cv2.warpAffine(gray, M, (cols, rows))
        conv_img = cv2.warpAffine(conv_img, M, (cols, rows))
        page_text = pytesseract.image_to_string(
            gray, config='--psm 4 -c textord_heavy_nr=1')
        print(page_text)

        conv_img_list.append(conv_img)
        gray_list.append(gray)

        # NLP analysis
        nlp_result = ner_extraction(page_text)

        nlp_result_sp = nlp_sp(page_text)
        labels = {w.label_ for w in nlp_result_sp.ents}

        # Words of this page that the first analysis tagged as 'O'.
        others = []

        for sen in nlp_result["sentences"]:
            for tok in sen['tokens']:
                word = tok["word"]
                if tok['ner'] in ner_labels:
                    if word not in search_terms and len(word) > 1 \
                            and word not in fin_terms \
                            and not word.islower():
                        search_terms.append(word)

                if tok['ner'] == 'O':
                    others.append(word)
                # Find emails, NINs and phone numbers
                if templates(word):
                    search_terms.append(word)

        for label in labels:
            if label in in_labels:
                entities = [
                    cleanup(e.string, lower=False) for e in nlp_result_sp.ents
                    if label == e.label_
                ]
                entities = list(set(entities))

                for ent in entities:
                    wds_list = re.split(' |\n', ent)
                    for wd in wds_list:
                        if wd not in search_terms and wd not in search_terms_sp and len(
                                wd) > 1 and wd in others and not wd.islower():
                            search_terms_sp.append(wd)

    # Filter the second analysis' terms: drop financial vocabulary (also
    # when the term only differs by one trailing character, e.g. a plural)
    # unless the term matches one of the templates.
    search_terms1 = []

    for term in search_terms_sp:
        if term not in fin_terms and term[:-1] not in fin_terms:
            search_terms1.append(term)
        elif templates(term):
            search_terms1.append(term)

    search_terms = search_terms + search_terms1
    print(search_terms)

    return search_terms, conv_img_list, gray_list
Example #10
0
 print(base)
 new = '{}.txt'.format(base)

 # NOTE(review): time.clock() was removed in Python 3.8. It is left as-is
 # because the code that consumes start_time is not part of this section;
 # switch both places to time.perf_counter() together.
 start_time = time.clock()

 # Open the file and read the pdf
 with open(file[1],'rb') as pdfFileObj, open(processed_dir + '/' + new, 'w', encoding='utf-8') as text_file:
     imageBlobs = []

     # Render every page to a JPEG blob on a white background. The Wand
     # images are opened in ``with`` blocks so they are always released.
     with wi(filename = file[1], resolution = 300) as pdfFile:
         with pdfFile.convert('jpeg') as jpegPages:
             for img in jpegPages.sequence:
                 with wi(image = img) as imgPage:
                     imgPage.background_color = Color("white")
                     imgPage.alpha_channel = 'remove'
                     imageBlobs.append(imgPage.make_blob('jpeg'))

     extract = []

     pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

     # OCR every page, then write one entry per page to the text file.
     for imgBlob in imageBlobs:
         image = Image.open(io.BytesIO(imgBlob))
         text = pytesseract.image_to_string(image, lang = 'eng')
         extract.append(text)
     for item in extract:
         text_file.write("%s\n" % item)
 
Example #11
0
    def __init__(self, src_dir, out_dir):
        """Store the source and output directories and load the font.

        The font is read from ``font.ttf`` inside ``src_dir`` and uses the
        colour ``#ffffff``.
        """
        self.src, self.out = src_dir, out_dir

        font_path = "{}/font.ttf".format(src_dir)
        font_color = Color("#ffffff")
        self.FONT = Font(path=font_path, color=font_color)
Example #12
0
from wand.image import Image, Color
from PyPDF2 import PdfFileReader, PdfFileWriter
import time
import tempfile

import io

# Copy the first pages of the source PDF into an in-memory PDF, then render
# each of those pages to its own PNG file.
MAX_PAGES = 10

pdf = io.BytesIO()
with open("./pdf_files/large.pdf", 'rb') as infile:
    reader = PdfFileReader(infile)

    writer = PdfFileWriter()

    # Never request more pages than the document has; asking for a missing
    # page would raise instead of converting a shorter PDF.
    for page in range(min(MAX_PAGES, reader.getNumPages())):
        writer.addPage(reader.getPage(page))
    # Written while the source file is still open: the copied pages are
    # read from it at this point.
    writer.write(pdf)

pdf.seek(0)

with Image(file=pdf, resolution=120) as source:
    # Output files are numbered from 1.
    for n, page_image in enumerate(source.sequence, start=1):
        newfilename = "./img_files/pdf_page_" + str(n) + '.png'
        with Image(page_image) as img:
            img.format = 'png'
            img.background_color = Color('white')  # Set white background.
            img.alpha_channel = 'remove'
            img.save(filename=newfilename)
Example #13
0
                    '--input_dir',
                    help='path to directory that contains single-page PDFs')
args = parser.parse_args()

if __name__ == '__main__':

    # This list collects all the error strings
    total_errors = []

    # Get the list of files in the input directory
    input_dir_files = [
        f for f in listdir(args.input_dir) if isfile(join(args.input_dir, f))
    ]

    # Get the list of files in the output directory
    output_dir_files = [
        f for f in listdir(args.output_dir) if isfile(join(args.output_dir, f))
    ]
    # Set of the names above, for the membership test in the loop.
    already_converted = set(output_dir_files)

    for pdf in input_dir_files:
        if not pdf.endswith('.pdf'):
            continue

        # Name of the PNG this PDF converts to; skip PDFs done earlier.
        png_name = pdf[:-4] + '.png'
        if png_name in already_converted:
            continue

        # Single parenthesised argument: prints the same text under
        # Python 2 and Python 3.
        print(png_name)
        # Converting single page into PNG on a white background
        with Image(filename=join(args.input_dir, pdf),
                   resolution=300) as img:
            img.format = 'png'
            img.background_color = Color('white')
            img.alpha_channel = 'remove'
            img.save(filename=join(args.output_dir, png_name))
Example #14
0
def ProcessRecord(r, pdfDense=None):
    """Render the source image of record ``r`` to "lowres" and "highres" files.

    Steps, as visible in this function:

    1. If both ``destImg(r, base="lowres")`` and ``destImg(r, base="highres")``
       already exist as files, return ``None`` without doing any work.
    2. Open ``srcImg(r)`` with GDAL and look for the text "PROJCRS" in the
       ``gdal.Info`` output to decide whether the source has a projection.
    3. Without a projection, call ``writeImageNoWarp`` once per resolution.
       With a projection, warp the source to EPSG:3857 with ``gdalwarp``,
       scale it with ``gdal.Translate``, write a quantized image with
       ``Image``, then run a shell pipeline (optipng, exiftool, exiv2, cwebp)
       that embeds the string from ``getTagCoordinates`` and produces a
       ``.webp`` sibling.
    4. Move the results from the temporary directory into ``destDir(r, base=p)``.

    Args:
        r: The record to process. It is passed unchanged to the helper
            functions (``srcImg``, ``destImg``, ``tmpDest``, ``recordType``,
            ``destDir``, ``runwayID``); its structure is not visible here.
        pdfDense: Base resolution value. "lowres" uses it as-is and "highres"
            uses twice the value. It is overwritten with 150 for "APD"
            records that have no projection.

    Note:
        ``DEBUG``, ``resampling`` and ``size`` are read from outside this
        function. Shell commands are run with ``os.system`` and failures are
        only reported with ``print``.
    """
    if not pdfDense:
        print(
            "ERROR: pdfDense must be passed so that it is local to a multiprocessing thread."
        )
        # NOTE(review): there is no return or raise here, so execution
        # continues after the message even when pdfDense is missing.

    ## Note the destination directories must already exist!

    ## If the image exists, assume it is good and skip it
    if (os.path.isfile(destImg(r, base="lowres"))) and (os.path.isfile(
            destImg(r, base="highres"))):
        return (None)

    ## Work in a temporary directory that gets automagically deleted upon completion
    with tempfile.TemporaryDirectory() as path:

        ## path="/dev/shm"
        srcfile = gdal.Open(srcImg(r))

        if DEBUG:
            print(srcfile)

        ## Check if the source file has a gdal projection
        ## (noproj is True when "PROJCRS" does not appear in the gdal.Info text)
        noproj = gdal.Info(srcfile)
        noproj = (noproj.find("PROJCRS") < 0)

        ## Deal with hotspots and airport diagrams etc.  This is everything but plates
        ## if (recordType(r) in ["APD", "DAU", "DP", "HOT", "LAH", "ODP", "STAR"] or noproj):
        if (noproj):
            if DEBUG:
                print("No projection")

            if recordType(
                    r) == "APD":  ## Airport directories aren't trimmed for now
                trim = False
                pdfDense = 150  ## Temporary until such time as airport directory database can be updated.
            else:
                trim = True  # -trim +repage

            ## One output per resolution; highres is rendered at twice pdfDense
            for p in ["lowres", "highres"]:
                thedense = pdfDense if p == "lowres" else pdfDense * 2
                writeImageNoWarp(srcImg(r),
                                 tmpDest(path, r, p),
                                 resolution=thedense,
                                 trim=trim)

        else:

            ## Intermediate warped image, shared by both output resolutions
            highDensityTmp = path + "/highDensityTmp.tif"

            ## Warp the image to high density. Do at least at 2x because high res is at 2x
            commstr = "gdalwarp -r %s -q -dstalpha --config GDAL_PDF_DPI %s -t_srs EPSG:3857 %s %s" % (
                resampling, pdfDense * 2, srcImg(r), highDensityTmp)
            ## os.system returns non-zero on failure; processing continues regardless
            if (os.system(commstr)):
                print("Failed warping didn't work")
                print(commstr)

            #ds = gdal.Warp(highDensityTmp, srcfile, dstSRS='EPSG:3857',
            #               height=str(size), dstAlpha=True, format="GTiff") # "lanczos", "cubicspline" , resampleAlg="nearestneighbor"
            #ds = None ## This is needed to ensure it is written
            ## ds = gdal.Translate(tmpfile2,gdal.Open(highDensityTmp),resampleAlg="lanczos", srcWin=getTrims(highDensityTmp))

            for p in ["lowres", "highres"]:
                thedest = tmpDest(path, r, p)
                ## highres is produced at twice the lowres height
                thesize = size if p == "lowres" else size * 2
                ds = gdal.Translate(thedest,
                                    gdal.Open(highDensityTmp),
                                    resampleAlg=resampling,
                                    srcWin=getTrims(highDensityTmp),
                                    height=thesize,
                                    width=0,
                                    scaleParams=[[]])
                ## Drop the dataset reference (the commented-out code above
                ## notes this is needed to ensure it is written)
                ds = None

                ## Get the corner strings for the geotag
                cornerstr = getTagCoordinates(thedest)

                ## Write the png image
                with Image(filename=thedest) as img:

                    #img.sharpen(radius=5.0,sigma=5.0)
                    img.background_color = Color(
                        'white')  # Set white background.
                    img.alpha_channel = 'remove'
                    extension = "png8"
                    img.format = extension
                    img.normalize()
                    img.quantize(16, dither=False)
                    # img.type = 'palette'
                    ## Saved as "<thedest>.png8" first; the shell command
                    ## below renames it back to thedest
                    img.save(filename=thedest + "." + extension)

                    ## Write avare geotag into file.  Suppress the warning
                    if (extension == "png8"):
                        commstr = ("mv %s %s" %
                                   (thedest + "." + extension, thedest))
                    else:
                        ## No-op so the "&&" chain below still has a first command
                        commstr = "echo -n"
                    commstr += ' && optipng -quiet %s' % (thedest)
                    commstr += ' && exiftool -overwrite_original_in_place -q -Comment="%s" %s 2> /dev/null ' % (
                        cornerstr, thedest)
                    commstr += ' && exiv2 -M"set Exif.Photo.UserComment charset=Ascii %s" %s' % (
                        cornerstr, thedest)
                    ## commstr+=' && identify %s' % (thedest)
                    commstr += ' && cwebp -quiet -lossless -z 9 -metadata exif %s -o %s' % (
                        thedest, thedest.replace(".png", ".webp"))
                    ## commstr+=' && identify %s' % (thedest)
                    if (os.system(commstr)):
                        ## NOTE(review): "(r)" is just r, not a one-item tuple, and the
                        ## format has four %s; this only works if r is a 4-item tuple -- confirm.
                        print("Failed at exif writing %s %s %s %s" % (r))
                    if DEBUG:
                        print(commstr)

        ## Finally move the resulting file(*) into place
        ## (both the .png and its .webp sibling, for each resolution)
        for p in ["lowres", "highres"]:
            commstr = "mv %s %s" % (tmpDest(path, r,
                                            p), destDir(r, base=p) + "/")
            commstr += " && mv %s %s" % (tmpDest(path, r, p).replace(
                ".png", ".webp"), destDir(r, base=p) + "/")
            if os.system(commstr):
                print("Move failed for " + path + runwayID(r) + "*.png")
                if DEBUG:
                    print(commstr)