def main(directory, file_name,out_dir, file_type='.pdf', dpi=500, verbose=True):
    """
    Convert a file to page images, OCR every page to its own .txt file,
    merge the per-page .txt files, and remove the intermediate files.

    Steps:
      1. convert_to_img() produces the list of page image paths.
      2. ocr() is mapped over those paths in a process pool (6 workers).
      3. The page images are deleted (also when OCR raises).
      4. merge_pages() combines the per-page text files.
      5. The per-page text files (out_dir + page name) are deleted.

    Params:
    directory: str where the files are stored
    file_name: str name of file
    out_dir: str the directory where the text files are going; it is
        concatenated directly with each page file name, so it must end with
        a path separator
    file_type:str default '.pdf' (accepted for compatibility; not used here)
    dpi: int dots per inch (accepted for compatibility; not used here)
    verbose: bool default is True; when True a progress line is printed for
        every processed page
    """
    # NOTE(review): file_type and dpi are not forwarded to convert_to_img;
    # confirm against its signature whether they should be.
    da = DirectoryAssistor()
    image_list = convert_to_img(directory=directory, file_name=file_name)
    page_lst = []
    try:
        with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
            # executor.map yields results in input order, so zipping with
            # image_list pairs every image with its own OCR result.
            for img_path, out_file in zip(image_list, executor.map(ocr, image_list)):
                if verbose:
                    # Only the last backslash-separated component is shown.
                    print(img_path.split("\\")[-1],',',out_file,', processed')
                page_lst.append(out_file)
    finally:
        # Always remove the temporary page images, even if OCR failed midway.
        for img in image_list:
            da.delete_files(img)

    merge_pages(page_lst=page_lst, out_dir=out_dir, file_name=file_name)

    # The per-page text files are no longer needed once merged.
    for page in page_lst:
        da.delete_files(out_dir+page)
# Example no. 2
# 0
class ImageConverter():
    '''
    Converts files stored in one directory to text (via OCR) and converts
    .txt files to word-processor documents.

    self.directory is concatenated directly with file names, so it must end
    with a path separator.
    '''

    def __init__(self, directory):
        '''
        Params
        directory: str that is the directory holding the files to convert
        '''
        self.directory = directory
        self.ds = DirectoryAssistor()

    def convert_image(self, file_name, file_type='.pdf', dpi=500):
        '''
        Converts a file of type file_type to .txt using OCR

        Each page is rendered to a temporary 'page_<n>.jpg' in the current
        working directory, OCR'd, and appended to a .txt file next to the
        source file.  The temporary images are always removed; the source
        file is removed only after a successful conversion.

        Params
        file_name: string that is the name of the file that needs to be read in
        file_type: str that is the type of file being read in
        dpi: int that is the dots per inch of the file being read in

        '''
        pages = convert_from_path(self.directory + file_name, dpi=dpi)
        outfile = self.directory + file_name.replace(file_type, '.txt')
        image_names = []

        try:
            for page_number, page in enumerate(pages, start=1):
                image_name = 'page_' + str(page_number) + '.jpg'
                page.save(image_name, 'JPEG')
                # Record only images that were actually written, so cleanup
                # never targets a file that does not exist.
                image_names.append(image_name)

            # Append mode: an existing output file is extended, not replaced.
            with open(outfile, 'a') as f:
                for image_name in image_names:
                    with Image.open(image_name) as img:
                        text = str(pytesseract.image_to_string(img))
                    # Re-join words hyphenated across a line break.
                    f.write(text.replace('-\n', ''))
        finally:
            for img in image_names:
                self.ds.delete_files(img)

        # The original referenced an undefined bare name `directory` here.
        self.ds.delete_files(self.directory + file_name)

    def convert_txt_to_doc(self, text_file):
        '''
        Converts a .txt document to a .doc format

        The document gets the file name (without '.txt') as its heading and
        the file's text as a single paragraph.  Runs of non-ASCII characters
        and form feeds are replaced by a single space.

        Params
        text_file: name of .txt file stored in the directory for the object

        '''

        document = Document()
        new_name = text_file.replace('.txt', '')
        document.add_heading(new_name, 0)

        with open(self.directory + text_file) as source:
            contents = source.read()
        contents = re.sub(r'[^\x00-\x7F]+|\x0c', ' ', contents)
        document.add_paragraph(contents)
        # NOTE(review): if Document is python-docx's, the saved content is
        # .docx-format despite the '.doc' extension - confirm; the extension
        # is kept so existing callers find the same file name.
        # The original referenced an undefined bare name `directory` here.
        document.save(self.directory + new_name + '.doc')