def main(imagePath):
    """Run OCR on the image at *imagePath* and write the recognized text to a .txt file.

    The OCR step produces a JSON file (word text plus coordinates) next to the
    script; that JSON is read back, tokenized, grouped into lines, and each
    line is written out as 2 or 4 right-aligned 40-character columns.

    Parameters:
        imagePath: path to the input image; the output .txt keeps the image's
            base name.
    """
    # Run OCR over the image. It generates a JSON file with the text and the
    # coordinates of each word.
    ocr = Ocr()
    ocr.processFile(imagePath, './')

    # Read the generated JSON back in.
    # NOTE(review): the original comment said "and delete it", but the JSON is
    # never removed here (the xlsx-exporting variant of main() does call
    # os.remove) — confirm whether the file should be cleaned up.
    jsonFile = changeFileExtension(imagePath.split("/")[-1], "json")
    with open(jsonFile, 'r') as f:
        image = json.load(f)

    # Extract tokens (each word with its width, height and coordinates),
    # then sort the tokens into lines.
    tokens = extractTokens(image)
    lines = extractLines(tokens)

    # Build the output with a list + join instead of quadratic string +=,
    # and replace the nested bare except: blocks (which only ever masked
    # IndexError on short lines) with explicit length checks.
    formatted = []
    for line in lines:
        print(line)
        # Drop en-dash placeholder cells before formatting.
        line = [cell for cell in line if cell != "–"]
        if len(line) >= 4:
            formatted.append("{:>40}{:>40}{:>40}{:>40}\n".format(
                line[0], line[1], line[2], line[3]))
        elif len(line) >= 2:
            formatted.append("{:>40}{:>40}\n".format(line[0], line[1]))
        # Lines with fewer than 2 usable cells are skipped, as before.

    with open(changeFileExtension(imagePath.split("/")[-1], "txt"), 'w') as f:
        f.write("".join(formatted))
def convertImageToText(imagePath):
    """Run OCR on the image at *imagePath* and return the recognized lines as JSON.

    Parameters:
        imagePath: path to the input image.

    Returns:
        A JSON string of the form ``{"0": [[word, ...], ...]}`` where each
        inner list is one recognized line of words.
    """
    # Run OCR over the image. processFile returns the OCR result (text and
    # the coordinates of each word) directly — no intermediate file is read.
    ocr = Ocr()
    jsonFile = ocr.processFile(imagePath, './')

    # Extract tokens (each word with its width, height and coordinates),
    # then sort the tokens into lines.
    tokens = extractTokens(jsonFile)
    lines = extractLines(tokens)

    # The original also accumulated a fixed-width `txt` string here (with
    # nested bare except: blocks), but it was never used — the file write was
    # commented out — so that dead computation has been removed. Only the
    # per-line debug print and the collected lines are live behavior.
    linesList = []
    for line in lines:
        print(json.dumps(line))
        linesList.append(line)

    response = {"0": linesList}
    return json.dumps(response, indent=4)
def main(imagePath):
    """Run OCR on the image at *imagePath* and export the result as .xlsx and .csv.

    The OCR step produces a JSON file (word text plus coordinates), which is
    read, tokenized, grouped into lines, converted to a DataFrame, and written
    to Excel (with auto-fitted column widths) and to CSV.

    NOTE(review): this redefines main() declared earlier in the file; at
    import time this later definition wins.

    Parameters:
        imagePath: path to the input image; outputs reuse its base name with
            .xlsx / .csv extensions.
    """
    # Run OCR over the image. It generates a JSON file with the text and the
    # coordinates of each word.
    ocr = Ocr()
    ocr.processFile(imagePath, './')

    # Read the generated JSON, then remove the intermediate file.
    jsonFile = changeFileExtension(imagePath.split("/")[-1], "json")
    with open(jsonFile, 'r') as f:
        image = json.load(f)

    # Extract tokens (each word with its width, height and coordinates),
    # then sort the tokens into lines. This variant of extractLines also
    # returns x/y data consumed by list_to_dataframe.
    tokens = extractTokens(image)
    lines, x, y = extractLines(tokens)
    os.remove(jsonFile)

    # The original counted num_row/num_col in a manual append loop, but
    # neither counter was ever used — a plain copy suffices.
    output = list(lines)
    dataframe, data = list_to_dataframe(output, y, x)

    # NOTE(review): `options=` and writer.save() are deprecated/removed in
    # newer pandas (use engine_kwargs / writer.close()); kept as-is to match
    # the pandas version this project pins — confirm before upgrading.
    writer = pd.ExcelWriter(os.path.splitext(imagePath)[0] + '.xlsx',
                            engine='xlsxwriter',
                            options={'strings_to_numbers': True})
    data.to_excel(writer, index=False, header=None, sheet_name='Sheet1')
    worksheet = writer.sheets['Sheet1']

    # Auto-fit each column: count CJK characters/symbols separately because
    # they render roughly double-width in Excel.
    for column in dataframe:
        max_length = 0
        for cell in dataframe[column]:
            # Number of Chinese/Japanese characters and symbols in the cell.
            east_asian_adj = len(extract_unicode_block(cell))
            # Adjust width by plain chars + the extra width of CJK chars.
            cell_length = len(cell) + east_asian_adj
            if cell_length > max_length:
                max_length = cell_length
        col_idx = dataframe.columns.get_loc(column)
        worksheet.set_column(col_idx, col_idx, max_length)
    writer.save()

    # utf-8-sig BOM so Excel opens the CSV with the right encoding.
    dataframe.to_csv(os.path.splitext(imagePath)[0] + '.csv',
                     index=False, header=False, encoding='utf-8-sig')