Example #1
0
def main(imagePath):
    """Run OCR on an image and save the recognised lines as a text table.

    Every line returned by ``extractLines`` is printed as it is processed.
    After dropping "–" cells, a line is written as four right-aligned,
    40-character-wide columns; if that is not possible (typically because the
    line has fewer than four cells) it is written as two columns, and if that
    fails too the line is skipped.

    The text is written to the path returned by
    ``changeFileExtension(<image file name>, "txt")``. The JSON produced by
    the OCR step is read from the equivalent "json" path and is left on disk.

    Args:
        imagePath: Path of the image to process, using "/" as separator.
    """
    # Create OCR
    ocr = Ocr()

    # Run OCR over image. It generates a JSON with the text and the coordinates of each word
    ocr.processFile(imagePath, './')

    # Read JSON (looked up by the image's bare file name, not its full path)
    baseName = imagePath.split("/")[-1]
    jsonFile = changeFileExtension(baseName, "json")
    with open(jsonFile, 'r') as f:
        image = json.load(f)

    # Extract tokens (Each word, its width, height and its coordinates)
    tokens = extractTokens(image)

    # Sort the tokens into lines
    lines = extractLines(tokens)

    # Only these are expected from indexing/formatting a short or odd line;
    # anything else (e.g. KeyboardInterrupt) must propagate.
    formatErrors = (IndexError, TypeError, ValueError)

    rows = []
    for line in lines:
        print(line)
        cells = [cell for cell in line if cell != "–"]
        try:
            row = "{:>40}{:>40}{:>40}{:>40}\n".format(cells[0], cells[1],
                                                      cells[2], cells[3])
        except formatErrors:
            try:
                row = "{:>40}{:>40}\n".format(cells[0], cells[1])
            except formatErrors:
                # Best effort: lines that fit neither layout are dropped.
                continue
        rows.append(row)

    with open(changeFileExtension(baseName, "txt"), 'w') as f:
        f.write("".join(rows))
def convertImageToText(imagePath):
    """Run OCR on an image and return its text lines as a JSON string.

    Each line returned by ``extractLines`` is printed to stdout as JSON and
    collected, unmodified, into the response.

    Args:
        imagePath: Path of the image to process.

    Returns:
        A JSON document (indented by 4 spaces) of the form
        ``{"0": [line, line, ...]}``.
    """
    # Create OCR
    ocr = Ocr()

    # Run OCR over image. The value returned by processFile is handed
    # directly to extractTokens; no JSON file is read back here.
    ocrResult = ocr.processFile(imagePath, './')

    # Extract tokens (Each word, its width, height and its coordinates)
    tokens = extractTokens(ocrResult)

    # Sort the tokens into lines
    lines = extractLines(tokens)

    linesList = []
    for line in lines:
        print(json.dumps(line))
        linesList.append(line)

    return json.dumps({"0": linesList}, indent=4)
def main(imagePath):
    """Run OCR on an image and export the recognised table as .xlsx and .csv.

    The OCR JSON is read from the path returned by
    ``changeFileExtension(<image file name>, "json")`` and deleted as soon as
    it has been loaded. The lines from ``extractLines`` are turned into two
    frames by ``list_to_dataframe``: ``data`` is written to the workbook and
    ``dataframe`` is used to size the workbook columns and is written to the
    CSV. Both output files are placed next to the image, sharing its name.

    Args:
        imagePath: Path of the image to process, using "/" as separator.
    """
    # Create OCR
    ocr = Ocr()

    # Run OCR over image. It generates a JSON with the text and the coordinates of each word
    ocr.processFile(imagePath, './')

    # Read JSON, then remove it immediately: everything needed is in memory,
    # so a failure in the later steps no longer leaves the file behind.
    jsonFile = changeFileExtension(imagePath.split("/")[-1], "json")
    with open(jsonFile, 'r') as f:
        image = json.load(f)
    os.remove(jsonFile)

    # Extract tokens (Each word, its width, height and its coordinates)
    tokens = extractTokens(image)

    # Sort the tokens into lines
    lines, x, y = extractLines(tokens)

    dataframe, data = list_to_dataframe(list(lines), y, x)

    outputBase = os.path.splitext(imagePath)[0]
    xlsxPath = outputBase + '.xlsx'
    xlsxOptions = {'strings_to_numbers': True}
    try:
        # pandas >= 1.3 takes engine options through engine_kwargs; passing
        # them as loose keyword arguments was removed in pandas 2.0.
        writer = pd.ExcelWriter(xlsxPath,
                                engine='xlsxwriter',
                                engine_kwargs={'options': xlsxOptions})
    except TypeError:
        # Older pandas forwards unknown keywords straight to the engine.
        writer = pd.ExcelWriter(xlsxPath,
                                engine='xlsxwriter',
                                options=xlsxOptions)

    # The context manager saves and closes the workbook, also on error
    # (ExcelWriter.save() no longer exists in pandas 2.0).
    with writer:
        data.to_excel(writer, index=False, header=None, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']

        # Iterate by position so the index handed to set_column is always an int.
        for col_idx in range(len(dataframe.columns)):
            # Width of a cell = its length plus the number of items returned
            # by extract_unicode_block (per the original comment, its
            # Chinese/Japanese characters), so those are counted twice.
            max_length = max(
                (len(cell) + len(extract_unicode_block(cell))
                 for cell in dataframe.iloc[:, col_idx]),
                default=0,
            )
            worksheet.set_column(col_idx, col_idx, max_length)

    dataframe.to_csv(outputBase + '.csv',
                     index=False,
                     header=False,
                     encoding='utf-8-sig')