def runTest():
    """Run OcrHelper over the test PDFs with two minimum-text-layer values.

    A dataset is created and shown with the minimum text layer set to 8,
    then again with it set to 0. The setting is then changed to 10, a
    content map is built and printed, and the second dataset is passed
    through a DocumentAssembler that reads the "region" column.
    """
    def load_regions(source):
        # Identical call for both passes; only the OcrHelper setting differs.
        return OcrHelper.createDataset(
            spark=SparkContextForTest.spark,
            input_path=source,
            output_col="region",
            metadata_col="metadata"
        )

    OcrHelper.setMinTextLayer(8)
    active_layer = OcrHelper.getMinTextLayer()
    print("text layer is: " + str(active_layer))

    pdf_dir_uri = "file:///" + os.getcwd() + "/../ocr/src/test/resources/pdfs/"
    load_regions(pdf_dir_uri).show()

    OcrHelper.setMinTextLayer(0)
    print("Text layer disabled")
    regions = load_regions(pdf_dir_uri)
    regions.show()

    OcrHelper.setMinTextLayer(10)
    pdf_contents = OcrHelper.createMap(
        input_path="../ocr/src/test/resources/pdfs"
    )
    print(pdf_contents)

    # The assembler consumes the dataset produced by the second pass.
    assembler = (
        DocumentAssembler()
        .setInputCol("region")
        .setOutputCol("document")
    )
    assembler.transform(regions).show()
def runTest():
    """Build an OCR dataset and content map from the test PDFs.

    The dataset is shown, the content map is printed, and the dataset is
    then passed through a DocumentAssembler that reads the "region" column
    and writes "document".
    """
    regions = OcrHelper.createDataset(
        spark=SparkContextForTest.spark,
        input_path="../ocr/src/test/resources/pdfs/",
        output_col="region",
        metadata_col="metadata"
    )
    regions.show()

    pdf_contents = OcrHelper.createMap(
        input_path="../ocr/src/test/resources/pdfs/"
    )
    print(pdf_contents)

    assembler = (
        DocumentAssembler()
        .setInputCol("region")
        .setOutputCol("document")
    )
    assembler.transform(regions).show()
def runTest():
    """Run OcrHelper over the test PDFs with both preferred methods.

    A dataset is created and shown with the preferred method set to 'text',
    then again with it set to 'image'. The method is switched back to
    'text', a content map is built and printed, and the second dataset is
    passed through a DocumentAssembler that reads the "text" column.
    """
    def load_pages(source):
        # Identical call for both passes; only the OcrHelper setting differs.
        return OcrHelper.createDataset(
            spark=SparkContextForTest.spark,
            input_path=source
        )

    OcrHelper.setPreferredMethod('text')
    active_method = OcrHelper.getPreferredMethod()
    print("text layer is: " + str(active_method))

    pdf_dir_uri = "file:///" + os.getcwd() + "/../ocr/src/test/resources/pdfs/"
    load_pages(pdf_dir_uri).show()

    OcrHelper.setPreferredMethod('image')
    print("Text layer disabled")
    pages = load_pages(pdf_dir_uri)
    pages.show()

    OcrHelper.setPreferredMethod('text')
    pdf_contents = OcrHelper.createMap(
        input_path="../ocr/src/test/resources/pdfs"
    )
    print(pdf_contents)

    # The assembler consumes the dataset produced by the second pass.
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )
    assembler.transform(pages).show()