def test_job_timeout(indico):
    """Check that waiting on a job with a zero timeout raises.

    Submits ``data/mock.pdf`` for document extraction with the "detailed"
    preset, then polls the resulting job with ``wait=True`` and
    ``timeout=0.0`` and expects ``IndicoTimeoutError``.

    Args:
        indico: Test fixture; not referenced in the body (presumably sets up
            the client environment -- confirm against conftest).
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")
    job = client.call(
        DocumentExtraction(
            files=[dataset_filepath],
            json_config='{"preset_config": "detailed"}',
        )
    )[0]
    with pytest.raises(IndicoTimeoutError):
        # The call is expected to raise, so its return value is not bound.
        client.call(JobStatus(id=job.id, wait=True, timeout=0.0))
def test_job_wait_on_failure(indico):
    """Check that waiting on a job submitted with a bad preset reports failure.

    Submits ``data/mock.pdf`` with ``preset_config`` set to "wrong", waits
    for the job, and asserts that its status is "FAILURE" and that its
    result is a dict.

    Args:
        indico: Test fixture; not referenced in the body (presumably sets up
            the client environment -- confirm against conftest).
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(
        DocumentExtraction(
            files=[dataset_filepath],
            json_config='{"preset_config": "wrong"}',
        )
    )

    assert len(jobs) == 1
    job = jobs[0]
    assert job.id is not None
    # Re-bind ``job`` to the finished job returned by the status query.
    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "FAILURE"
    assert isinstance(job.result, dict)
def test_job_wait_on_success(indico):
    """Check that waiting on a valid extraction job reports success.

    Submits ``data/mock.pdf`` with the "simple" preset, waits for the job,
    and asserts that it finished with status "SUCCESS", is marked ready,
    and exposes a string under ``result["url"]``.

    Args:
        indico: Test fixture; not referenced in the body (presumably sets up
            the client environment -- confirm against conftest).
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(
        DocumentExtraction(
            files=[dataset_filepath],
            json_config='{"preset_config": "simple"}',
        )
    )

    assert len(jobs) == 1
    job = jobs[0]
    assert job.id is not None
    # Re-bind ``job`` to the finished job returned by the status query.
    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    assert isinstance(job.result["url"], str)
Ejemplo n.º 4
0
def test_document_extraction(indico):
    """Run document extraction end to end and fetch the stored result.

    Submits ``data/mock.pdf`` without an explicit config, waits for the job
    to succeed, retrieves the storage object referenced by the job result,
    and asserts that it is a dict containing a "pages" key.

    Args:
        indico: Test fixture; not referenced in the body (presumably sets up
            the client environment -- confirm against conftest).
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(DocumentExtraction(files=[dataset_filepath]))

    assert len(jobs) == 1
    job = jobs[0]
    assert job.id is not None
    # Re-bind ``job`` to the finished job returned by the status query.
    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    assert isinstance(job.result["url"], str)

    extract = client.call(RetrieveStorageObject(job.result))

    assert isinstance(extract, dict)
    assert "pages" in extract
Ejemplo n.º 5
0
def test_document_extraction_batched(indico):
    """Submit three PDFs with an upload batch size of one and wait on each job.

    Expects one job per submitted file; every job must have an id, finish
    with status "SUCCESS", be marked ready, and expose a string under
    ``result["url"]``.

    Args:
        indico: Test fixture; not referenced in the body (presumably sets up
            the client environment -- confirm against conftest).
    """
    client = IndicoClient()
    data_dir = Path(__file__).parent.parent / "data"
    dataset_filepaths = [
        str(data_dir / pdf_name)
        for pdf_name in ("mock.pdf", "mock_2.pdf", "mock_3.pdf")
    ]

    extraction = DocumentExtraction(
        files=dataset_filepaths,
        json_config={"preset_config": "simple"},
        upload_batch_size=1,
    )
    jobs = client.call(extraction)

    assert len(jobs) == 3
    for submitted in jobs:
        assert submitted.id is not None
        finished = client.call(JobStatus(id=submitted.id, wait=True))
        assert finished.status == "SUCCESS"
        assert finished.ready is True
        assert isinstance(finished.result["url"], str)
Ejemplo n.º 6
0
"""
Example demonstrating how to OCR a document and access the text at the document, page, and 
block (or paragraph) level.
"""

from indico import IndicoClient, IndicoConfig
from indico.queries import DocumentExtraction, JobStatus, RetrieveStorageObject

# Create an Indico API client
my_config = IndicoConfig(
    host="app.indico.io",
    api_token_path="./path/to/indico_api_token.txt",
)
client = IndicoClient(config=my_config)

# Submit the document for OCR, wait for the job to finish, then fetch the
# stored result referenced by the finished job.
files_to_extract = client.call(
    DocumentExtraction(
        files=["./test_paragraphs.pdf"],
        json_config={"preset_config": "standard"},
    )
)
extracted_file = client.call(JobStatus(id=files_to_extract[0].id, wait=True))
json_result = client.call(RetrieveStorageObject(extracted_file.result))

# The code below shows how to get the OCR text from the 'json_result' object.
# Note: it may vary slightly if you use DocumentExtraction configurations other than 'standard'

# Full Text
full_document_text = json_result["text"]

# Document Text split by page
text_by_page = [page["text"] for page in json_result["pages"]]

# Document Text split by block (or paragraph)
Ejemplo n.º 7
0
from indico import IndicoClient, IndicoConfig
from indico.queries import DocumentExtraction, JobStatus, RetrieveStorageObject

# Build the API client from a host name and an API token file
my_config = IndicoConfig(
    host="app.indico.io",
    api_token_path="./path/to/indico_api_token.txt",
)
client = IndicoClient(config=my_config)

# Describe the OCR request for a single file, then submit it
ocr_query = DocumentExtraction(
    files=["./path_to_doc.pdf"],
    json_config={"preset_config": "ondocument"},
)
job = client.call(ocr_query)

# Block until the first submitted job has finished
extracted_file = client.call(JobStatus(id=job[0].id, wait=True))

# Only download and print the OCR output when the job succeeded
if extracted_file.status == "SUCCESS":
    result = client.call(RetrieveStorageObject(extracted_file.result))
    print(result)