def test_workflow_submission(indico, airlines_dataset,
                             airlines_model_group: ModelGroup, _input):
    """Submit to the newest airlines workflow and retrieve its result.

    Picks the workflow with the highest id for ``airlines_dataset``, submits
    ``_input`` to it, checks that requesting a result with the "FAILED" or
    "INVALID_STATUS" check status raises ``IndicoInputError``, then waits for
    the "COMPLETE" result, validates it and marks the submission retrieved.
    """
    client = IndicoClient()
    workflows = client.call(ListWorkflows(dataset_ids=[airlines_dataset.id]))
    newest_workflow = max(workflows, key=lambda workflow: workflow.id)

    submit_query = WorkflowSubmission(workflow_id=newest_workflow.id, **_input)
    submission_id = client.call(submit_query)[0]
    assert submission_id is not None

    # Both of these check statuses are expected to be rejected.
    for rejected_status in ("FAILED", "INVALID_STATUS"):
        with pytest.raises(IndicoInputError):
            client.call(SubmissionResult(submission_id, rejected_status))

    result_job = client.call(
        SubmissionResult(submission_id, "COMPLETE", wait=True))
    payload = client.call(RetrieveStorageObject(result_job.result))
    assert isinstance(payload, dict)
    assert payload["submission_id"] == submission_id
    assert payload["file_version"] == 1

    # Flag the submission as retrieved and confirm the flag is persisted.
    client.call(UpdateSubmission(submission_id, retrieved=True))
    refreshed = client.call(GetSubmission(submission_id))
    assert isinstance(refreshed, Submission)
    assert refreshed.retrieved is True
Beispiel #2
0
def pdf_extraction_call(pdf_filepath, client, config):
    """
    Given a filepath, run Indico document extraction and return the result.

    Arguments:
        pdf_filepath {str} -- path to the pdf file to extract
        client {IndicoClient} -- IndicoClient object containing auth details
        config {dict} -- Indico Document extraction options

    Returns:
        dict -- pdf extraction of pdf_filepath, or None when no job was
            returned or when waiting for / retrieving a job failed

    """

    jobs = client.call(
        DocumentExtraction(files=[pdf_filepath],
                           json_config=json.dumps(config)))

    # Stays None when the service returns no jobs, instead of raising
    # UnboundLocalError at the final return.
    doc_extract = None
    for submitted_job in jobs:
        # Reset per job so the error handler never reads a stale or
        # unbound status object.
        job = None
        try:
            job = client.call(JobStatus(id=submitted_job.id, wait=True))
            doc_extract = client.call(RetrieveStorageObject(job.result))
        except Exception as e:
            # Best-effort helper: report the problem and signal failure
            # with None rather than propagating.
            print(e)
            if job is not None:
                print(job.result)
            return None
    return doc_extract
def test_workflow_submission_auto_review(indico, force_complete,
                                         org_annotate_dataset,
                                         org_annotate_model_group):
    """Submit a document through auto review and check the final status.

    Enables review and auto review on the newest workflow of
    ``org_annotate_dataset``, submits ``org-sample.pdf``, accepts every
    prediction in the raw result, submits that as a review and then asserts
    the submission status: "COMPLETE" when ``force_complete`` is truthy,
    "PENDING_REVIEW" otherwise.
    """
    client = IndicoClient()
    wfs = client.call(ListWorkflows(dataset_ids=[org_annotate_dataset.id]))
    wf = max(wfs, key=lambda w: w.id)
    wf = client.call(
        UpdateWorkflowSettings(wf, enable_review=True,
                               enable_auto_review=True))
    assert wf.review_enabled and wf.auto_review_enabled

    _file = str(Path(__file__).parents[1]) + "/data/org-sample.pdf"

    sub_ids = client.call(WorkflowSubmission(workflow_id=wf.id, files=[_file]))
    subs = client.call(WaitForSubmissions(sub_ids, timeout=120))
    sub = subs[0]
    assert sub.status == "PENDING_AUTO_REVIEW"
    raw_result = client.call(RetrieveStorageObject(sub.result_file))
    changes = raw_result["results"]["document"]["results"]
    # Each entry holds either one prediction dict or a list of them;
    # mark all of them as accepted.
    for preds in changes.values():
        if isinstance(preds, dict):
            preds["accepted"] = True
        elif isinstance(preds, list):
            for pred in preds:
                pred["accepted"] = True
    job = client.call(
        SubmitReview(sub.id, changes=changes, force_complete=force_complete))
    # NOTE(review): presumably this blocks until the review job finishes;
    # confirm JobStatus's default `wait` behaviour.
    client.call(JobStatus(job.id))
    submission = client.call(GetSubmission(sub.id))
    # Compute the expected value first: `a == b if c else d` parses as
    # `(a == b) if c else d`, which would assert a non-empty string (always
    # true) whenever force_complete is falsy.
    expected_status = "COMPLETE" if force_complete else "PENDING_REVIEW"
    assert submission.status == expected_status
def test_csv_changelog(indico):
    """Generate a changelog report and check its job and stored result."""
    client = IndicoClient()
    report_query = GenerateChangelogReport(start_date=datetime.now(),
                                           end_date=datetime.now())
    report = client.call(report_query)
    assert report is not None

    report_job_id = report.job_id
    assert report_job_id is not None

    # Wait for the report job, then make sure it finished successfully.
    finished_job = client.call(JobStatus(id=report_job_id, wait=True))
    assert finished_job.status == "SUCCESS"
    assert finished_job.ready is True

    report_contents = client.call(RetrieveStorageObject(finished_job.result))
    assert report_contents is not None
Beispiel #5
0
def test_document_extraction_thumbnails(indico):
    """Extract ``mock.pdf`` and fetch the image stored for its first page.

    Runs a document extraction with the default configuration, checks the
    job outcome, then retrieves the storage object referenced by
    ``pages[0]["image"]`` and asserts it is truthy.
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"

    jobs = client.call(DocumentExtraction(files=[dataset_filepath]))

    assert len(jobs) == 1
    job = jobs[0]
    assert job.id is not None
    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    # isinstance is the idiomatic type check and matches test_workflow_job.
    assert isinstance(job.result["url"], str)

    extract = client.call(RetrieveStorageObject(job.result))

    assert isinstance(extract, dict)
    assert "pages" in extract
    image = extract["pages"][0]["image"]

    # The page's image entry is itself a storage reference; fetch it.
    image = client.call(RetrieveStorageObject(image))

    assert image
Beispiel #6
0
def pdf_extraction(pdf_filepaths, client, config):
    """
    Get pdf extraction dictionary objects for a batch of files.

    Arguments:
        pdf_filepaths {list} -- paths of the pdf files to extract
        client {IndicoClient} -- IndicoClient object containing auth details
        config {dict} -- Indico Document extraction options

    Returns:
        tuple -- (pdf_extractions, failed_files): the retrieved extractions
            and the input paths whose job could not be completed or
            retrieved
    """
    pdf_extractions = []
    failed_files = []
    jobs = client.call(
        DocumentExtraction(files=pdf_filepaths,
                           json_config=json.dumps(config)))

    # NOTE(review): assumes jobs come back in the same order as
    # pdf_filepaths, since failures are mapped back by index -- confirm.
    for index, submitted_job in enumerate(jobs):
        try:
            job = client.call(JobStatus(id=submitted_job.id, wait=True))
            doc_extract = client.call(RetrieveStorageObject(job.result))
            pdf_extractions.append(doc_extract)
        except Exception:
            # Record the failure and keep going, but let KeyboardInterrupt
            # and SystemExit propagate (a bare `except:` swallowed them).
            failed_files.append(pdf_filepaths[index])

    return pdf_extractions, failed_files
Beispiel #7
0
def test_document_extraction_with_string_config(indico):
    """Run document extraction with ``json_config`` given as a JSON string.

    Submits ``mock.pdf`` with the "simple" preset passed as a string rather
    than a dict, then checks the job outcome and that the retrieved
    extraction is a dict containing "pages".
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"

    jobs = client.call(
        DocumentExtraction(
            files=[dataset_filepath], json_config='{"preset_config": "simple"}'
        )
    )

    assert len(jobs) == 1
    job = jobs[0]
    assert job.id is not None
    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    # isinstance is the idiomatic type check and matches test_workflow_job.
    assert isinstance(job.result["url"], str)

    extract = client.call(RetrieveStorageObject(job.result))
    assert isinstance(extract, dict)
    assert "pages" in extract
def test_workflow_job(indico, airlines_dataset,
                      airlines_model_group: ModelGroup):
    """Send ``mock.pdf`` through the newest workflow with ``submission=False``.

    With ``submission=False`` the call result is treated as a list of jobs;
    the first one is waited on and its stored result is expected to be a
    dict.
    """
    client = IndicoClient()
    workflows = client.call(ListWorkflows(dataset_ids=[airlines_dataset.id]))
    newest_workflow = max(workflows, key=lambda workflow: workflow.id)

    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"

    job_query = WorkflowSubmission(workflow_id=newest_workflow.id,
                                   files=[dataset_filepath],
                                   submission=False)
    first_job = client.call(job_query)[0]
    assert first_job.id is not None

    finished_job = client.call(JobStatus(id=first_job.id, wait=True))
    assert finished_job.status == "SUCCESS"
    assert finished_job.ready is True
    assert isinstance(finished_job.result["url"], str)

    payload = client.call(RetrieveStorageObject(finished_job.result))
    assert isinstance(payload, dict)
def test_workflow_submission_versioned(indico, airlines_dataset,
                                       airlines_model_group: ModelGroup,
                                       _input):
    """Submit with ``result_version="LATEST"`` and check the result layout.

    Expects one submission id per entry in the first value of ``_input``,
    and a result file reporting ``file_version`` 2 with a single
    ``submission_results`` entry for "mock.pdf".
    """
    client = IndicoClient()
    workflows = client.call(ListWorkflows(dataset_ids=[airlines_dataset.id]))
    newest_workflow = max(workflows, key=lambda workflow: workflow.id)

    submit_query = WorkflowSubmission(workflow_id=newest_workflow.id,
                                      result_version="LATEST",
                                      **_input)
    submission_ids = client.call(submit_query)

    # One submission is expected per item of the first input value.
    first_input_value = next(iter(_input.values()))
    assert len(submission_ids) == len(first_input_value)
    submission_id = submission_ids[0]
    assert submission_id is not None

    # NOTE(review): a single id is passed here, while
    # test_workflow_submission_auto_review passes a list of ids -- confirm
    # WaitForSubmissions accepts both forms.
    finished = client.call(WaitForSubmissions(submission_id))
    payload = client.call(RetrieveStorageObject(finished[0].result_file))

    assert isinstance(payload, dict)
    assert payload["file_version"] == 2
    submission_results = payload["submission_results"]
    assert len(submission_results) == 1
    assert submission_results[0]["input_filename"] == "mock.pdf"
Beispiel #10
0
block (or paragraph) level.
"""

from indico import IndicoClient, IndicoConfig
from indico.queries import DocumentExtraction, JobStatus, RetrieveStorageObject

# Configure the client, run document extraction on one file and fetch the
# OCR result object.
# The token path is a placeholder ("./path/to/...", relative to the working
# directory); replace it with the location of your own API token file.
my_config = IndicoConfig(host="app.indico.io",
                         api_token_path="./path/to/indico_api_token.txt")
client = IndicoClient(config=my_config)

files_to_extract = client.call(
    DocumentExtraction(files=["./test_paragraphs.pdf"],
                       json_config={"preset_config": "standard"}))
# One job is used per submitted file; wait for the first (only) one.
extracted_file = client.call(JobStatus(id=files_to_extract[0].id, wait=True))
json_result = client.call(RetrieveStorageObject(extracted_file.result))

# The code below shows how to get the OCR text from the 'json_result' object.
# Note: it may vary slightly if you use DocumentExtraction configurations other than 'standard'

# Full Text
full_document_text = json_result["text"]

# Document Text split by page
text_by_page = [page["text"] for page in json_result["pages"]]

# Document Text split by block (or paragraph)
text_by_block = list()
for page in json_result["pages"]:
Beispiel #11
0
# Use your dataset's id to look up its associated workflows
dataset_id = 6826

my_config = IndicoConfig(host="app.indico.io",
                         api_token_path="./path/to/indico_api_token.txt")
client = IndicoClient(config=my_config)

# Returns a list of workflows for this dataset id, or an empty list if
# there are none
workflows = client.call(ListWorkflows(dataset_ids=[dataset_id]))

if not workflows:
    print("You don't have any workflows for this dataset")
else:
    # Send a document through the first workflow.
    # With submission=False you get back one Job per file.
    submission_query = WorkflowSubmission(workflow_id=workflows[0].id,
                                          files=["./path/to/sample.pdf"],
                                          submission=False)
    jobs = client.call(submission_query)
    job = jobs[0]

    # Wait for the job, then retrieve and print your result
    status = client.call(JobStatus(id=job.id, wait=True))
    wf_result = client.call(RetrieveStorageObject(status.result))
    print(wf_result)
Beispiel #12
0
# `my_config` is defined earlier in the script (outside this excerpt).
client = IndicoClient(config=my_config)

# Placeholder id -- replace with the id of your own workflow.
workflow_id = 5
"""
Example 1
Create a new submission
Generate a submission result as soon as the submission is done processing
Then mark the submission has having been retrieved
"""

# Submit a single file; the call returns a list of submission ids.
submission_ids = client.call(
    WorkflowSubmission(workflow_id=workflow_id, files=["./path_to_doc.pdf"]))
submission_id = submission_ids[0]

# Request the result with wait=True, then download the stored result.
result_url = client.call(SubmissionResult(submission_id, wait=True))
result = client.call(RetrieveStorageObject(result_url.result))
print(result)

# Mark the submission as retrieved.
client.call(UpdateSubmission(submission_id, retrieved=True))
"""
Example 2
List all submissions that are COMPLETE or FAILED
Generate submission results for these
Delay gathering the results until required
"""
# Combine the two status filters with or_ so either status matches.
sub_filter = or_(SubmissionFilter(status="COMPLETE"),
                 SubmissionFilter(status="FAILED"))
submissions = client.call(ListSubmissions(filters=sub_filter))

result_files = {
    submission: client.call(GenerateSubmissionResult(submission))
Beispiel #13
0
# `snapshots` and `filter_opts` are defined earlier in the script.
# Each paginated result is added to `snapshots` with extend().
snapshot_query = GetUserSnapshots(date=datetime.now(), filters=filter_opts)
for snapshot_page in client.paginate(snapshot_query):
    snapshots.extend(snapshot_page)
print("Fetched just " + str(len(snapshots)) + " user for analysis")
"""

Example 4: Fetching a UserChangeLogs by API
Pull in a limited set of user change data using the graph QL API
"""
# Handy when only a limited selection of the changelogs is needed
changelogs = []
changelog_query = GetUserChangelog(start_date=datetime.now(),
                                   end_date=datetime.now(),
                                   limit=100)
for changelog_page in client.paginate(changelog_query):
    changelogs.extend(changelog_page)
print("Fetched " + str(len(changelogs)) + " changes for the day")
"""
Example 5: Fetching longer User Change Logs as CSV
Use the GenerateChangelogReport to get a longer changelog as CSV (or json)
"""
# The report covers the last seven days up to now
start_date = datetime.today() - timedelta(days=7)
report_query = GenerateChangelogReport(start_date=start_date,
                                       end_date=datetime.now())
changelogs = client.call(report_query)
# Generating the report creates a job that can be waited on
job_id = changelogs.job_id
job = client.call(JobStatus(id=job_id, wait=True))
# The finished job references a storage object holding the full report.
result = client.call(RetrieveStorageObject(job.result))
print(result)