def test_workflow_submission(
    indico, airlines_dataset, airlines_model_group: ModelGroup, _input
):
    client = IndicoClient()
    wfs = client.call(ListWorkflows(dataset_ids=[airlines_dataset.id]))
    wf = max(wfs, key=lambda w: w.id)

    submission_ids = client.call(WorkflowSubmission(workflow_id=wf.id, **_input))
    submission_id = submission_ids[0]
    assert submission_id is not None

    with pytest.raises(IndicoInputError):
        client.call(SubmissionResult(submission_id, "FAILED"))

    with pytest.raises(IndicoInputError):
        client.call(SubmissionResult(submission_id, "INVALID_STATUS"))

    result_url = client.call(SubmissionResult(submission_id, "COMPLETE", wait=True))
    result = client.call(RetrieveStorageObject(result_url.result))
    assert isinstance(result, dict)
    assert result["submission_id"] == submission_id
    assert result["file_version"] == 1

    client.call(UpdateSubmission(submission_id, retrieved=True))
    sub = client.call(GetSubmission(submission_id))
    assert isinstance(sub, Submission)
    assert sub.retrieved is True
def pdf_extraction_call(pdf_filepath, client, config):
    """
    Given a filepath, run Indico document extraction and return the JSON result

    Arguments:
        pdf_filepath {str} -- path to a PDF file
        client {IndicoClient} -- IndicoClient object containing auth details
        config {dict} -- Indico document extraction options

    Returns:
        dict -- document extraction result for pdf_filepath, or None on failure
    """
    jobs = client.call(
        DocumentExtraction(files=[pdf_filepath], json_config=json.dumps(config))
    )
    # A single input file produces a single extraction job
    for j in jobs:
        try:
            job = client.call(JobStatus(id=j.id, wait=True))
            doc_extract = client.call(RetrieveStorageObject(job.result))
        except Exception as e:
            print(e)
            return None
    return doc_extract
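# A minimal usage sketch for pdf_extraction_call. The token path and PDF path below
# are placeholders, and the "standard" preset is just one possible extraction config;
# swap in your own values.
from indico import IndicoClient, IndicoConfig

my_config = IndicoConfig(
    host="app.indico.io", api_token_path="./path/to/indico_api_token.txt"
)
client = IndicoClient(config=my_config)

extraction = pdf_extraction_call(
    "./path/to/brochure.pdf", client, {"preset_config": "standard"}
)
if extraction is not None:
    # With the "standard" preset, page-level text lives under "pages"
    print(extraction["pages"][0]["text"])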
def test_workflow_submission_auto_review(
    indico, force_complete, org_annotate_dataset, org_annotate_model_group
):
    client = IndicoClient()
    wfs = client.call(ListWorkflows(dataset_ids=[org_annotate_dataset.id]))
    wf = max(wfs, key=lambda w: w.id)
    wf = client.call(
        UpdateWorkflowSettings(wf, enable_review=True, enable_auto_review=True)
    )
    assert wf.review_enabled and wf.auto_review_enabled

    _file = str(Path(__file__).parents[1]) + "/data/org-sample.pdf"
    sub_ids = client.call(WorkflowSubmission(workflow_id=wf.id, files=[_file]))
    subs = client.call(WaitForSubmissions(sub_ids, timeout=120))
    sub = subs[0]
    assert sub.status == "PENDING_AUTO_REVIEW"

    raw_result = client.call(RetrieveStorageObject(sub.result_file))
    changes = raw_result["results"]["document"]["results"]
    for model, preds in changes.items():
        if isinstance(preds, dict):
            preds["accepted"] = True
        elif isinstance(preds, list):
            for pred in preds:
                pred["accepted"] = True

    job = client.call(
        SubmitReview(sub.id, changes=changes, force_complete=force_complete)
    )
    job = client.call(JobStatus(job.id))

    submission = client.call(GetSubmission(sub.id))
    assert submission.status == ("COMPLETE" if force_complete else "PENDING_REVIEW")
def test_csv_changelog(indico):
    client = IndicoClient()
    changelogs = client.call(
        GenerateChangelogReport(start_date=datetime.now(), end_date=datetime.now())
    )
    assert changelogs is not None

    job = changelogs.job_id
    assert job is not None
    job = client.call(JobStatus(id=job, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True

    result = client.call(RetrieveStorageObject(job.result))
    assert result is not None
def test_document_extraction_thumbnails(indico):
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"

    jobs = client.call(DocumentExtraction(files=[dataset_filepath]))
    assert len(jobs) == 1
    job = jobs[0]
    assert job.id is not None

    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    assert isinstance(job.result["url"], str)

    extract = client.call(RetrieveStorageObject(job.result))
    assert isinstance(extract, dict)
    assert "pages" in extract

    image = extract["pages"][0]["image"]
    image = client.call(RetrieveStorageObject(image))
    assert image
def pdf_extraction(pdf_filepaths, client, config):
    """ Get pdf extraction dictionary objects for a batch of files """
    pdf_extractions = []
    failed_files = []
    jobs = client.call(
        DocumentExtraction(files=pdf_filepaths, json_config=json.dumps(config))
    )
    for i, j in enumerate(jobs):
        try:
            job = client.call(JobStatus(id=j.id, wait=True))
            doc_extract = client.call(RetrieveStorageObject(job.result))
            pdf_extractions.append(doc_extract)
        except Exception:
            # Track files whose extraction failed so the caller can retry or log them
            failed_files.append(pdf_filepaths[i])
    return pdf_extractions, failed_files
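# A minimal usage sketch for pdf_extraction on a batch of files. The document paths
# and the "simple" preset are placeholders; substitute your own client setup and files.
from indico import IndicoClient, IndicoConfig

my_config = IndicoConfig(
    host="app.indico.io", api_token_path="./path/to/indico_api_token.txt"
)
client = IndicoClient(config=my_config)

extractions, failed_files = pdf_extraction(
    ["./docs/report_a.pdf", "./docs/report_b.pdf"],
    client,
    {"preset_config": "simple"},
)
print(f"{len(extractions)} extractions succeeded, {len(failed_files)} failed")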
def test_document_extraction_with_string_config(indico):
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"

    jobs = client.call(
        DocumentExtraction(
            files=[dataset_filepath], json_config='{"preset_config": "simple"}'
        )
    )
    assert len(jobs) == 1
    job = jobs[0]
    assert job.id is not None

    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    assert isinstance(job.result["url"], str)

    extract = client.call(RetrieveStorageObject(job.result))
    assert isinstance(extract, dict)
    assert "pages" in extract
def test_workflow_job(indico, airlines_dataset, airlines_model_group: ModelGroup):
    client = IndicoClient()
    wfs = client.call(ListWorkflows(dataset_ids=[airlines_dataset.id]))
    wf = max(wfs, key=lambda w: w.id)

    dataset_filepath = str(Path(__file__).parents[1]) + "/data/mock.pdf"
    jobs = client.call(
        WorkflowSubmission(
            workflow_id=wf.id, files=[dataset_filepath], submission=False
        )
    )
    job = jobs[0]
    assert job.id is not None

    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    assert isinstance(job.result["url"], str)

    result = client.call(RetrieveStorageObject(job.result))
    assert isinstance(result, dict)
def test_workflow_submission_versioned(
    indico, airlines_dataset, airlines_model_group: ModelGroup, _input
):
    client = IndicoClient()
    wfs = client.call(ListWorkflows(dataset_ids=[airlines_dataset.id]))
    wf = max(wfs, key=lambda w: w.id)

    submission_ids = client.call(
        WorkflowSubmission(workflow_id=wf.id, result_version="LATEST", **_input)
    )
    assert len(submission_ids) == len(next(iter(_input.values())))
    submission_id = submission_ids[0]
    assert submission_id is not None

    submissions = client.call(WaitForSubmissions(submission_id))
    result = client.call(RetrieveStorageObject(submissions[0].result_file))
    assert isinstance(result, dict)
    assert result["file_version"] == 2
    assert len(result["submission_results"]) == 1
    assert result["submission_results"][0]["input_filename"] == "mock.pdf"
block (or paragraph) level.
"""
from indico import IndicoClient, IndicoConfig
from indico.queries import DocumentExtraction, JobStatus, RetrieveStorageObject

# Get the OCR object
my_config = IndicoConfig(
    host="app.indico.io", api_token_path="./path/to/indico_api_token.txt"
)
client = IndicoClient(config=my_config)

files_to_extract = client.call(
    DocumentExtraction(
        files=["./test_paragraphs.pdf"], json_config={"preset_config": "standard"}
    )
)
extracted_file = client.call(JobStatus(id=files_to_extract[0].id, wait=True))
json_result = client.call(RetrieveStorageObject(extracted_file.result))

# The code below shows how to get the OCR text from the 'json_result' object.
# Note: it may vary slightly if you use DocumentExtraction configurations other than 'standard'

# Full Text
full_document_text = json_result["text"]

# Document Text split by page
text_by_page = list()
for page in json_result["pages"]:
    text_by_page.append(page["text"])

# Document Text split by block (or paragraph)
text_by_block = list()
for page in json_result["pages"]:
# Use your dataset's id to call its associated workflow
dataset_id = 6826

my_config = IndicoConfig(
    host="app.indico.io", api_token_path="./path/to/indico_api_token.txt"
)
client = IndicoClient(config=my_config)

# Return a list of workflows for this dataset id, or an empty list if there are none
workflows = client.call(ListWorkflows(dataset_ids=[dataset_id]))

if workflows:
    # Send a document through the workflow
    # Get back one Job per file
    jobs = client.call(
        WorkflowSubmission(
            workflow_id=workflows[0].id,
            files=["./path/to/sample.pdf"],
            submission=False,
        )
    )
    job = jobs[0]

    # Retrieve and print your result
    status = client.call(JobStatus(id=job.id, wait=True))
    wf_result = client.call(RetrieveStorageObject(status.result))
    print(wf_result)

else:
    print("You don't have any workflows for this dataset")
client = IndicoClient(config=my_config)
workflow_id = 5

"""
Example 1
Create a new submission
Generate a submission result as soon as the submission is done processing
Then mark the submission as having been retrieved
"""
submission_ids = client.call(
    WorkflowSubmission(workflow_id=workflow_id, files=["./path_to_doc.pdf"])
)
submission_id = submission_ids[0]

result_url = client.call(SubmissionResult(submission_id, wait=True))
result = client.call(RetrieveStorageObject(result_url.result))
print(result)
client.call(UpdateSubmission(submission_id, retrieved=True))

"""
Example 2
List all submissions that are COMPLETE or FAILED
Generate submission results for these
Delay gathering the results until required
"""
sub_filter = or_(
    SubmissionFilter(status="COMPLETE"), SubmissionFilter(status="FAILED")
)
submissions = client.call(ListSubmissions(filters=sub_filter))
result_files = {
    submission: client.call(GenerateSubmissionResult(submission))
for snapshot in client.paginate(
    GetUserSnapshots(date=datetime.now(), filters=filter_opts)
):
    snapshots.extend(snapshot)

print("Fetched just " + str(len(snapshots)) + " users for analysis")

"""
Example 4: Fetching UserChangeLogs via the API
Pull in a limited set of user change data using the GraphQL API
"""
# This is useful if you want only a limited selection of the changelogs
changelogs = []
for log in client.paginate(
    GetUserChangelog(start_date=datetime.now(), end_date=datetime.now(), limit=100)
):
    changelogs.extend(log)

print("Fetched " + str(len(changelogs)) + " changes for the day")

"""
Example 5: Fetching longer User Change Logs as CSV
Use GenerateChangelogReport to get a longer changelog as CSV (or JSON)
"""
# Set the start date and end date
start_date = datetime.today() - timedelta(days=7)
changelogs = client.call(
    GenerateChangelogReport(start_date=start_date, end_date=datetime.now())
)

# This generates a job which can be waited for
job_id = changelogs.job_id
job = client.call(JobStatus(id=job_id, wait=True))

# And the job will contain a storage object file with the full report.
result = client.call(RetrieveStorageObject(job.result))
print(result)