def test_is_file(self):
    """Test is_file with an empty file, a missing file and an existing file.

    Every call passes ``False`` as second positional argument (presumably
    ``raise_exception`` - verify against ``is_file``) so the outcome is
    checked through the return value.
    """
    empty_file = 'new_empty_file.txt'
    # 'w' truncates, so the file is guaranteed to be empty even if a stale
    # copy was left behind by an earlier, aborted run.
    with open(empty_file, 'w'):
        pass
    try:
        # Check the file that was actually created. The previous version
        # looked up 'new_empty_file' (without extension), which never
        # existed, so the empty-file case was not exercised at all.
        self.assertEqual(is_file(empty_file, False), False)
    finally:
        # Remove the scratch file even when the assertion above fails.
        os.remove(empty_file)
    self.assertEqual(is_file('not_existing_file.txt', False), False)
    self.assertEqual(is_file(TEST_PDF_FILE, False), True)
def get_templates(self, update=False):
    """
    Get ID and name of any Template in the project.

    The template data is read from ``templates.json5`` below ``self.data_root``
    when that file is usable and no update is requested; otherwise it is fetched
    with ``get_project_templates`` and written to that file.

    :param update: Update the downloaded information even it is already available
    :return: Templates in the project.
    """
    if not self.templates or update:
        self.templates_file_path = os.path.join(self.data_root, 'templates.json5')
        if not is_file(self.templates_file_path, raise_exception=False) or update:
            templates_data = get_project_templates(session=self.session)
            if templates_data:
                # Only a non-empty answer is written to the cache file.
                with open(self.templates_file_path, 'w') as f:
                    json.dump(templates_data, f, indent=2, sort_keys=True)
        else:
            with open(self.templates_file_path, 'r') as f:
                templates_data = json.load(f)

        # A falsy answer (e.g. None) means there is nothing to instantiate;
        # iterating it directly would raise TypeError.
        # NOTE(review): the instance is discarded, so template_class presumably
        # registers itself on the project - verify.
        for template_data in templates_data or []:
            self.template_class(project=self, **template_data)

        # Replace integer default_template references by the Template instance
        for template in self.templates:
            if isinstance(template.default_template, int):
                template.default_template = self.get_template_by_id(template.default_template)

    return self.templates
def test_project(self):
    """Verify basic Project properties and that a second Project object sees the same data."""
    project = self.prj
    assert is_file(project.meta_file_path)

    # The second document is expected to carry a larger ID than the first one.
    later_id = project.documents[1].id
    earlier_id = project.documents[0].id
    assert later_id > earlier_id

    assert len(project.documents)
    assert len(project.documents) == self.document_count

    # A newly initialized Project object should use the same data.
    fresh_project = Project()
    assert len(fresh_project.documents) == self.correct_document_count
    assert fresh_project.meta_file_path == project.meta_file_path
def clean_meta(self):
    """Clean the meta-information about the Project, Labels, and Templates.

    Deletes the meta, labels and templates files whose paths are recorded on
    the instance and resets the matching attributes (``meta_data``, ``labels``,
    ``templates`` and the three ``*_file_path`` attributes).

    :raises AssertionError: If a file is still reported by ``is_file`` after removal.
    """

    def _discard(cached_path):
        """Delete ``cached_path`` when it is set; a file that is already gone is fine."""
        if not cached_path:
            # No path recorded, so there is nothing to delete or to verify.
            return
        try:
            os.remove(cached_path)
        except FileNotFoundError:
            # Already removed: keep going so the in-memory state below is
            # still reset instead of aborting the cleanup half-way.
            return
        assert not is_file(cached_path, raise_exception=False)

    _discard(self.meta_file_path)
    self.meta_data = None
    self.meta_file_path = None

    _discard(self.labels_file_path)
    self.labels_file_path = None
    self.labels: List[Label] = []

    _discard(self.templates_file_path)
    self.templates_file_path = None
    self.templates: List[Template] = []
def upload_file_konfuzio_api(filepath: str, project_id: int, session=None, dataset_status: int = 0):
    """
    Upload file to Konfuzio API.

    :param filepath: Path to file to be uploaded
    :param project_id: Project ID where to upload the document
    :param session: Session to connect to the server; when omitted a new one is created
        with ``konfuzio_session()`` at call time
    :param dataset_status: Value sent as ``dataset_status`` form field together with the file
    :return: The response returned by ``session.post``.
    """
    # Create the default session per call. A ``session=konfuzio_session()``
    # default is evaluated once when the function is defined, i.e. at import
    # time, and the very same session object is then shared by every call.
    if session is None:
        session = konfuzio_session()

    url = get_upload_document_url()
    # Return value is ignored on purpose; presumably is_file raises when the
    # file is not usable - verify against is_file.
    is_file(filepath)

    with open(filepath, "rb") as f:
        file_data = f.read()

    files = {"data_file": (os.path.basename(filepath), file_data, "multipart/form-data")}
    data = {"project": project_id, "dataset_status": dataset_status}

    return session.post(url=url, files=files, data=data)
def get_images(self, update: bool = False):
    """
    Get document pages as png images.

    For every entry of ``self.pages`` the path in ``page['image']`` is used
    directly when ``is_file`` accepts it. Otherwise the image is stored as
    ``page_<number>.png`` below ``self.root`` and downloaded when that file is
    missing or ``update`` is set. The paths are collected in ``self.image_paths``.

    :param update: Update the downloaded images even they are already available
    :return: List with one image path per page (the same list as ``self.image_paths``).
    """
    session = konfuzio_session()

    self.image_paths = []
    for page in self.pages:
        if is_file(page['image'], raise_exception=False):
            self.image_paths.append(page['image'])
            continue

        page_path = os.path.join(self.root, f'page_{page["number"]}.png')
        self.image_paths.append(page_path)

        if not is_file(page_path, raise_exception=False) or update:
            url = f'{KONFUZIO_HOST}{page["image"]}'
            res = retry_get(session, url)
            with open(page_path, 'wb') as f:
                f.write(res.content)

    # The docstring always promised a return value; hand back the collected paths.
    return self.image_paths
def get_file(self, update: bool = False):
    """
    Get OCR version of the original file.

    The file is stored as ``ocr.pdf`` below ``self.root``. Nothing is downloaded
    when the document has errors, has no pages, or already has an OCR file path
    and no update is requested.

    :param update: Update the downloaded file even if it is already available
    :return: Path to OCR file (whatever ``self.ocr_file_path`` holds afterwards).
    """
    wants_file = self.is_without_errors and (not self.ocr_file_path or update)
    # The former per-page loop never used its index: it fetched the very same
    # PDF once per page whenever update was set. One pass is enough; documents
    # without pages are still skipped, as before.
    if wants_file and self.number_of_pages > 0:
        self.ocr_file_path = os.path.join(self.root, 'ocr.pdf')
        if not is_file(self.ocr_file_path, raise_exception=False) or update:
            pdf_content = download_file_konfuzio_api(self.id, session=self.session)
            with open(self.ocr_file_path, 'wb') as f:
                f.write(pdf_content)

    return self.ocr_file_path
def get_meta(self, update=False):
    """
    Get the list of all documents in the project and their information.

    The data is read from ``meta.json5`` below ``self.data_root`` when that file
    is usable and no update is requested; otherwise it is fetched with
    ``get_meta_of_files`` and written to that file.

    :param update: Update the downloaded information even it is already available
    :return: Information of the documents in the project.
    """
    if not self.meta_data or update:
        self.meta_file_path = os.path.join(self.data_root, 'meta.json5')

        # ``update`` has to bypass the cached file as well; otherwise an update
        # request would just re-read the stale file from disk.
        if not is_file(self.meta_file_path, raise_exception=False) or update:
            self.meta_data = get_meta_of_files(self.session)
            with open(self.meta_file_path, 'w') as f:
                json.dump(self.meta_data, f, indent=2, sort_keys=True)
        else:
            with open(self.meta_file_path, 'r') as f:
                self.meta_data = json.load(f)

    return self.meta_data
def get_labels(self, update=False):
    """
    Get ID and name of any label in the project.

    Label data comes from ``labels.json5`` below ``self.data_root`` when that
    file is usable and no update is requested; otherwise it is fetched with
    ``get_project_labels`` and written to that file.

    :param update: Update the downloaded information even it is already available
    :return: Labels in the project.
    """
    # Labels are already present and no refresh was requested.
    if self.labels and not update:
        return self.labels

    self.labels_file_path = os.path.join(self.data_root, 'labels.json5')
    cache_usable = is_file(self.labels_file_path, raise_exception=False)

    if cache_usable and not update:
        with open(self.labels_file_path, 'r') as cache_file:
            labels_data = json.load(cache_file)
    else:
        labels_data = get_project_labels(session=self.session)
        with open(self.labels_file_path, 'w') as cache_file:
            json.dump(labels_data, cache_file, indent=2, sort_keys=True)

    for entry in labels_data:
        # Drop the project from the entry; the existing project reference is passed instead.
        entry.pop('project', None)
        self.label_class(project=self, **entry)

    return self.labels
def test_update_prj(self):
    """Test number of documents and presence of the meta file after updating a project."""
    assert len(self.prj.documents) == self.document_count
    self.prj.update()
    assert len(self.prj.documents) == self.correct_document_count
    # Assert the result instead of discarding it, so the check does not depend
    # on is_file raising for a missing file.
    assert is_file(self.prj.meta_file_path)
def get_document_details(self, update):
    """
    Get data from a document.

    Loads text, annotations, sections, pages and (when present) hOCR of the
    document and adds the annotations to it. The data is fetched through
    ``get_document_details(document_id=..., session=...)`` and written to files
    below ``self.root`` when ``update`` is truthy or one of the required files
    is not usable; otherwise it is read back from those files.

    :param update: Update the downloaded information even it is already available
    :return: The document itself (``self``).
    """
    # Locations of the per-document files below self.root.
    self.annotation_file_path = os.path.join(self.root, 'annotations.json5')
    self.section_file_path = os.path.join(self.root, 'sections.json5')
    self.txt_file_path = os.path.join(self.root, 'document.txt')
    self.hocr_file_path = os.path.join(self.root, 'document.hocr')
    self.bbox_file_path = os.path.join(self.root, 'bbox.json5')

    # NOTE(review): self.pages_file_path is not assigned in this method; it is
    # presumably set elsewhere (e.g. the constructor) - verify.
    # The hOCR file is deliberately not part of this check, it is optional.
    if update or not (
        is_file(self.annotation_file_path, raise_exception=False)
        and is_file(self.section_file_path, raise_exception=False)
        and is_file(self.txt_file_path, raise_exception=False)
        and is_file(self.bbox_file_path, raise_exception=False)
        and is_file(self.pages_file_path, raise_exception=False)
    ):
        # NOTE(review): inside a class body this name presumably resolves to a
        # module-level helper with the same name as this method - verify.
        data = get_document_details(document_id=self.id, session=self.session)
        raw_annotations = data['annotations']
        self.number_of_pages = data['number_of_pages']
        self.text = data['text']
        # A falsy hocr value is normalized to an empty string.
        self.hocr = data['hocr'] or ''
        self.pages = data['pages']
        self._sections = data['sections']

        # write a file, even there are no annotations to support offline work
        with open(self.annotation_file_path, 'w') as f:
            json.dump(raw_annotations, f, indent=2, sort_keys=True)

        with open(self.section_file_path, 'w') as f:
            json.dump(data['sections'], f, indent=2, sort_keys=True)

        with open(self.txt_file_path, 'w', encoding="utf-8") as f:
            f.write(data['text'])

        with open(self.bbox_file_path, 'w') as f:
            json.dump(data['bbox'], f, indent=2, sort_keys=True)

        with open(self.pages_file_path, 'w') as f:
            json.dump(data['pages'], f, indent=2, sort_keys=True)

        # The hOCR file is only written when there is hOCR content.
        if self.hocr != '':
            with open(self.hocr_file_path, 'w', encoding="utf-8") as f:
                f.write(data['hocr'])

    else:
        # All required files are usable: restore the state from disk.
        # NOTE(review): number_of_pages and the bbox data are not restored in
        # this branch.
        with open(self.txt_file_path, 'r', encoding="utf-8") as f:
            self.text = f.read()

        with open(self.annotation_file_path, 'rb') as f:
            raw_annotations = json.loads(f.read())

        with open(self.section_file_path, 'rb') as f:
            self._sections = json.loads(f.read())

        with open(self.pages_file_path, 'rb') as f:
            self.pages = json.loads(f.read())

        if is_file(self.hocr_file_path, raise_exception=False):
            # hocr might not be available (depends on the project settings)
            with open(self.hocr_file_path, 'r', encoding="utf-8") as f:
                self.hocr = f.read()

    # add Annotations to the document and project
    if hasattr(self, 'project') and self.project:
        for raw_annotation in raw_annotations:
            if not raw_annotation['custom_offset_string']:
                annotation = self.annotation_class(document=self, **raw_annotation)
                self.add_annotation(annotation)
            else:
                # Annotations flagged as custom offset string are only kept when
                # the text between their offsets equals their offset_string,
                # ignoring spaces.
                real_string = self.text[raw_annotation['start_offset']: raw_annotation['end_offset']]
                if real_string.replace(
                        ' ', '') == raw_annotation['offset_string'].replace(
                        ' ', ''):
                    annotation = self.annotation_class(document=self, **raw_annotation)
                    self.add_annotation(annotation)
                else:
                    logger.warning(
                        f'Annotation {raw_annotation["id"]} is a custom string and, therefore, it will not be used '
                        f'in training {KONFUZIO_HOST}/a/{raw_annotation["id"]}.'
                    )

    return self