def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
    """
    Extract the files embedded into a docx document.

    Archive entries under ``word/media/`` and ``word/embeddings/`` are inspected:
    ``.emf`` entries are skipped, ``.bin`` entries are opened as OLE containers
    (a PDF is taken from the ``CONTENTS`` stream, any other file from the
    ``Ole10Native`` stream), every other entry is taken as is.
    Diagrams returned by ``__extract_diagrams`` are appended with
    ``need_content_analysis=False``.

    :param tmpdir: directory where file is located
    :param filename: Name of the file from which you should extract attachments
    :param parameters: dict with different parameters for extracting (not read by this method)
    :return: list of AttachedFile produced by ``_content2attach_file``; an empty list if the
        file is not a ``.docx`` or if extraction fails (the error is printed, not raised)
    """
    _, ext = splitext_(filename)
    if ext != '.docx':
        return []

    result = []
    with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
        files = zfile.namelist()
        attachments = [file for file in files if file.startswith("word/media/")]
        attachments += [file for file in files if file.startswith("word/embeddings/")]

        try:
            for attachment in attachments:
                original_name = os.path.basename(attachment)

                if not original_name.endswith(('.emf', '.bin')):
                    result.append((original_name, zfile.read(attachment)))
                elif original_name.endswith('.bin'):
                    ole = olefile.OleFileIO(zfile.read(attachment))
                    try:
                        # extracting PDF-files
                        if ole.exists("CONTENTS"):
                            data = ole.openstream('CONTENTS').read()
                            if data[0:5] == b'%PDF-':
                                result.append((os.path.splitext(original_name)[0] + '.pdf', data))
                        # extracting files in other formats
                        elif ole.exists("\x01Ole10Native"):
                            data = ole.openstream("\x01Ole10Native").read()
                            original_name, contents = self.__parse_ole_contents(data)
                            result.append((original_name, contents))
                    finally:
                        # release the OLE container even when a stream cannot be read
                        ole.close()

            attached_files = self._content2attach_file(content=result, tmpdir=tmpdir)
            diagram_attachments = self.__extract_diagrams(zfile)
            attached_files += self._content2attach_file(content=diagram_attachments,
                                                        tmpdir=tmpdir,
                                                        need_content_analysis=False)
            assert len(attached_files) == 0 or isinstance(attached_files[0], AttachedFile)
            return attached_files
        except Exception as error:
            # best effort: a broken attachment must not fail the whole document
            print(error)
    return []
def parse_file(self, tmp_dir: str, filename: str, parameters: Dict[str, str]) -> [UnstructuredDocument, bool]:
    """
    Read the file with the first reader from ``self.readers`` that accepts it.

    :param tmp_dir: directory where the file is located
    :param filename: name of the file to read
    :param parameters: reading parameters; the "document_type" key is passed to the readers
    :return: pair (document produced by the reader, flag returned by the reader as the second value)
    :raises BadFileFormatException: if no reader accepts the file
    """
    _, extension = splitext_(filename)
    file_path = os.path.join(tmp_dir, filename)
    mime = get_file_mime_type(file_path)
    document_type = parameters.get("document_type")

    for reader in self.readers:
        accepted = reader.can_read(path=file_path, mime=mime, extension=extension, document_type=document_type)
        if not accepted:
            continue

        read_result = reader.read(path=file_path, document_type=document_type, parameters=parameters)
        unstructured_document, need_analyze_attachments = read_result
        assert len(unstructured_document.lines) == 0 or isinstance(unstructured_document.lines[0], LineWithMeta)
        assert isinstance(unstructured_document, UnstructuredDocument)  # TODO remove
        return unstructured_document, need_analyze_attachments

    # none of the readers accepted the file
    internal_message = "no one can read file: name = {}, extension = {}, mime = {}, document type = {}".format(
        filename, extension, mime, document_type)
    api_message = "Unsupported file format {} of the input file {}".format(mime, filename)
    raise BadFileFormatException(msg=internal_message, msg_api=api_message)
def do_converting(self, tmp_dir: str, filename: str) -> str:
    """
    Convert the file with the first suitable converter and make the resulting file read-only.

    :param tmp_dir: directory where the file is located
    :param filename: name of the file to convert
    :return: name returned by the converter, or the original filename if no converter accepted the file
    """
    name, extension = splitext_(filename)
    mime = get_file_mime_type(os.path.join(tmp_dir, filename))

    # first converter (in list order) that declares it can handle the file
    suitable = next(
        (converter for converter in self.converters if converter.can_convert(extension=extension, mime=mime)),
        None,
    )
    result_name = filename
    if suitable is not None:
        result_name = suitable.do_convert(tmp_dir, name, extension)

    # read permission only for owner, group and others
    read_only = S_IREAD | S_IRGRP | S_IROTH
    os.chmod(os.path.join(tmp_dir, result_name), read_only)
    return result_name
def can_extract(self, mime: str, filename: str) -> bool:
    """
    Tell whether this extractor is able to process the given file.

    :param mime: mime type of the file.
    :param filename: name of the file with extension.
    :return: True only if the mime is among the docx-like mimes and the extension is '.docx'
    """
    if mime not in recognized_mimes.docx_like_format:
        return False
    _, extension = splitext_(filename)
    return extension == '.docx'
def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[List[Union[str, bytes]]]:
    """
    Extract the files embedded into a docx document.

    Archive entries under ``word/media/`` and ``word/embeddings/`` are inspected:
    ``.emf`` entries are skipped, ``.bin`` entries are opened as OLE containers
    (a PDF is taken from the ``CONTENTS`` stream, any other file from the
    ``Ole10Native`` stream), every other entry is taken as is.

    :param tmpdir: directory where file is located
    :param filename: Name of the file from which you should extract attachments
    :param parameters: dict with different parameters for extracting (not read by this method)
    :return: list of lists (name of original file and binary file content); if extraction fails
        the error is printed and the attachments collected so far are returned
    """
    result = []
    _, ext = splitext_(filename)
    if ext != '.docx':
        return result

    with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
        files = zfile.namelist()
        attachments = [file for file in files if file.startswith("word/media/")]
        attachments += [file for file in files if file.startswith("word/embeddings/")]

        try:
            for attachment in attachments:
                namefile = os.path.basename(attachment)

                if not namefile.endswith(('.emf', '.bin')):
                    result.append([namefile, zfile.read(attachment)])
                elif namefile.endswith('.bin'):
                    ole = olefile.OleFileIO(zfile.read(attachment))
                    try:
                        # extracting PDF-files
                        if ole.exists("CONTENTS"):
                            data = ole.openstream('CONTENTS').read()
                            if data[0:5] == b'%PDF-':
                                result.append([os.path.splitext(namefile)[0] + '.pdf', data])
                        # extracting files in other formats
                        elif ole.exists("\x01Ole10Native"):
                            data = ole.openstream("\x01Ole10Native").read()
                            namefile, contents = self.__parse_ole_contents(data)
                            result.append([namefile, contents])
                    finally:
                        # release the OLE container even when a stream cannot be read
                        ole.close()
        except Exception as error:
            # best effort: keep what was extracted before the failure
            print(error)
    return result
def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[List[Union[str, bytes]]]:
    """
    Extract the media files stored inside an xlsx document.

    :param tmpdir: directory where file is located
    :param filename: name of the file from which attachments are extracted
    :param parameters: dict with different parameters for extracting (not read by this method)
    :return: list of [file name, binary content] for every archive entry under ``xl/media/``;
        an empty list if the file is not a ``.xlsx``
    """
    attachments = []
    _, ext = splitext_(filename)
    if ext != '.xlsx':
        return attachments

    with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
        # Look at every entry: the order of entries in the archive is not guaranteed,
        # so the first one must not be discarded, and an empty archive must not fail.
        for entry in zfile.namelist():
            if entry.startswith("xl/media/"):
                attachments.append([os.path.basename(entry), zfile.read(entry)])
    return attachments
def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
    """
    Extract the media files stored inside an xlsx document.

    :param tmpdir: directory where file is located
    :param filename: name of the file from which attachments are extracted
    :param parameters: dict with different parameters for extracting (not read by this method)
    :return: result of ``_content2attach_file`` applied to the (file name, binary content) pairs
        of every archive entry under ``xl/media/``; the pairs list is empty for a non-``.xlsx`` file
    """
    attachments = []
    _, ext = splitext_(filename)
    if ext == '.xlsx':
        with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
            # Look at every entry: the order of entries in the archive is not guaranteed,
            # so the first one must not be discarded, and an empty archive must not fail.
            for entry in zfile.namelist():
                if entry.startswith("xl/media/"):
                    attachments.append((os.path.basename(entry), zfile.read(entry)))
    return self._content2attach_file(content=attachments, tmpdir=tmpdir)
def do_converting(self, tmp_dir: str, filename: str, parameters: Optional[dict] = None) -> str:
    """
    Convert the file with the first suitable converter and make the resulting file read-only.

    Converters whose ``can_convert`` has no ``parameters`` argument are still supported,
    but a warning is emitted for each of them.

    :param tmp_dir: directory where the file is located
    :param filename: name of the file to convert
    :param parameters: parameters passed to ``can_convert`` of the converters that accept them
    :return: name returned by the converter, or the original filename if no converter accepted the file
    """
    name, extension = splitext_(filename)
    mime = get_file_mime_type(os.path.join(tmp_dir, filename))
    result_name = filename

    for converter in self.converters:
        check_kwargs = dict(extension=extension, mime=mime)
        accepts_parameters = "parameters" in inspect.getfullargspec(converter.can_convert).args
        if accepts_parameters:
            check_kwargs["parameters"] = parameters
        else:
            # old-style converter: call it without parameters, but ask for an update
            warnings.warn("!WARNING! you converter requires an update\n" +
                          "Please specify parameters argument in method can_convert in {}\n".format(
                              type(converter).__name__) +
                          " This parameters would be mandatory in the near future")

        if not converter.can_convert(**check_kwargs):
            continue
        result_name = converter.do_convert(tmp_dir, name, extension)
        break

    # read permission only for owner, group and others
    read_only = S_IREAD | S_IRGRP | S_IROTH
    os.chmod(os.path.join(tmp_dir, result_name), read_only)
    return result_name
def parse_file(self, tmp_dir: str, filename: str, parameters: Dict[str, str]) -> UnstructuredDocument:
    """
    Read the file with the first reader from ``self.readers`` that accepts it.

    Readers whose ``can_read`` has no ``parameters`` argument are still supported,
    but a warning is emitted for each of them.

    :param tmp_dir: directory where the file is located
    :param filename: name of the file to read
    :param parameters: reading parameters; the "document_type" key is passed to the readers
    :return: document produced by the reader
    :raises BadFileFormatException: if no reader accepts the file
    """
    _, extension = splitext_(filename)
    file_path = os.path.join(tmp_dir, filename)
    mime = get_file_mime_type(file_path)
    document_type = parameters.get("document_type")

    for reader in self.readers:
        check_kwargs = dict(path=file_path, mime=mime, extension=extension, document_type=document_type)
        accepts_parameters = "parameters" in inspect.getfullargspec(reader.can_read).args
        if accepts_parameters:
            check_kwargs["parameters"] = parameters
        else:
            # old-style reader: call it without parameters, but ask for an update
            warnings.warn("!WARNING! you reader requires an update\n" +
                          "Please specify parameters argument in method can_read in {}\n".format(reader) +
                          " This parameters would be mandatory in the near future")

        if not reader.can_read(**check_kwargs):
            continue

        unstructured_document = reader.read(path=file_path, document_type=document_type, parameters=parameters)
        assert len(unstructured_document.lines) == 0 or isinstance(unstructured_document.lines[0], LineWithMeta)
        assert isinstance(unstructured_document, UnstructuredDocument)  # TODO remove
        return unstructured_document

    # none of the readers accepted the file
    internal_message = "no one can read file: name = {}, extension = {}, mime = {}, document type = {}".format(
        filename, extension, mime, document_type)
    api_message = "Unsupported file format {} of the input file {}".format(mime, filename)
    raise BadFileFormatException(msg=internal_message, msg_api=api_message)
def can_extract(self, mime: str, filename: str) -> bool:
    """
    Tell whether this extractor is able to process the given file.

    :param mime: mime type of the file.
    :param filename: name of the file with extension.
    :return: True only if the mime is among the excel-like mimes and the extension is '.xlsx'
    """
    if mime not in recognized_mimes.excel_like_format:
        return False
    _, extension = splitext_(filename)
    return extension == '.xlsx'