def test_clean_header_footer():
    """Check that header and footer text is absent after preprocessing with ``clean_header_footer=True``."""
    # The sample file contains a header and a footer.
    sample_path = Path("samples/pdf/sample_pdf_2.pdf")
    raw_document = PDFToTextConverter().convert(file_path=sample_path)

    # No splitting is requested (split_by=None); exactly one document is expected back.
    cleaner = PreProcessor(clean_header_footer=True, split_by=None)
    documents = cleaner.process(raw_document)

    assert len(documents) == 1
    assert "This is a header." not in documents[0]["text"]
    assert "footer" not in documents[0]["text"]
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
    split_by: Optional[str] = Form(SPLIT_BY),
    split_length: Optional[int] = Form(SPLIT_LENGTH),
    split_overlap: Optional[int] = Form(SPLIT_OVERLAP),
    split_respect_sentence_boundary: Optional[bool] = Form(
        SPLIT_RESPECT_SENTENCE_BOUNDARY),
):
    """
    Save an uploaded .pdf/.txt file, convert it to text, preprocess it and write the result
    to the document store.

    The file extension is validated before anything is written to disk, so rejected uploads
    do not leave files behind in FILE_UPLOAD_PATH.

    :param file: the uploaded file
    :param remove_numeric_tables: forwarded to the PDF/text converter
    :param remove_whitespace: forwarded to the PreProcessor as ``clean_whitespace``
    :param remove_empty_lines: forwarded to the PreProcessor as ``clean_empty_lines``
    :param remove_header_footer: forwarded to the PreProcessor as ``clean_header_footer``
    :param valid_languages: forwarded to the PDF/text converter
    :param split_by: forwarded to the PreProcessor
    :param split_length: forwarded to the PreProcessor
    :param split_overlap: forwarded to the PreProcessor
    :param split_respect_sentence_boundary: forwarded to the PreProcessor
    :return: the string "File upload was successful."
    :raises HTTPException: with status 415 if the file is neither .pdf nor .txt
    """
    try:
        # Extension = text after the last dot, compared case-insensitively.
        extension = file.filename.split(".")[-1].lower()
        if extension not in ("pdf", "txt"):
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # Only the final path component of the client-supplied name is used on disk, so a
        # name containing directory separators cannot point outside FILE_UPLOAD_PATH.
        stored_name = f"{uuid.uuid4().hex}_{Path(file.filename).name}"
        file_path = Path(FILE_UPLOAD_PATH) / stored_name
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if extension == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                valid_languages=valid_languages)
            document = pdf_converter.convert(file_path)
        else:
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                valid_languages=valid_languages,
            )
            document = txt_converter.convert(file_path)

        # The document keeps the original upload name, not the uuid-prefixed one.
        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}

        preprocessor = PreProcessor(
            clean_whitespace=remove_whitespace,
            clean_header_footer=remove_header_footer,
            clean_empty_lines=remove_empty_lines,
            split_by=split_by,
            split_length=split_length,
            split_overlap=split_overlap,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
        )
        documents = preprocessor.process(document)

        document_store.write_documents(documents)
        return "File upload was successful."
    finally:
        # Close the upload stream whether or not processing succeeded.
        file.file.close()
def upload_file(
        model_id: str = Form(...),
        file: UploadFile = File(...),
        remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES)):
    """
    Save an uploaded .pdf/.txt file, convert and preprocess it, and write the resulting
    documents to the document store of the model identified by ``model_id``.

    The file extension is validated before anything is written to disk, so rejected uploads
    do not leave files behind in FILE_UPLOAD_PATH.

    :param model_id: key into MODELS selecting the target model
    :param file: the uploaded file
    :param remove_numeric_tables: forwarded to the PDF/text converter
    :return: the preprocessed documents, each with a ``name`` field set to the upload's filename
    :raises HTTPException: with status 400 if ``model_id`` is not in MODELS, or 415 if the
        file is neither .pdf nor .txt
    """
    print("uploading file")
    if model_id not in MODELS:
        raise HTTPException(status_code=400, detail="Invalid model id")
    try:
        # Extension = text after the last dot, compared case-insensitively.
        extension = file.filename.split(".")[-1].lower()
        if extension not in ("pdf", "txt"):
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # Only the final path component of the client-supplied name is used on disk, so a
        # name containing directory separators cannot point outside FILE_UPLOAD_PATH.
        stored_name = f"{uuid.uuid4().hex}_{Path(file.filename).name}"
        file_path = Path(FILE_UPLOAD_PATH) / stored_name
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if extension == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
            )
            document = pdf_converter.convert(file_path)
        else:
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables)
            document = txt_converter.convert(file_path)

        processor = PreProcessor(clean_empty_lines=True,
                                 clean_whitespace=True,
                                 clean_header_footer=True,
                                 split_by="word",
                                 split_length=200,
                                 split_respect_sentence_boundary=True)
        docs = processor.process(document)

        # Add name field to documents
        for doc in docs:
            doc['name'] = file.filename

        doc_store = MODELS[model_id].finder.retriever.document_store
        doc_store.write_documents(docs)
        return docs
    finally:
        # Close the upload stream whether or not processing succeeded.
        file.file.close()
def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> List[dict]:
    """
    Convert all files (.txt, .pdf) under the given path, including its sub-directories, to Python dicts
    that can be written to a Document Store.

    :param dir_path: path for the documents to be written to the DocumentStore
    :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output: str)
    :param split_paragraphs: split text in paragraphs (on blank lines); empty paragraphs are skipped.
    :return: list of dicts of the form ``{"text": <str>, "meta": {"name": <file name>}}``
    :raises Exception: if a file with a suffix other than .txt or .pdf is encountered
    """
    # "**/*" also yields the directories themselves. Keep regular files only, otherwise every
    # sub-directory would be rejected below as an unsupported "file type".
    file_paths = [p for p in Path(dir_path).glob("**/*") if p.is_file()]

    # Build the PDF converter only when there is at least one PDF to convert.
    if any(p.suffix.lower() == ".pdf" for p in file_paths):
        pdf_converter = PDFToTextConverter()  # type: Optional[PDFToTextConverter]
    else:
        pdf_converter = None

    documents = []
    for path in file_paths:
        suffix = path.suffix.lower()
        if suffix == ".txt":
            with open(path) as doc:
                text = doc.read()
        elif suffix == ".pdf" and pdf_converter:
            text = pdf_converter.convert(path)["text"]
        else:
            raise Exception(
                f"Indexing of {path.suffix} files is not currently supported.")

        if clean_func:
            text = clean_func(text)

        if split_paragraphs:
            documents.extend(
                {"text": para, "meta": {"name": path.name}}
                for para in text.split("\n\n")
                if para.strip()  # skip empty paragraphs
            )
        else:
            documents.append({"text": text, "meta": {"name": path.name}})

    return documents
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
):
    """
    Save an uploaded .pdf/.txt file, convert it to text and write it to the document store
    as a single document.

    The file extension is validated before anything is written to disk, so rejected uploads
    do not leave files behind in FILE_UPLOAD_PATH.

    :param file: the uploaded file
    :param remove_numeric_tables: forwarded to the PDF/text converter
    :param remove_whitespace: forwarded to the PDF/text converter
    :param remove_empty_lines: forwarded to the PDF/text converter
    :param remove_header_footer: forwarded to the PDF/text converter
    :param valid_languages: forwarded to the PDF/text converter
    :return: the string "File upload was successful."
    :raises HTTPException: with status 415 if the file is neither .pdf nor .txt
    """
    try:
        # Extension = text after the last dot, compared case-insensitively.
        extension = file.filename.split(".")[-1].lower()
        if extension not in ("pdf", "txt"):
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # Only the final path component of the client-supplied name is used on disk, so a
        # name containing directory separators cannot point outside FILE_UPLOAD_PATH.
        stored_name = f"{uuid.uuid4().hex}_{Path(file.filename).name}"
        file_path = Path(FILE_UPLOAD_PATH) / stored_name
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if extension == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
            document = pdf_converter.convert(file_path)
        else:
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                remove_whitespace=remove_whitespace,
                remove_empty_lines=remove_empty_lines,
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
            document = txt_converter.convert(file_path)

        # The document keeps the original upload name, not the uuid-prefixed one.
        document_to_write = {
            TEXT_FIELD_NAME: document["text"],
            "name": file.filename
        }
        document_store.write_documents([document_to_write])
        return "File upload was successful."
    finally:
        # Close the upload stream whether or not processing succeeded.
        file.file.close()
def file_upload(file):
    """
    Write an uploaded .pdf/.txt file to a temporary path, convert and preprocess it, and
    write the resulting documents to the document store.

    The temporary file is removed afterwards, whether or not processing succeeded, and the
    cleanup never replaces the original error with one of its own.

    :param file: uploaded file object; ``name``, ``filename`` and ``read()`` are used.
    :return: the module-level ``document_store`` after the documents were written
    :raises HTTPException: with status 415 if the file is neither .pdf nor .txt
    """
    # NOTE(review): the temp path is built from `file.name` while the extension check and the
    # stored document name use `file.filename` - confirm the upload object provides both.
    # Built outside the try block so the cleanup below can always refer to it.
    file_path = '/tmp/' + file.name + '_tmp'
    try:
        with open(file_path, "wb") as buffer:
            buffer.write(file.read())

        # Extension = text after the last dot, compared case-insensitively.
        extension = file.filename.split(".")[-1].lower()
        if extension == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=True, valid_languages=["en"]
            )
            document = pdf_converter.convert(file_path)
        elif extension == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=True, valid_languages=["en"],
            )
            document = txt_converter.convert(file_path)
        else:
            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
        preprocessor = PreProcessor(
            clean_whitespace=REMOVE_WHITESPACE,
            clean_header_footer=REMOVE_HEADER_FOOTER,
            clean_empty_lines=REMOVE_EMPTY_LINES,
            split_by=SPLIT_BY,
            split_length=SPLIT_LENGTH,
            split_respect_sentence_boundary=SPLIT_RESPECT_SENTENCE_BOUNDARY,
        )
        documents = preprocessor.process(document)

        # write the docs to the DB.
        document_store.write_documents(documents)
        return document_store
    finally:
        # The `with` block has already closed the buffer; only the temp file is left to clean
        # up. It may not exist if opening it failed, and that must not hide the real error.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass