def test_clean_header_footer():
    """Header and footer lines must be stripped when ``clean_header_footer`` is enabled.

    Converts the sample PDF, runs it through a ``PreProcessor`` configured with
    ``clean_header_footer=True`` and no splitting, and checks that exactly one
    document comes back whose text contains neither the header nor the footer.
    """
    pdf_path = Path("samples/pdf/sample_pdf_2.pdf")  # file contains header/footer
    document = PDFToTextConverter().convert(file_path=pdf_path)

    processed = PreProcessor(clean_header_footer=True, split_by=None).process(document)

    # split_by=None was requested, so a single document is expected.
    assert len(processed) == 1

    cleaned_text = processed[0]["text"]
    assert "This is a header." not in cleaned_text
    assert "footer" not in cleaned_text
Esempio n. 2
0
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
    split_by: Optional[str] = Form(SPLIT_BY),
    split_length: Optional[int] = Form(SPLIT_LENGTH),
    split_overlap: Optional[int] = Form(SPLIT_OVERLAP),
    split_respect_sentence_boundary: Optional[bool] = Form(
        SPLIT_RESPECT_SENTENCE_BOUNDARY),
):
    """
    Store an uploaded .pdf or .txt file, convert it to text, preprocess it and write the result to the
    document store.

    :param file: the uploaded file; its extension (text after the last ".") selects the converter
    :param remove_numeric_tables: forwarded to the converter as ``remove_numeric_tables``
    :param remove_whitespace: forwarded to the PreProcessor as ``clean_whitespace``
    :param remove_empty_lines: forwarded to the PreProcessor as ``clean_empty_lines``
    :param remove_header_footer: forwarded to the PreProcessor as ``clean_header_footer``
    :param valid_languages: forwarded to the converter as ``valid_languages``
    :param split_by: forwarded to the PreProcessor as ``split_by``
    :param split_length: forwarded to the PreProcessor as ``split_length``
    :param split_overlap: forwarded to the PreProcessor as ``split_overlap``
    :param split_respect_sentence_boundary: forwarded to the PreProcessor under the same name

    :return: a fixed success message
    :raises HTTPException: with status 415 when the extension is neither "pdf" nor "txt"
    """
    try:
        # Reject unsupported formats up front so that nothing is written to disk for them.
        extension = file.filename.split(".")[-1].lower()
        if extension not in ("pdf", "txt"):
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # The file name is supplied by the client: keep only its last path component so the
        # stored file always lands directly inside FILE_UPLOAD_PATH.
        safe_name = Path(file.filename).name
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if extension == "pdf":
            converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                valid_languages=valid_languages)
        else:
            converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                valid_languages=valid_languages,
            )
        document = converter.convert(file_path)

        # The stored document keeps the original (client-side) file name.
        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}

        preprocessor = PreProcessor(
            clean_whitespace=remove_whitespace,
            clean_header_footer=remove_header_footer,
            clean_empty_lines=remove_empty_lines,
            split_by=split_by,
            split_length=split_length,
            split_overlap=split_overlap,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
        )

        documents = preprocessor.process(document)
        document_store.write_documents(documents)
        return "File upload was successful."
    finally:
        # Always release the upload's underlying file object, on success and on error.
        file.file.close()
Esempio n. 3
0
def upload_file(
        model_id: str = Form(...),
        file: UploadFile = File(...),
        remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES)):
    """
    Store an uploaded .pdf or .txt file, convert and preprocess it, and write the resulting documents to
    the document store of the model identified by ``model_id``.

    :param model_id: key into ``MODELS``; selects whose document store receives the documents
    :param file: the uploaded file; its extension (text after the last ".") selects the converter
    :param remove_numeric_tables: forwarded to the converter as ``remove_numeric_tables``

    :return: the preprocessed documents, each with a ``name`` field set to the uploaded file name
    :raises HTTPException: with status 400 when ``model_id`` is not in ``MODELS``, with status 415 when the
        extension is neither "pdf" nor "txt"
    """

    print("uploading file")
    if model_id not in MODELS:
        raise HTTPException(status_code=400, detail="Invalid model id")

    try:
        # Reject unsupported formats up front so that nothing is written to disk for them.
        extension = file.filename.split(".")[-1].lower()
        if extension not in ("pdf", "txt"):
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # The file name is supplied by the client: keep only its last path component so the
        # stored file always lands directly inside FILE_UPLOAD_PATH.
        safe_name = Path(file.filename).name
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if extension == "pdf":
            converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables, )
        else:
            converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables)
        document = converter.convert(file_path)

        processor = PreProcessor(clean_empty_lines=True,
                                 clean_whitespace=True,
                                 clean_header_footer=True,
                                 split_by="word",
                                 split_length=200,
                                 split_respect_sentence_boundary=True)
        docs = processor.process(document)

        # Add name field to documents
        for doc in docs:
            doc['name'] = file.filename

        doc_store = MODELS[model_id].finder.retriever.document_store
        doc_store.write_documents(docs)

        return docs
    finally:
        # Always release the upload's underlying file object, on success and on error.
        file.file.close()
Esempio n. 4
0
def convert_files_to_dicts(dir_path: str,
                           clean_func: Optional[Callable] = None,
                           split_paragraphs: bool = False) -> List[dict]:
    """
    Convert all files (.txt, .pdf) found under the given path, including its sub-directories, to Python
    dicts that can be written to a Document Store.

    :param dir_path: directory that is searched recursively for the files to convert
    :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output: str)
    :param split_paragraphs: if True, split each text on blank lines and emit one dict per non-empty
        paragraph; otherwise emit one dict per file.

    :return: list of dicts of the form ``{"text": <text>, "meta": {"name": <file name>}}``
    :raises Exception: if a file with a suffix other than .txt or .pdf is encountered
    """

    # glob("**/*") yields directories as well; only regular files can be converted, so
    # sub-directories themselves must not reach the "unsupported suffix" branch below.
    file_paths = [p for p in Path(dir_path).glob("**/*") if p.is_file()]

    # Only instantiate the PDF converter when there is at least one PDF to convert.
    has_pdf = any(p.suffix.lower() == ".pdf" for p in file_paths)
    pdf_converter = PDFToTextConverter() if has_pdf else None  # type: Optional[PDFToTextConverter]

    documents = []
    for path in file_paths:
        suffix = path.suffix.lower()
        if suffix == ".txt":
            with open(path) as doc:
                text = doc.read()
        elif suffix == ".pdf" and pdf_converter:
            text = pdf_converter.convert(path)["text"]
        else:
            raise Exception(
                f"Indexing of {path.suffix} files is not currently supported.")

        if clean_func:
            text = clean_func(text)

        if split_paragraphs:
            # One document per non-blank paragraph; each gets its own meta dict.
            documents.extend(
                {"text": para, "meta": {"name": path.name}}
                for para in text.split("\n\n")
                if para.strip()
            )
        else:
            documents.append({"text": text, "meta": {"name": path.name}})

    return documents
Esempio n. 5
0
def upload_file_to_document_store(
        file: UploadFile = File(...),
        remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
        remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
        remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
        remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
        valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
):
    """
    Store an uploaded .pdf or .txt file, convert it to text and write it to the document store as a
    single document.

    All cleaning options are forwarded unchanged, under the same names, to the converter that matches
    the file extension.

    :param file: the uploaded file; its extension (text after the last ".") selects the converter
    :param remove_numeric_tables: forwarded to the converter
    :param remove_whitespace: forwarded to the converter
    :param remove_empty_lines: forwarded to the converter
    :param remove_header_footer: forwarded to the converter
    :param valid_languages: forwarded to the converter

    :return: a fixed success message
    :raises HTTPException: with status 415 when the extension is neither "pdf" nor "txt"
    """
    try:
        # Reject unsupported formats up front so that nothing is written to disk for them.
        extension = file.filename.split(".")[-1].lower()
        if extension not in ("pdf", "txt"):
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # The file name is supplied by the client: keep only its last path component so the
        # stored file always lands directly inside FILE_UPLOAD_PATH.
        safe_name = Path(file.filename).name
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Both converters take the same keyword arguments; only the class differs.
        converter_cls = PDFToTextConverter if extension == "pdf" else TextConverter
        converter = converter_cls(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )
        document = converter.convert(file_path)

        # The stored document keeps the original (client-side) file name.
        document_to_write = {
            TEXT_FIELD_NAME: document["text"],
            "name": file.filename
        }
        document_store.write_documents([document_to_write])
        return "File upload was successful."
    finally:
        # Always release the upload's underlying file object, on success and on error.
        file.file.close()
Esempio n. 6
0
def file_upload(file):
    """
    Convert an uploaded .pdf or .txt file to text, preprocess it and write it to the document store.

    The upload is spooled to a temporary file under /tmp for the converter and that file is removed
    again before returning, whether or not processing succeeded.

    :param file: upload object; this function reads ``file.name`` (temp file name), ``file.filename``
        (extension check and stored document name) and ``file.read()`` (content).
        NOTE(review): both ``name`` and ``filename`` are accessed - confirm the upload type provides both.

    :return: the module-level ``document_store`` the documents were written to
    :raises HTTPException: with status 415 when the extension is neither "pdf" nor "txt"
    """
    # Reject unsupported formats up front so that no temp file is created for them.
    extension = file.filename.split(".")[-1].lower()
    if extension not in ("pdf", "txt"):
        raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

    # The upload name is user-controlled: keep only its last path component so the temp file
    # always lives directly in /tmp. Computed outside the try so cleanup can always refer to it.
    file_path = '/tmp/' + os.path.basename(file.name) + '_tmp'
    try:
        with open(file_path, "wb") as buffer:
            buffer.write(file.read())

        if extension == "pdf":
            converter = PDFToTextConverter(
                remove_numeric_tables=True, valid_languages=["en"]
            )
        else:
            converter = TextConverter(
                remove_numeric_tables=True, valid_languages=["en"],
            )
        document = converter.convert(file_path)

        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}

        preprocessor = PreProcessor(
            clean_whitespace=REMOVE_WHITESPACE,
            clean_header_footer=REMOVE_HEADER_FOOTER,
            clean_empty_lines=REMOVE_EMPTY_LINES,
            split_by=SPLIT_BY,
            split_length=SPLIT_LENGTH,
            split_respect_sentence_boundary=SPLIT_RESPECT_SENTENCE_BOUNDARY,
        )

        documents = preprocessor.process(document)

        # write the docs to the DB.
        document_store.write_documents(documents)
        return document_store
    finally:
        # The with-block has already closed the buffer; only the temp file is left to clean up.
        # Guard the removal so a failed open() does not replace the real error with a new one.
        if os.path.exists(file_path):
            os.remove(file_path)