Ejemplo n.º 1
0
    def loadDocumentsFromFile(self, knowledgeFilePath):
        """Convert a text file into passages and index them in the document store.

        The file is read with a ``TextConverter``, cleaned and split into one
        document per passage by a ``PreProcessor``, tagged with metadata and
        written to ``self.document_store``. Finally the store's document count
        is printed.

        :param knowledgeFilePath: path of the text file to load. It is stored
            as every document's ``name`` and used as the prefix of its ID, so
            it must support ``+`` with a ``str``.
        """
        converter = TextConverter(
            remove_numeric_tables=False,
            valid_languages=["en"])
        processor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        loadedFile = converter.convert(knowledgeFilePath)
        documents = processor.process(loadedFile)
        for document in documents:
            docMetadata = document['meta']
            docMetadata['name'] = knowledgeFilePath
            # NOTE(review): the key is spelled 'doucmentID' (sic). It is kept
            # unchanged because readers of the stored metadata may rely on
            # this exact spelling -- confirm before renaming it.
            docMetadata['doucmentID'] = knowledgeFilePath \
                + str(docMetadata['_split_id'])

        self.document_store.write_documents(documents)

        print("Number of documents loaded", end=": ")
        print(self.document_store.get_document_count())
Ejemplo n.º 2
0
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
    split_by: Optional[str] = Form(SPLIT_BY),
    split_length: Optional[int] = Form(SPLIT_LENGTH),
    split_overlap: Optional[int] = Form(SPLIT_OVERLAP),
    split_respect_sentence_boundary: Optional[bool] = Form(
        SPLIT_RESPECT_SENTENCE_BOUNDARY),
):
    """
    Save an uploaded .pdf or .txt file, convert it to text, preprocess it and
    write the resulting documents to ``document_store``.

    :param file: the uploaded file.
    :param remove_numeric_tables: forwarded to the file converter.
    :param remove_whitespace: forwarded to the PreProcessor as ``clean_whitespace``.
    :param remove_empty_lines: forwarded to the PreProcessor as ``clean_empty_lines``.
    :param remove_header_footer: forwarded to the PreProcessor as ``clean_header_footer``.
    :param valid_languages: forwarded to the file converter.
    :param split_by: forwarded to the PreProcessor.
    :param split_length: forwarded to the PreProcessor.
    :param split_overlap: forwarded to the PreProcessor.
    :param split_respect_sentence_boundary: forwarded to the PreProcessor.

    :return: a success message.
    :raises HTTPException: status 415 when the file name does not end in
        ``pdf`` or ``txt`` (case-insensitive).
    """
    try:
        # Choose the converter first so that unsupported uploads are rejected
        # before anything is written to disk.
        extension = file.filename.split(".")[-1].lower()
        if extension == "pdf":
            converter_class = PDFToTextConverter
        elif extension == "txt":
            converter_class = TextConverter
        else:
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # `file.filename` is chosen by the client. Keep only its final path
        # component so a name such as "../../x.txt" cannot place the saved
        # copy outside FILE_UPLOAD_PATH.
        safe_name = Path(file.filename).name
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        converter = converter_class(
            remove_numeric_tables=remove_numeric_tables,
            valid_languages=valid_languages)
        document = converter.convert(file_path)

        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}

        preprocessor = PreProcessor(
            clean_whitespace=remove_whitespace,
            clean_header_footer=remove_header_footer,
            clean_empty_lines=remove_empty_lines,
            split_by=split_by,
            split_length=split_length,
            split_overlap=split_overlap,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
        )

        documents = preprocessor.process(document)
        document_store.write_documents(documents)
        return "File upload was successful."
    finally:
        # Always release the upload's underlying file object.
        file.file.close()
Ejemplo n.º 3
0
def upload_file(
        model_id: str = Form(...),
        file: UploadFile = File(...),
        remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES)):
    """
    Save an uploaded .pdf or .txt file, split it into documents and write them
    to the document store of the selected model.

    :param model_id: key into ``MODELS`` selecting the target model.
    :param file: the uploaded file.
    :param remove_numeric_tables: forwarded to the file converter.

    :return: the preprocessed documents, each with a ``name`` entry set to the
        uploaded file name.
    :raises HTTPException: status 400 when ``model_id`` is not in ``MODELS``;
        status 415 when the file name does not end in ``pdf`` or ``txt``
        (case-insensitive).
    """
    print("uploading file")
    if model_id not in MODELS:
        raise HTTPException(status_code=400, detail="Invalid model id")

    try:
        # Choose the converter first so that unsupported uploads are rejected
        # before anything is written to disk.
        extension = file.filename.split(".")[-1].lower()
        if extension == "pdf":
            converter_class = PDFToTextConverter
        elif extension == "txt":
            converter_class = TextConverter
        else:
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # `file.filename` is chosen by the client. Keep only its final path
        # component so a name such as "../../x.txt" cannot place the saved
        # copy outside FILE_UPLOAD_PATH.
        safe_name = Path(file.filename).name
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        converter = converter_class(
            remove_numeric_tables=remove_numeric_tables)
        document = converter.convert(file_path)

        processor = PreProcessor(clean_empty_lines=True,
                                 clean_whitespace=True,
                                 clean_header_footer=True,
                                 split_by="word",
                                 split_length=200,
                                 split_respect_sentence_boundary=True)
        docs = processor.process(document)

        # Add name field to documents
        for doc in docs:
            doc['name'] = file.filename

        doc_store = MODELS[model_id].finder.retriever.document_store
        doc_store.write_documents(docs)

        return docs
    finally:
        # Always release the upload's underlying file object.
        file.file.close()
Ejemplo n.º 4
0
def upload_file_to_document_store(
        file: UploadFile = File(...),
        remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
        remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
        remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
        remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
        valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
):
    """
    Save an uploaded .pdf or .txt file, convert it to text and write it to
    ``document_store`` as a single document.

    All cleaning options and ``valid_languages`` are forwarded unchanged to
    the file converter.

    :param file: the uploaded file.
    :param remove_numeric_tables: forwarded to the file converter.
    :param remove_whitespace: forwarded to the file converter.
    :param remove_empty_lines: forwarded to the file converter.
    :param remove_header_footer: forwarded to the file converter.
    :param valid_languages: forwarded to the file converter.

    :return: a success message.
    :raises HTTPException: status 415 when the file name does not end in
        ``pdf`` or ``txt`` (case-insensitive).
    """
    try:
        # Choose the converter first so that unsupported uploads are rejected
        # before anything is written to disk.
        extension = file.filename.split(".")[-1].lower()
        if extension == "pdf":
            converter_class = PDFToTextConverter
        elif extension == "txt":
            converter_class = TextConverter
        else:
            raise HTTPException(
                status_code=415,
                detail="Only .pdf and .txt file formats are supported.")

        # `file.filename` is chosen by the client. Keep only its final path
        # component so a name such as "../../x.txt" cannot place the saved
        # copy outside FILE_UPLOAD_PATH.
        safe_name = Path(file.filename).name
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{safe_name}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        converter = converter_class(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )
        document = converter.convert(file_path)

        document_to_write = {
            TEXT_FIELD_NAME: document["text"],
            "name": file.filename
        }
        document_store.write_documents([document_to_write])
        return "File upload was successful."
    finally:
        # Always release the upload's underlying file object.
        file.file.close()
Ejemplo n.º 5
0
def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> \
        List[dict]:
    """
    Recursively collect the .txt, .pdf and .docx files below ``dir_path`` and turn their text into Python dicts
    that can be written to a Document Store.

    Files with any other suffix are skipped with a warning. Suffix matching is case-insensitive.

    :param dir_path: directory that is searched recursively for files to convert
    :param clean_func: optional cleaning function applied to the text of each file (input: str, output: str)
    :param split_paragraphs: if True, emit one dict per non-blank paragraph (text separated by a blank line)
                             instead of one dict per file

    :return: list of dicts of the form ``{"text": <text>, "meta": {"name": <file name>}}``
    """

    supported = [".pdf", ".txt", ".docx"]

    # Group the supported paths by lower-cased suffix, preserving discovery order.
    paths_by_suffix: Dict[str, List[Path]] = {}
    for candidate in list(Path(dir_path).glob("**/*")):
        ext = candidate.suffix.lower()
        if ext in supported:
            paths_by_suffix.setdefault(ext, []).append(candidate)
            continue
        if candidate.is_dir():
            continue
        logger.warning('Skipped file {0} as type {1} is not supported here. '
                       'See haystack.file_converter for support of more file types'.format(candidate, ext))

    # Only instantiate converters for file types that were actually found.
    converters: Dict[str, BaseConverter] = {}
    for ext in paths_by_suffix:
        if ext == ".pdf":
            converters[ext] = PDFToTextConverter()
        elif ext == ".txt":
            converters[ext] = TextConverter()
        elif ext == ".docx":
            converters[ext] = DocxToTextConverter()

    results = []
    for ext, ext_paths in paths_by_suffix.items():
        converter = converters[ext]
        for source in ext_paths:
            logger.info('Converting {}'.format(source))
            text = converter.convert(file_path=source, meta=None)["text"]

            if clean_func:
                text = clean_func(text)

            if not split_paragraphs:
                results.append({"text": text, "meta": {"name": source.name}})
                continue

            # One entry per paragraph; whitespace-only paragraphs are dropped.
            results.extend(
                {"text": para, "meta": {"name": source.name}}
                for para in text.split("\n\n")
                if para.strip()
            )

    return results
Ejemplo n.º 6
0
def file_upload(file):
    """
    Copy an uploaded .pdf or .txt file to a temporary path, convert it to
    text, preprocess it and write the resulting documents to
    ``document_store``. The temporary copy is removed afterwards.

    :param file: uploaded file object; must provide ``name``, ``filename``
        and ``read()``.
    :return: the module-level ``document_store`` the documents were written to.
    :raises HTTPException: status 415 when ``file.filename`` does not end in
        ``pdf`` or ``txt`` (case-insensitive).
    """
    # NOTE(review): the temporary path is derived from `file.name` while the
    # type check and the stored name use `file.filename` -- confirm both
    # attributes exist on the objects passed in.
    # Only the base name is used so the temporary copy always lands directly
    # in /tmp, whatever directory components the supplied name carries.
    # Built outside the `try` so that `finally` can always refer to it.
    file_path = '/tmp/' + os.path.basename(file.name) + '_tmp'
    try:
        with open(file_path, "wb") as buffer:
            buffer.write(file.read())

        extension = file.filename.split(".")[-1].lower()
        if extension == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=True, valid_languages=["en"]
            )
            document = pdf_converter.convert(file_path)
        elif extension == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=True, valid_languages=["en"],
            )
            document = txt_converter.convert(file_path)
        else:
            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}

        preprocessor = PreProcessor(
            clean_whitespace=REMOVE_WHITESPACE,
            clean_header_footer=REMOVE_HEADER_FOOTER,
            clean_empty_lines=REMOVE_EMPTY_LINES,
            split_by=SPLIT_BY,
            split_length=SPLIT_LENGTH,
            split_respect_sentence_boundary=SPLIT_RESPECT_SENTENCE_BOUNDARY,
        )

        documents = preprocessor.process(document)

        # write the docs to the DB.
        document_store.write_documents(documents)
        return document_store
    finally:
        # The `with` block has already closed the buffer. Only remove the
        # temporary copy if it was actually created, so a failure while
        # opening it is not masked by a second error raised here.
        if os.path.exists(file_path):
            os.remove(file_path)
0
def tutorial8_preprocessing():
    """Walk through Haystack's file converters and the PreProcessor on a set of sample files."""
    # Download the sample files used throughout this tutorial.
    sample_dir = "data/preprocessing_tutorial"
    archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
    fetch_archive_from_http(url=archive_url, output_dir=sample_dir)

    # ## Converters
    #
    # Converter classes turn files on disk into documents that the Haystack
    # pipeline can process. There are converters for txt, pdf and docx files,
    # plus one that is powered by Apache Tika.

    # One converter per file type, each applied to a single sample file.
    txt_converter = TextConverter(remove_numeric_tables=True,
                                  valid_languages=["en"])
    txt_document = txt_converter.convert(
        file_path="data/preprocessing_tutorial/classics.txt", meta=None)

    pdf_converter = PDFToTextConverter(remove_numeric_tables=True,
                                       valid_languages=["en"])
    pdf_document = pdf_converter.convert(
        file_path="data/preprocessing_tutorial/bert.pdf", meta=None)

    docx_converter = DocxToTextConverter(remove_numeric_tables=True,
                                         valid_languages=["en"])
    docx_document = docx_converter.convert(
        file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)

    # Convenience helper: picks the right converter for every file in a directory.
    directory_documents = convert_files_to_dicts(
        dir_path="data/preprocessing_tutorial")

    # ## PreProcessor
    #
    # The PreProcessor cleans text and splits it into sensible units. How a
    # file is split can have a very significant impact on system performance.
    # See the Preprocessing
    # (https://haystack.deepset.ai/docs/latest/preprocessingmd) and
    # Optimization (https://haystack.deepset.ai/docs/latest/optimizationmd)
    # pages for details.

    # Default usage: clean consecutive whitespace and split one large
    # document into smaller ones of up to 1000 words each, never breaking in
    # the middle of a sentence.
    default_preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=False,
        split_by="word",
        split_length=1000,
        split_respect_sentence_boundary=True,
    )
    docs_default = default_preprocessor.process(txt_document)
    print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")

    # ## Cleaning
    #
    # - `clean_empty_lines` normalizes 3 or more consecutive empty lines to
    #   just two empty lines
    # - `clean_whitespace` removes whitespace at the beginning or end of each
    #   line of the text
    # - `clean_header_footer` removes long header or footer texts that are
    #   repeated on each page
    #
    # ## Splitting
    #
    # By default the PreProcessor respects sentence boundaries, so documents
    # do not start or end midway through a sentence. This reduces the chance
    # of an answer phrase being split between two documents. Turn it off with
    # `split_respect_sentence_boundary=False`.

    # Compare the end of the first document with and without the
    # sentence-boundary constraint.
    unbounded_preprocessor = PreProcessor(
        split_respect_sentence_boundary=False)
    docs_unbounded = unbounded_preprocessor.process(txt_document)

    print("RESPECTING SENTENCE BOUNDARY")
    tail_bounded = docs_default[0]["text"][-50:]
    print("End of document: \"..." + tail_bounded + "\"")
    print()
    print("NOT RESPECTING SENTENCE BOUNDARY")
    tail_unbounded = docs_unbounded[0]["text"][-50:]
    print("End of document: \"..." + tail_unbounded + "\"")

    # A common strategy for splitting long documents, especially in Question
    # Answering, is the sliding window. With `split_length=10` and
    # `split_overlap=3` the documents look like this:
    #
    # - doc1 = words[0:10]
    # - doc2 = words[7:17]
    # - doc3 = words[14:24]
    # - ...

    # Sliding window approach
    window_preprocessor = PreProcessor(
        split_overlap=3,
        split_length=10,
        split_respect_sentence_boundary=False,
    )
    window_docs = window_preprocessor.process(txt_document)

    first_excerpt = window_docs[0]["text"][:200]
    second_excerpt = window_docs[1]["text"][:100]
    third_excerpt = window_docs[2]["text"][:100]

    print("Document 1: \"" + first_excerpt + "...\"")
    print("Document 2: \"" + second_excerpt + "...\"")
    print("Document 3: \"" + third_excerpt + "...\"")
Ejemplo n.º 8
0
from haystack.file_converter.txt import TextConverter
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.docx import DocxToTextConverter

from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.preprocessor.preprocessor import PreProcessor

# Download and unpack the tutorial's sample archive into `doc_dir`; it is only
# used as reference data to compare our own input against.
doc_dir = "./data/preprocessing_tutorial"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Example usage of a file converter on one of the English sample files.
# NOTE: `converter` and `doc_txt` are rebound further down, so the result of
# this conversion is not kept.

converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(
    file_path="data/preprocessing_tutorial/classics.txt", meta=None)

# The PDF and DOCX converters are disabled here; they are not used below.
#converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
#doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)

#converter = DocxToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
#doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)

# Our actual input: a German text file, hence valid_languages=["de"].
# This replaces the `converter` and `doc_txt` bound above.
converter = TextConverter(remove_numeric_tables=True, valid_languages=["de"])
doc_txt = converter.convert(file_path="./data/geschichte_19._Jahrhundert.txt",
                            meta=None)
# TODO: The scraped input text is not correct yet, e.g. the "Code civil"
# passage is wrong (the text following it is missing).
Ejemplo n.º 9
0
# Download and unpack the sample files used by the examples below into `doc_dir`.

doc_dir = "data/preprocessing_tutorial"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
"""
## Converters

Haystack's converter classes are designed to help you turn files on your computer into the documents
that can be processed by the Haystack pipeline.
There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
"""

# Examples of the file converters: one converter per file type, each applied
# to a single sample file. `converter` is rebound for every file type, while
# each converted document is kept under its own name.

# Plain-text sample.
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(
    file_path="data/preprocessing_tutorial/classics.txt", meta=None)

# PDF sample.
converter = PDFToTextConverter(remove_numeric_tables=True,
                               valid_languages=["en"])
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf",
                            meta=None)

# DOCX sample.
converter = DocxToTextConverter(remove_numeric_tables=True,
                                valid_languages=["en"])
doc_docx = converter.convert(
    file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)

# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.