Example no. 1
    def loadDocumentsFromFile(self, knowledgeFilePath):
        converter = TextConverter(
            remove_numeric_tables=False,
            valid_languages=["en"])
        processor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        loadedFile = converter.convert(knowledgeFilePath)
        documents = processor.process(loadedFile)
        # Tag each split with its source file and a unique document ID
        for doc in documents:
            docMetadata = doc['meta']
            docMetadata['name'] = knowledgeFilePath
            docMetadata['documentID'] = knowledgeFilePath \
                + str(docMetadata['_split_id'])

        self.document_store.write_documents(documents)
        backagain = self.document_store.get_all_documents()

        # for i in range(0,len(backagain)):
        #     print(i)
        #     print(":\n")
        #     print(backagain[i])
        #     print("---------------")

        print("Number of documents loaded", end=": ")
        print(self.document_store.get_document_count())
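    # A minimal usage sketch, assuming this method lives on an indexing helper
    # class (here hypothetically named ``Indexer``) whose ``document_store``
    # attribute is any Haystack DocumentStore; neither name appears in the
    # snippet above:
    #
    #   indexer = Indexer()
    #   indexer.loadDocumentsFromFile("knowledge_base.txt")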
Example no. 2
def test_preprocess_word_split():
    document = {"text": TEXT}
    preprocessor = PreProcessor(split_length=10,
                                split_stride=0,
                                split_by="word",
                                split_respect_sentence_boundary=False)
    documents = preprocessor.process(document)
    assert len(documents) == 11

    preprocessor = PreProcessor(split_length=15,
                                split_stride=0,
                                split_by="word",
                                split_respect_sentence_boundary=True)
    documents = preprocessor.process(document)
    for i, doc in enumerate(documents):
        if i == 0:
            assert len(doc["text"].split(" ")) == 14
        assert len(doc["text"].split(" ")) <= 15 or doc["text"].startswith(
            "This is to trick")
    assert len(documents) == 8

    preprocessor = PreProcessor(split_length=40,
                                split_stride=10,
                                split_by="word",
                                split_respect_sentence_boundary=True)
    documents = preprocessor.process(document)
    assert len(documents) == 5
Example no. 3
def test_clean_header_footer():
    converter = PDFToTextConverter()
    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header/footer

    preprocessor = PreProcessor(clean_header_footer=True, split_by=None)
    documents = preprocessor.process(document)

    assert len(documents) == 1

    assert "This is a header." not in documents[0]["text"]
    assert "footer" not in documents[0]["text"]
Example no. 4
def upload_file_to_document_store(
    file: UploadFile = File(...),
    remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES),
    remove_whitespace: Optional[bool] = Form(REMOVE_WHITESPACE),
    remove_empty_lines: Optional[bool] = Form(REMOVE_EMPTY_LINES),
    remove_header_footer: Optional[bool] = Form(REMOVE_HEADER_FOOTER),
    valid_languages: Optional[List[str]] = Form(VALID_LANGUAGES),
    split_by: Optional[str] = Form(SPLIT_BY),
    split_length: Optional[int] = Form(SPLIT_LENGTH),
    split_overlap: Optional[int] = Form(SPLIT_OVERLAP),
    split_respect_sentence_boundary: Optional[bool] = Form(
        SPLIT_RESPECT_SENTENCE_BOUNDARY),
):
    try:
        file_path = Path(
            FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if file.filename.split(".")[-1].lower() == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables,
                valid_languages=valid_languages)
            document = pdf_converter.convert(file_path)
        elif file.filename.split(".")[-1].lower() == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
                valid_languages=valid_languages,
            )
            document = txt_converter.convert(file_path)
        else:
            raise HTTPException(
                status_code=415,
                detail=f"Only .pdf and .txt file formats are supported.")

        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}

        preprocessor = PreProcessor(
            clean_whitespace=remove_whitespace,
            clean_header_footer=remove_header_footer,
            clean_empty_lines=remove_empty_lines,
            split_by=split_by,
            split_length=split_length,
            split_overlap=split_overlap,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
        )

        documents = preprocessor.process(document)
        document_store.write_documents(documents)
        return "File upload was successful."
    finally:
        file.file.close()
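# A standalone client-side sketch for exercising the endpoint above. The route
# path, host, and port are assumptions (the route decorator is not part of this
# snippet); only the form-field names come from the function signature.
import requests

with open("my_doc.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/file-upload",  # hypothetical route
        files={"file": f},
        data={"split_by": "word", "split_length": "200"},
    )
print(response.status_code, response.text)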
Example no. 5
def upload_file(
        model_id: str = Form(...),
        file: UploadFile = File(...),
        remove_numeric_tables: Optional[bool] = Form(REMOVE_NUMERIC_TABLES)):

    print("uploading file")
    if model_id not in MODELS:
        raise HTTPException(status_code=400, detail="Invalid model id")

    try:
        file_path = Path(
            FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        if file.filename.split(".")[-1].lower() == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=remove_numeric_tables)
            document = pdf_converter.convert(file_path)
        elif file.filename.split(".")[-1].lower() == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables)
            document = txt_converter.convert(file_path)
        else:
            raise HTTPException(
                status_code=415,
                detail=f"Only .pdf and .txt file formats are supported.")

        processor = PreProcessor(clean_empty_lines=True,
                                 clean_whitespace=True,
                                 clean_header_footer=True,
                                 split_by="word",
                                 split_length=200,
                                 split_respect_sentence_boundary=True)
        docs = processor.process(document)

        # Add name field to documents
        for doc in docs:
            doc['name'] = file.filename

        doc_store = MODELS[model_id].finder.retriever.document_store
        doc_store.write_documents(docs)

        return docs
    finally:
        file.file.close()
Example no. 6
def test_eval_data_splitting(document_store):
    # splitting by word
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")

    preprocessor = PreProcessor(clean_empty_lines=False,
                                clean_whitespace=False,
                                clean_header_footer=False,
                                split_by="word",
                                split_length=4,
                                split_overlap=0,
                                split_respect_sentence_boundary=False)

    document_store.add_eval_data(filename="samples/squad/tiny.json",
                                 doc_index="test_eval_document",
                                 label_index="test_feedback",
                                 preprocessor=preprocessor)
    labels = document_store.get_all_labels_aggregated(index="test_feedback")
    docs = document_store.get_all_documents(index="test_eval_document")
    assert len(docs) == 5
    assert len(set(labels[0].multiple_document_ids)) == 2

    # splitting by passage
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")

    preprocessor = PreProcessor(clean_empty_lines=False,
                                clean_whitespace=False,
                                clean_header_footer=False,
                                split_by="passage",
                                split_length=1,
                                split_overlap=0,
                                split_respect_sentence_boundary=False)

    document_store.add_eval_data(filename="samples/squad/tiny_passages.json",
                                 doc_index="test_eval_document",
                                 label_index="test_feedback",
                                 preprocessor=preprocessor)
    docs = document_store.get_all_documents(index="test_eval_document")
    assert len(docs) == 2
    assert len(docs[1].text) == 56
Example no. 7
def file_upload(file):
    try:
        file_path = '/tmp/' + file.name + '_tmp'
        with open(file_path, "wb") as buffer:
            buffer.write(file.read())

        if file.filename.split(".")[-1].lower() == "pdf":
            pdf_converter = PDFToTextConverter(
                remove_numeric_tables=True, valid_languages=["en"]
            )
            document = pdf_converter.convert(file_path)
        elif file.filename.split(".")[-1].lower() == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=True, valid_languages=["en"],
            )
            document = txt_converter.convert(file_path)
        else:
            raise HTTPException(status_code=415, detail="Only .pdf and .txt file formats are supported.")

        document = {TEXT_FIELD_NAME: document["text"], "name": file.filename}

        preprocessor = PreProcessor(
            clean_whitespace=REMOVE_WHITESPACE,
            clean_header_footer=REMOVE_HEADER_FOOTER,
            clean_empty_lines=REMOVE_EMPTY_LINES,
            split_by=SPLIT_BY,
            split_length=SPLIT_LENGTH,
            split_respect_sentence_boundary=SPLIT_RESPECT_SENTENCE_BOUNDARY,
        )

        documents = preprocessor.process(document)


        # write the docs to the DB.
        document_store.write_documents(documents)
        return document_store
    finally:
        os.remove(file_path)
        buffer.close()
Example no. 8
def test_preprocess_passage_split():
    document = {"text": TEXT}
    preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False)
    documents = preprocessor.process(document)
    assert len(documents) == 3

    preprocessor = PreProcessor(split_length=2, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False)
    documents = preprocessor.process(document)
    assert len(documents) == 2
Example no. 9
def test_preprocess_sentence_split():
    document = {"text": TEXT}
    preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="sentence")
    documents = preprocessor.process(document)
    assert len(documents) == 15

    preprocessor = PreProcessor(
        split_length=10, split_overlap=0, split_by="sentence"
    )
    documents = preprocessor.process(document)
    assert len(documents) == 2
Example no. 10
def test_preprocess_passage_split():
    document = {"text": TEXT}
    preprocessor = PreProcessor(split_length=1,
                                split_stride=0,
                                split_by="passage")
    documents = preprocessor.process(document)
    assert len(documents) == 3

    preprocessor = PreProcessor(split_length=2,
                                split_stride=0,
                                split_by="passage")
    documents = preprocessor.process(document)
    assert len(documents) == 2
Example no. 11
def test_eval_data_split_passage(document_store):
    # splitting by passage
    preprocessor = PreProcessor(clean_empty_lines=False,
                                clean_whitespace=False,
                                clean_header_footer=False,
                                split_by="passage",
                                split_length=1,
                                split_overlap=0,
                                split_respect_sentence_boundary=False)

    document_store.add_eval_data(
        filename="samples/squad/tiny_passages.json",
        doc_index="haystack_test_eval_document",
        label_index="haystack_test_feedback",
        preprocessor=preprocessor,
    )
    docs = document_store.get_all_documents(
        index="haystack_test_eval_document")
    assert len(docs) == 2
    assert len(docs[1].text) == 56
Example no. 12
def _extract_docs_and_labels_from_dict(document_dict: Dict,
                                       preprocessor: PreProcessor = None):
    docs = []
    labels = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {
        k: v
        for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    for paragraph in document_dict["paragraphs"]:
        ## Create Metadata
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {
            k: v
            for k, v in paragraph.items() if k not in ("qas", "context")
        }
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)

        ## Create Document
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        if preprocessor is not None:
            splits_dicts = preprocessor.process(cur_doc.to_dict())
            # we need to pull in _split_id into the document id for unique reference in labels
            # todo: PreProcessor should work on Documents instead of dicts
            splits = []
            offset = 0
            for d in splits_dicts:
                id = f"{d['id']}-{d['meta']['_split_id']}"
                d["meta"]["_split_offset"] = offset
                offset += len(d["text"])
                # offset correction based on splitting method
                if preprocessor.split_by == "word":
                    offset += 1
                elif preprocessor.split_by == "passage":
                    offset += 2
                else:
                    raise NotImplementedError
                mydoc = Document(text=d["text"], id=id, meta=d["meta"])
                splits.append(mydoc)
        else:
            splits = [cur_doc]
        docs.extend(splits)

        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
            if not qa["is_impossible"]:
                for answer in qa["answers"]:
                    ans = answer["text"]
                    ans_position = cur_doc.text[
                        answer["answer_start"]:answer["answer_start"] +
                        len(ans)]
                    if ans != ans_position:
                        logger.warning(
                            "Answer Text and Answer position mismatch. Skipping Answer"
                        )
                        break
                    # find corresponding document or split
                    if len(splits) == 1:
                        cur_id = splits[0].id
                        cur_ans_start = answer["answer_start"]
                    else:
                        for s in splits:
                            # If answer start offset is contained in passage we assign the label to that passage
                            if (answer["answer_start"] >=
                                    s.meta["_split_offset"]) and (
                                        answer["answer_start"] <
                                        (s.meta["_split_offset"] +
                                         len(s.text))):
                                cur_id = s.id
                                cur_ans_start = answer[
                                    "answer_start"] - s.meta["_split_offset"]
                                # If a document is splitting an answer we add the whole answer text to the document
                                if s.text[cur_ans_start:cur_ans_start +
                                          len(ans)] != ans:
                                    s.text = s.text[:cur_ans_start] + ans
                                break
                    label = Label(
                        question=qa["question"],
                        answer=ans,
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_id,
                        offset_start_in_doc=cur_ans_start,
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                # for no_answer we need to assign each split as not fitting to the question
                for s in splits:
                    label = Label(
                        question=qa["question"],
                        answer="",
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=s.id,
                        offset_start_in_doc=0,
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)

    return docs, labels
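# A toy sketch of the offset bookkeeping used above (the numbers are purely
# illustrative): each split records the character offset at which it starts in
# the original context, an answer is assigned to the split whose range contains
# answer_start, and the in-split start is answer_start minus that offset.
split_offsets = [0, 120, 250]     # hypothetical _split_offset values
split_lengths = [120, 130, 140]   # hypothetical len(split.text) values
answer_start = 130                # character offset in the full context

for split_idx, (off, length) in enumerate(zip(split_offsets, split_lengths)):
    if off <= answer_start < off + length:
        print(f"answer falls in split {split_idx}, local start {answer_start - off}")
        break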
Example no. 13
def tutorial8_preprocessing():
    # This fetches some sample files to work with

    doc_dir = "data/preprocessing_tutorial"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    """
    ## Converters
    
    Haystack's converter classes are designed to help you turn files on your computer into the documents
    that can be processed by the Haystack pipeline.
    There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
    """

    # Here are some examples of how you would use file converters

    converter = TextConverter(remove_numeric_tables=True,
                              valid_languages=["en"])
    doc_txt = converter.convert(
        file_path="data/preprocessing_tutorial/classics.txt", meta=None)

    converter = PDFToTextConverter(remove_numeric_tables=True,
                                   valid_languages=["en"])
    doc_pdf = converter.convert(
        file_path="data/preprocessing_tutorial/bert.pdf", meta=None)

    converter = DocxToTextConverter(remove_numeric_tables=True,
                                    valid_languages=["en"])
    doc_docx = converter.convert(
        file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)

    # Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.

    all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
    """
    
    ## PreProcessor
    
    The PreProcessor class is designed to help you clean text and split text into sensible units.
    File splitting can have a very significant impact on the system's performance.
    Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd)
    and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details.
    """

    # This is a default usage of the PreProcessor.
    # Here, it performs cleaning of consecutive whitespaces
    # and splits a single large document into smaller documents.
    # Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
    # Note how the single document passed into the PreProcessor gets split into 5 smaller documents

    preprocessor = PreProcessor(clean_empty_lines=True,
                                clean_whitespace=True,
                                clean_header_footer=False,
                                split_by="word",
                                split_length=1000,
                                split_respect_sentence_boundary=True)
    docs_default = preprocessor.process(doc_txt)
    print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")
    """
    ## Cleaning
    
    - `clean_empty_lines` will normalize 3 or more consecutive empty lines to be just two empty lines
    - `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text
    - `clean_header_footer` will remove any long header or footer texts that are repeated on each page
    
    ## Splitting
    By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end
    midway through a sentence.
    This will help reduce the possibility of answer phrases being split between two documents.
    This feature can be turned off by setting `split_respect_sentence_boundary=False`.
    """

    # Not respecting sentence boundary vs respecting sentence boundary

    preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)
    docs_nrsb = preprocessor_nrsb.process(doc_txt)

    print("RESPECTING SENTENCE BOUNDARY")
    end_text = docs_default[0]["text"][-50:]
    print("End of document: \"..." + end_text + "\"")
    print()
    print("NOT RESPECTING SENTENCE BOUNDARY")
    end_text_nrsb = docs_nrsb[0]["text"][-50:]
    print("End of document: \"..." + end_text_nrsb + "\"")
    """
    A commonly used strategy to split long documents, especially in the field of Question Answering,
    is the sliding window approach. If `split_length=10` and `split_overlap=3`, your documents will look like this:
    
    - doc1 = words[0:10]
    - doc2 = words[7:17]
    - doc3 = words[14:24]
    - ...
    
    You can use this strategy by following the code below.
    """

    # Sliding window approach

    preprocessor_sliding_window = PreProcessor(
        split_overlap=3,
        split_length=10,
        split_respect_sentence_boundary=False)
    docs_sliding_window = preprocessor_sliding_window.process(doc_txt)

    doc1 = docs_sliding_window[0]["text"][:200]
    doc2 = docs_sliding_window[1]["text"][:100]
    doc3 = docs_sliding_window[2]["text"][:100]

    print("Document 1: \"" + doc1 + "...\"")
    print("Document 2: \"" + doc2 + "...\"")
    print("Document 3: \"" + doc3 + "...\"")
Example no. 14
#doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)

#converter = DocxToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
#doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)

# in our case:
converter = TextConverter(remove_numeric_tables=True, valid_languages=["de"])
doc_txt = converter.convert(file_path="./data/geschichte_19._Jahrhundert.txt",
                            meta=None)
# TODO: Scraping is not correct yet. E.g. Code civil is incorrect (text after it is left out)

# This is a default usage of the PreProcessor.
# Here, it performs cleaning of consecutive whitespaces
# and splits a single large document into smaller documents.
# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
# Note how the single document passed into the PreProcessor gets split into multiple smaller documents

preprocessor = PreProcessor(clean_empty_lines=True,
                            clean_whitespace=True,
                            clean_header_footer=False,
                            split_by="word",
                            split_length=100,
                            split_respect_sentence_boundary=True)
#    clean_empty_lines will normalize 3 or more consecutive empty lines to be just two empty lines
#    clean_whitespace will remove any whitespace at the beginning or end of each line in the text
#    clean_header_footer will remove any long header or footer texts that are repeated on each page

docs_default = preprocessor.process(doc_txt)
print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")

# EOF
Example no. 15
def _extract_docs_and_labels_from_dict(document_dict: Dict,
                                       preprocessor: PreProcessor = None,
                                       open_domain: bool = False):
    """Set open_domain to True if you are trying to load open_domain labels (i.e. labels without doc id or start idx)"""
    docs = []
    labels = []
    problematic_ids = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {
        k: v
        for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    for paragraph in document_dict["paragraphs"]:
        ## Create Metadata
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {
            k: v
            for k, v in paragraph.items() if k not in ("qas", "context")
        }
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)

        ## Create Document
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        if preprocessor is not None:
            splits_dicts = preprocessor.process(cur_doc.to_dict())
            # we need to pull in _split_id into the document id for unique reference in labels
            # todo: PreProcessor should work on Documents instead of dicts
            splits: List[Document] = []
            offset = 0
            for d in splits_dicts:
                id = f"{d['id']}-{d['meta']['_split_id']}"
                d["meta"]["_split_offset"] = offset
                offset += len(d["text"])
                # offset correction based on splitting method
                if preprocessor.split_by == "word":
                    offset += 1
                elif preprocessor.split_by == "passage":
                    offset += 2
                else:
                    raise NotImplementedError
                mydoc = Document(text=d["text"], id=id, meta=d["meta"])
                splits.append(mydoc)
        else:
            splits = [cur_doc]
        docs.extend(splits)

        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
            if not qa.get("is_impossible", False):
                for answer in qa["answers"]:
                    ans = answer["text"]
                    cur_ans_start = None
                    # TODO The following block of code means that answer_start is never calculated
                    #  and cur_id is always None for open_domain
                    #  This can be rewritten so that this function could try to calculate offsets
                    #  and populate id in open_domain mode
                    if open_domain:
                        cur_ans_start = answer.get("answer_start", 0)
                        cur_id = '0'
                    else:
                        ans_position = cur_doc.text[
                            answer["answer_start"]:answer["answer_start"] +
                            len(ans)]
                        if ans != ans_position:
                            # do not use answer
                            problematic_ids.append(qa.get("id", "missing"))
                            break
                        # find corresponding document or split
                        if len(splits) == 1:
                            cur_id = splits[0].id
                            cur_ans_start = answer["answer_start"]
                        else:
                            for s in splits:
                                # If answer start offset is contained in passage we assign the label to that passage
                                if (answer["answer_start"] >=
                                        s.meta["_split_offset"]) and (
                                            answer["answer_start"] <
                                            (s.meta["_split_offset"] +
                                             len(s.text))):
                                    cur_id = s.id
                                    cur_ans_start = answer[
                                        "answer_start"] - s.meta[
                                            "_split_offset"]
                                    # If a document is splitting an answer we add the whole answer text to the document
                                    if s.text[cur_ans_start:cur_ans_start +
                                              len(ans)] != ans:
                                        s.text = s.text[:cur_ans_start] + ans
                                    break
                    label = Label(
                        question=qa["question"],
                        answer=ans,
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_id,
                        offset_start_in_doc=cur_ans_start,
                        no_answer=qa.get("is_impossible", False),
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                # for no_answer we need to assign each split as not fitting to the question
                for s in splits:
                    label = Label(
                        question=qa["question"],
                        answer="",
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=s.id,
                        offset_start_in_doc=0,
                        no_answer=qa.get("is_impossible", False),
                        origin="gold_label",
                    )
                    labels.append(label)

    return docs, labels, problematic_ids
Example no. 16
    parser.add_argument("--num_hard_negative_ctxs",
                        dest="num_hard_negative_ctxs",
                        help="Number of hard negative contexts to use",
                        metavar="num_hard_negative_ctxs",
                        default=30)
    parser.add_argument(
        "--split_dataset",
        dest="split_dataset",
        action="store_true",
        help="Whether to split the created dataset or not (default: False)",
    )

    args = parser.parse_args()

    preprocessor = PreProcessor(split_length=100,
                                split_overlap=0,
                                clean_empty_lines=False,
                                clean_whitespace=False)
    squad_input_filename = Path(args.squad_input_filename)
    dpr_output_filename = Path(args.dpr_output_filename)
    num_hard_negative_ctxs = args.num_hard_negative_ctxs
    split_dataset = args.split_dataset

    retriever_dpr_config = {
        "use_gpu": True,
    }
    store_dpr_config = {
        "embedding_field": "embedding",
        "embedding_dim": 768,
    }

    retriever_bm25_config: dict = {}
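    # A sketch of how these config dicts might be consumed further down in the
    # script; the classes are standard Haystack components, but this exact
    # wiring is an assumption since the rest of the script is not shown here:
    #
    #   document_store = ElasticsearchDocumentStore(**store_dpr_config)
    #   retriever = DensePassageRetriever(document_store=document_store,
    #                                     **retriever_dpr_config)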
Example no. 17
# remove linebreaks
df['summary'] = df['summary'].astype(str).apply(clean.remove_linebreak)
df['title'] = df['title'].astype(str).apply(clean.remove_linebreak)

# Dataframe to dict for haystack
all_dicts = df[['title', 'summary']].rename(columns={
    'title': 'name',
    'summary': 'text'
}).to_dict(orient='records')
# %%
# clean data
# preprocessing from haystack
preprocessor = PreProcessor(clean_empty_lines=True,
                            clean_whitespace=True,
                            clean_header_footer=False,
                            split_by="word",
                            split_length=100,
                            split_respect_sentence_boundary=True,
                            split_overlap=10)
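# Run every record through the PreProcessor, then flatten the per-record lists
# of splits into a single list of documents for the document store.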
nested_docs = [preprocessor.process(d) for d in all_dicts]
docs = [d for x in nested_docs for d in x]
# %%
# start FAISS document store and store docs
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

document_store.write_documents(docs)

# %%
# initialise storage
from haystack.retriever.dense import DensePassageRetriever
retriever = DensePassageRetriever(
Example no. 18
The PreProcessor class is designed to help you clean text and split text into sensible units.
File splitting can have a very significant impact on the system's performance.
Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd)
and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details.
"""

# This is a default usage of the PreProcessor.
# Here, it performs cleaning of consecutive whitespaces
# and splits a single large document into smaller documents.
# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
# Note how the single document passed into the PreProcessor gets split into 5 smaller documents

preprocessor = PreProcessor(clean_empty_lines=True,
                            clean_whitespace=True,
                            clean_header_footer=False,
                            split_by="word",
                            split_length=1000,
                            split_respect_sentence_boundary=True)
docs_default = preprocessor.process(doc_txt)
print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")
"""
## Cleaning

- `clean_empty_lines` will normalize 3 or more consecutive empty lines to be just two empty lines
- `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text
- `clean_header_footer` will remove any long header or footer texts that are repeated on each page

## Splitting
By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end
midway through a sentence.
This will help reduce the possibility of answer phrases being split between two documents.
This feature can be turned off by setting `split_respect_sentence_boundary=False`.
"""
                        dest="num_hard_negative_ctxs",
                        help="Number of hard negative contexts to use",
                        metavar="num_hard_negative_ctxs",
                        default=30)
    parser.add_argument(
        "--split_dataset",
        dest="split_dataset",
        action="store_true",
        help="Whether to split the created dataset or not (default: False)",
    )

    args = parser.parse_args()

    preprocessor = PreProcessor(split_length=100,
                                split_overlap=0,
                                clean_empty_lines=False,
                                split_respect_sentence_boundary=False,
                                clean_whitespace=False)
    squad_input_filename = Path(args.squad_input_filename)
    dpr_output_filename = Path(args.dpr_output_filename)
    num_hard_negative_ctxs = args.num_hard_negative_ctxs
    split_dataset = args.split_dataset

    retriever_dpr_config = {
        "use_gpu": True,
    }
    store_dpr_config = {
        "embedding_field": "embedding",
        "embedding_dim": 768,
    }