Example #1
# imports assume the Haystack v0.x module layout these tests were written against
from haystack import Document
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.retriever.dense import EmbeddingRetriever


def test_faiss_retrieving(index_factory):
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db",
        faiss_index_factory_str=index_factory)
    document_store.delete_all_documents(index="document")
    # IVF indexes must be trained on representative vectors before docs can be added
    if "ivf" in index_factory.lower():
        document_store.train_index(DOCUMENTS)
    document_store.write_documents(DOCUMENTS)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    result = retriever.retrieve(query="How to test this?")
    assert len(result) == len(DOCUMENTS)
    assert isinstance(result[0], Document)
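
`DOCUMENTS` is a fixture defined elsewhere in the test module. A minimal sketch of what it might look like, assuming 768-dimensional embeddings to match the indexes above (the vector_id assertion in Example #3 suggests eight documents):

import numpy as np

# hypothetical fixture: documents with random 768-dim embeddings
DOCUMENTS = [
    {
        "name": f"name_{i}",
        "text": f"text_{i}",
        "embedding": np.random.rand(768).astype(np.float32),
    }
    for i in range(8)
]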
Example #2
import faiss

from haystack.document_store.faiss import FAISSDocumentStore  # Haystack v0.x path


def test_faiss_passing_index_from_outside():
    d = 768     # embedding dimensionality
    nlist = 2   # number of IVF clusters
    quantizer = faiss.IndexFlatIP(d)
    faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist,
                                     faiss.METRIC_INNER_PRODUCT)
    faiss_index.nprobe = 2  # number of clusters to visit per query
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db", faiss_index=faiss_index)

    document_store.delete_all_documents(index="document")
    # as it is an IVF index, we need to train it before adding docs
    document_store.train_index(DOCUMENTS)

    document_store.write_documents(documents=DOCUMENTS, index="document")
    documents_indexed = document_store.get_all_documents(index="document")

    # test document correctness
    check_data_correctness(documents_indexed, DOCUMENTS)
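
`check_data_correctness` is a helper defined in the same test module. A plausible, purely illustrative implementation compares the round-tripped texts against the originals:

def check_data_correctness(documents_indexed, documents):
    # hypothetical helper: the indexed Documents should cover exactly the input texts
    assert len(documents_indexed) == len(documents)
    assert {doc.text for doc in documents_indexed} == {doc["text"] for doc in documents}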
Example #3
import faiss

from haystack.document_store.faiss import FAISSDocumentStore  # Haystack v0.x path


def test_faiss_passing_index_from_outside():
    d = 768     # embedding dimensionality
    nlist = 2   # number of IVF clusters
    quantizer = faiss.IndexFlatIP(d)
    faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist,
                                     faiss.METRIC_INNER_PRODUCT)
    # enable an id-to-vector direct map so stored vectors can be looked up by id
    faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable)
    faiss_index.nprobe = 2  # number of clusters to visit per query
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db", faiss_index=faiss_index)

    document_store.delete_all_documents(index="document")
    # as it is an IVF index, we need to train it before adding docs
    document_store.train_index(DOCUMENTS)

    document_store.write_documents(documents=DOCUMENTS, index="document")
    documents_indexed = document_store.get_all_documents(index="document")

    # test that vector ids are associated with docs
    for doc in documents_indexed:
        assert 0 <= int(doc.meta["vector_id"]) <= 7
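
The `set_direct_map_type` call matters because an IVF index keeps no id-to-vector mapping by default, so id-based lookups such as `reconstruct` fail. A standalone sketch of the effect in plain FAISS:

import faiss
import numpy as np

d, nlist = 768, 2
xb = np.random.rand(10, d).astype(np.float32)

quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(xb)
index.add(xb)

# without a direct map, index.reconstruct(3) would raise a RuntimeError
index.set_direct_map_type(faiss.DirectMap.Hashtable)
vec = index.reconstruct(3)  # now returns the stored vector for id 3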
Example #4
# imports follow the Haystack v0.x tutorial layout
from typing import List

import pandas as pd
import requests

from haystack import Document
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.generator.transformers import RAGenerator
from haystack.retriever.dense import DensePassageRetriever


def tutorial7_rag_generator():
    # Add documents from which you want to generate answers
    # Download a csv containing some sample document data
    temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
    with open('small_generator_dataset.csv', 'wb') as f:
        f.write(temp.content)

    # Load a dataframe with columns "title" and "text"
    df = pd.read_csv("small_generator_dataset.csv", sep=',')
    # Minimal cleaning
    df.fillna(value="", inplace=True)

    print(df.head())

    titles = list(df["title"].values)
    texts = list(df["text"].values)

    # Convert to the Haystack Document format
    documents: List[Document] = []
    for title, text in zip(titles, texts):
        documents.append(
            Document(
                text=text,
                meta={
                    "name": title or ""
                }
            )
        )


    # Initialize the FAISS document store, which holds the documents and a corresponding index for their embeddings
    # Set `return_embedding` to `True`, so the generator doesn't have to re-embed retrieved documents
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat",
        return_embedding=True
    )

    # Initialize the DPR retriever to encode documents and questions, and to query the index
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        embed_title=True,
    )

    # Initialize RAG Generator
    generator = RAGenerator(
        model_name_or_path="facebook/rag-token-nq",
        use_gpu=True,
        top_k_answers=1,
        max_length=200,
        min_length=2,
        embed_title=True,
        num_beams=2,
    )

    # Delete existing documents in the document store
    document_store.delete_all_documents()
    # Write documents to document store
    document_store.write_documents(documents)
    # Add documents embeddings to index
    document_store.update_embeddings(
        retriever=retriever
    )

    # Now ask your questions
    # We have some sample questions
    QUESTIONS = [
        "who got the first nobel prize in physics",
        "when is the next deadpool movie being released",
        "which mode is used for short wave broadcast service",
        "who is the owner of reading football club",
        "when is the next scandal episode coming out",
        "when is the last time the philadelphia won the superbowl",
        "what is the most current adobe flash player version",
        "how many episodes are there in dragon ball z",
        "what is the first step in the evolution of the eye",
        "where is gall bladder situated in human body",
        "what is the main mineral in lithium batteries",
        "who is the president of usa right now",
        "where do the greasers live in the outsiders",
        "panda is a national animal of which country",
        "what is the name of manchester united stadium",
    ]

    # Now generate an answer for each question
    for question in QUESTIONS:
        # Retrieve related documents from retriever
        retriever_results = retriever.retrieve(
            query=question
        )

        # Generate an answer from the question and the retrieved documents
        predicted_result = generator.predict(
            query=question,
            documents=retriever_results,
            top_k=1
        )

        # Print the answer
        answers = predicted_result["answers"]
        print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
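
As with the other Haystack tutorial scripts of this era, the function is presumably invoked at the bottom of the file:

if __name__ == "__main__":
    tutorial7_rag_generator()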
Example #5

from haystack.preprocessor import PreProcessor  # Haystack v0.x import path (assumed)

# The snippet opens mid-call; `processor = PreProcessor(` is inferred from the
# `processor.process(...)` usage below.
processor = PreProcessor(
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="passage",
    split_length=1,
    split_respect_sentence_boundary=False,
    split_overlap=0
)

# `as4` holds the raw document(s) prepared earlier in the original script (not shown)
as4Docs = processor.process(as4)

# flag every passage chunk as non-table content
for doc in as4Docs:
    doc["meta"]["table"] = False

document_store_ES.delete_all_documents()
document_store_ES.write_documents(as4Docs)
document_store_FAISS.delete_all_documents()
document_store_FAISS.write_documents(as4Docs)

# backagain = document_store.get_all_documents()

for i, doc in enumerate(as4Docs):
    print(f"{i}: {doc}")

# Update table content and table description - a csv holds the table content, a txt the table description.
# The files live under the /tables directory; file names do not matter, as long as each csv matches its txt.
# Rerun this section whenever a new table is uploaded.

import csv
import os
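
A minimal, hypothetical sketch of the table-update step these comments describe, pairing each csv with the txt of the same name under /tables and writing the result to both stores (file layout and meta fields assumed):

table_docs = []
for filename in os.listdir("tables"):
    stem, ext = os.path.splitext(filename)
    if ext != ".csv":
        continue
    # table content from the csv, flattened back to text
    with open(os.path.join("tables", filename), newline="") as f:
        content = "\n".join(", ".join(row) for row in csv.reader(f))
    # table description from the matching txt
    with open(os.path.join("tables", stem + ".txt")) as f:
        description = f.read()
    table_docs.append({
        "text": description + "\n" + content,
        "meta": {"table": True, "name": stem},  # mirrors the meta["table"] flag set above
    })

document_store_ES.write_documents(table_docs)
document_store_FAISS.write_documents(table_docs)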