Ejemplos de FAISSDocumentStore.update_embeddings en Python, ejemplos de haystack.document_store.faiss.FAISSDocumentStore.update_embeddings en Python

Ejemplo n.º 1

0

Mostrar archivo

def Find_answer(text_file_path, data_folder_path, symbol, question):
    """Answer ``question`` from a text file via DPR retrieval and a FARM reader.

    The file at ``text_file_path`` is split on ``symbol``; every piece is
    written to ``<data_folder_path>/data<N>.txt`` (N starts at 1). The folder
    is then converted to documents, stored in a FAISS document store with a
    "Flat" index, embedded with a DensePassageRetriever and queried through
    an ExtractiveQAPipeline.

    Args:
        text_file_path: Path of the UTF-8 text file to split and index.
        data_folder_path: Folder that receives the per-chunk ``.txt`` files.
        symbol: Separator string used to split the source text.
        question: Query passed to the pipeline.

    Returns:
        The distinct ``context`` values of the returned answers, in the order
        they were returned, joined by single spaces.
    """
    store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    with open(text_file_path, 'r', encoding='utf-8') as source:
        chunks = source.read().split(symbol)

    # NOTE(review): chunks are written with the platform default encoding
    # while the source is read as UTF-8 -- confirm which encoding
    # convert_files_to_dicts expects before changing either side.
    for number, chunk in enumerate(chunks, start=1):
        with open(f'{data_folder_path}/data{number}.txt', 'w') as out:
            print(f'writing file no.{number}')
            out.write(chunk)

    store.write_documents(
        convert_files_to_dicts(
            dir_path=data_folder_path,
            clean_func=clean_wiki_text,
            split_paragraphs=True,
        )
    )

    retriever = DensePassageRetriever(
        document_store=store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True,
    )

    # Documents were written without embeddings; compute them now.
    store.update_embeddings(retriever)

    reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=True,
        context_window_size=300,
    )

    qa_pipeline = ExtractiveQAPipeline(reader, retriever)
    prediction = qa_pipeline.run(query=question, top_k_retriever=10, top_k_reader=3)

    # dict.fromkeys drops repeated contexts while keeping first-seen order.
    unique_contexts = dict.fromkeys(
        found['context'] for found in prediction['answers']
    )
    return ' '.join(unique_contexts)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: Tutorial7_RAG_Generator.py Proyecto: satrojan/haystack

# Build the RAG generator on CPU (use_gpu=False), asking for a single answer
# per query with a generated length between 2 and 200.
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=False,
    top_k_answers=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)

# NOTE: `document_store`, `documents` and `retriever` are defined earlier in
# the file, outside this excerpt.
# Delete the documents already present in the document store
document_store.delete_all_documents()
# Write the new documents to the document store
document_store.write_documents(documents)
# Compute the document embeddings with the retriever and add them to the index
document_store.update_embeddings(retriever=retriever)

# Now ask your questions
# We have some sample questions
QUESTIONS = [
    "who got the first nobel prize in physics",
    "when is the next deadpool movie being released",
    "which mode is used for short wave broadcast service",
    "who is the owner of reading football club",
    "when is the next scandal episode coming out",
    "when is the last time the philadelphia won the superbowl",
    "what is the most current adobe flash player version",
    "how many episodes are there in dragon ball z",
    "what is the first step in the evolution of the eye",
    "where is gall bladder situated in human body",
    "what is the main mineral in lithium batteries",

Ejemplo n.º 3

0

Mostrar archivo

Archivo: Tutorial7_RAG_Generator.py Proyecto: vijayashok99/haystack

def tutorial7_rag_generator():
    """Run the RAG generator tutorial end to end.

    Downloads a small sample CSV, turns each row into a haystack ``Document``,
    indexes the documents in a FAISS document store, embeds them with a
    DensePassageRetriever and then, for a fixed list of sample questions,
    retrieves documents and prints the answer produced by a ``RAGenerator``.

    Side effects:
        Writes ``small_generator_dataset.csv`` to the current working
        directory and prints to stdout.

    Raises:
        requests.HTTPError: If the sample dataset download returns an HTTP
            error status.
    """
    # Add documents from which you want generate answers
    # Download a csv containing some sample documents data
    temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
    # Fail loudly on a bad download instead of saving an error page as CSV.
    temp.raise_for_status()
    # Context manager guarantees the file is flushed and closed before pandas
    # reads it back.
    with open('small_generator_dataset.csv', 'wb') as dataset_file:
        dataset_file.write(temp.content)

    # Get dataframe with columns "title", and "text"
    df = pd.read_csv("small_generator_dataset.csv", sep=',')
    # Minimal cleaning
    df.fillna(value="", inplace=True)

    print(df.head())

    titles = list(df["title"].values)
    texts = list(df["text"].values)

    # Convert the rows to the haystack document format
    documents: List[Document] = [
        Document(text=text, meta={"name": title or ""})
        for title, text in zip(titles, texts)
    ]

    # Initialize FAISS document store to documents and corresponding index for embeddings
    # Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat",
        return_embedding=True
    )

    # Initialize DPR Retriever to encode documents, encode question and query documents
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        embed_title=True,
    )

    # Initialize RAG Generator
    generator = RAGenerator(
        model_name_or_path="facebook/rag-token-nq",
        use_gpu=True,
        top_k_answers=1,
        max_length=200,
        min_length=2,
        embed_title=True,
        num_beams=2,
    )

    # Delete existing documents in documents store
    document_store.delete_all_documents()
    # Write documents to document store
    document_store.write_documents(documents)
    # Add documents embeddings to index
    document_store.update_embeddings(
        retriever=retriever
    )

    # Now ask your questions
    # We have some sample questions
    QUESTIONS = [
        "who got the first nobel prize in physics",
        "when is the next deadpool movie being released",
        "which mode is used for short wave broadcast service",
        "who is the owner of reading football club",
        "when is the next scandal episode coming out",
        "when is the last time the philadelphia won the superbowl",
        "what is the most current adobe flash player version",
        "how many episodes are there in dragon ball z",
        "what is the first step in the evolution of the eye",
        "where is gall bladder situated in human body",
        "what is the main mineral in lithium batteries",
        "who is the president of usa right now",
        "where do the greasers live in the outsiders",
        "panda is a national animal of which country",
        "what is the name of manchester united stadium",
    ]

    # Now generate answer for question
    for question in QUESTIONS:
        # Retrieve related documents from retriever
        retriever_results = retriever.retrieve(
            query=question
        )

        # Now generate answer from question and retrieved documents
        predicted_result = generator.predict(
            query=question,
            documents=retriever_results,
            top_k=1
        )

        # Print your answer
        answers = predicted_result["answers"]
        print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')

Ejemplo n.º 4

0

Mostrar archivo

Archivo: Tutorial6_Better_Retrieval_via_DPR.py Proyecto: wolfram-roemhild-raumtext/haystack

def tutorial6_better_retrieval_via_dpr():
    """Run the "better retrieval via DPR" tutorial end to end.

    Downloads the Game of Thrones article archive, indexes it in a FAISS
    document store, embeds the documents with a DensePassageRetriever and
    prints the answers an ExtractiveQAPipeline gives to one sample question.
    """
    # FAISS is a library for efficient similarity search on a cluster of
    # dense vectors. The FAISSDocumentStore uses a SQL document store
    # (SQLite in-memory by default) under the hood to hold the document text
    # and other metadata; the text embeddings are indexed in a FAISS index
    # that is later queried when searching for answers.
    # The default flavour is "Flat"; "HNSW" trades some accuracy for faster
    # search. Choose it via the faiss_index_factory_str constructor argument.
    # Guidance: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    # --- Document preprocessing ---
    # Fetch the articles we want to query.
    corpus_dir = "data/article_txt_got"
    archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=archive_url, output_dir=corpus_dir)

    # Turn the files into dicts that can be indexed, then write them to the
    # document store.
    article_dicts = convert_files_to_dicts(
        dir_path=corpus_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    store.write_documents(article_dicts)

    # --- Retriever ---
    dpr = DensePassageRetriever(
        document_store=store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True,
    )

    # Important: once DPR is initialised, update_embeddings() must be called
    # to walk over all previously indexed documents and refresh their
    # embedding representation. This can take a while depending on corpus
    # size but only has to happen once; at query time only the query is
    # embedded and compared against the stored document embeddings.
    store.update_embeddings(dpr)

    # --- Reader ---
    # Load a local model or any QA model from the Hugging Face model hub
    # (https://huggingface.co/models).
    qa_reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=True,
    )

    # --- Pipeline ---
    from haystack.pipeline import ExtractiveQAPipeline
    qa_pipeline = ExtractiveQAPipeline(qa_reader, dpr)

    # Ask a question. Other queries to try:
    #   "Who created the Dothraki vocabulary?"
    #   "Who is the sister of Sansa?"
    result = qa_pipeline.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10,
        top_k_reader=5,
    )

    print_answers(result, details="minimal")

Ejemplo n.º 5

0

Mostrar archivo

class MLPipeline:
    """Question-answering pipeline backed by Elasticsearch and FAISS stores.

    ``setup()`` creates both document stores, re-indexes the text and table
    knowledge base found under ``knowledgeBase/`` and wires the retrievers,
    query classifier, question generator and table retriever into a haystack
    ``Pipeline``. Until ``setup()`` has run, ``answer()`` returns ``""`` and
    the trainer-based methods report that the trainer is missing.
    """

    def __init__(self):
        self.pipeline = None
        self.document_store = None
        self.document_store_faiss = None
        self.question_generator = None
        # Assigned by setup(). Initialised here so report() and
        # processTrainingAction() reach their "trainer is missing" branch
        # instead of raising AttributeError when called before setup().
        self.dpr_node = None
        self.trainer = None
        # Re-entrant lock guarding setup() (via re_process_documents) and
        # answer().
        self.doc_lock = RLock()

    def write_as4_docs(self):
        """Re-index the text knowledge base into both document stores.

        Clears the "document" index of both stores, converts and splits every
        ``.txt`` file in ``knowledgeBase/text``, tags each passage with
        ``index=-1``, ``table=False`` and ``name=<file name without .txt>``,
        writes the passages to both stores and prints what the FAISS store
        now contains.

        Returns:
            Tuple ``(processor, converter)`` so the caller can reuse them.
        """
        converter = file_converter.txt.TextConverter(
            remove_numeric_tables=False,
            valid_languages=["en"])

        processor = preprocessor.preprocessor.PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=False,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )

        self.document_store.delete_all_documents(index="document")
        self.document_store_faiss.delete_all_documents(index="document")

        for file_name in os.listdir("knowledgeBase/text"):
            # Match on the suffix: file_name[:-4] below assumes the name ends
            # with ".txt", which a substring test does not guarantee.
            if not file_name.endswith(".txt"):
                continue
            doc = converter.convert(file_path="knowledgeBase/text/" + file_name)
            doc_processed = processor.process(doc)

            for passage in doc_processed:
                passage["meta"]["index"] = -1
                passage["meta"]["table"] = False
                passage["meta"]["name"] = file_name[:-4]

            self.document_store.write_documents(
                doc_processed, index="document")
            self.document_store_faiss.write_documents(
                doc_processed, index="document")

        # Debug dump of everything now stored in the FAISS document store.
        backagain = self.document_store_faiss.get_all_documents()
        for i, stored_doc in enumerate(backagain):
            print(i)
            print(":\n")
            print(stored_doc)
            print("---------------")

        return (processor, converter)

    def write_table_docs(self, converter, processor):
        """Index the table descriptions and load the table contents.

        For every ``.csv`` file in ``knowledgeBase/tables`` the CSV is read
        into a dict mapping each row's first cell to the remaining cells, and
        the sibling ``.txt`` file with the same stem is read as the table's
        description. All descriptions are written to
        ``knowledgeBase/table_text.txt`` (separated by blank lines), converted
        and split, tagged with ``index=<position>``, ``table=True`` and
        ``name="  "``, and written to both document stores.

        Args:
            converter: Converter exposing ``convert(file_path=...)``.
            processor: Preprocessor exposing ``process(document)``.

        Returns:
            List with one dict per CSV file, in directory-listing order.
        """
        data = []
        docs = []

        for file_name in os.listdir("knowledgeBase/tables"):
            # Suffix match: file_name[:-4] assumes the name ends with ".csv".
            if not file_name.endswith(".csv"):
                continue
            # newline="" is what the csv module asks for so that newlines
            # embedded in quoted fields are handled correctly.
            with open("knowledgeBase/tables/" + file_name, mode='r', newline="") as infile:
                reader = csv.reader(infile)
                # Blank lines come back as empty rows; skip them rather than
                # failing on row[0].
                data.append({row[0]: row[1:] for row in reader if row})
            with open("knowledgeBase/tables/" + file_name[:-4] + ".txt", mode='r') as infile:
                docs.append(infile.read())

        with open('knowledgeBase/table_text.txt', 'w') as outfile:
            for item in docs:
                outfile.write("%s\n\n" % item)

        # Index the table descriptions in both document stores.
        tables = converter.convert(file_path="knowledgeBase/table_text.txt")

        tableDocs = processor.process(tables)
        for i, table_doc in enumerate(tableDocs):
            table_doc["meta"]["index"] = i
            table_doc["meta"]["table"] = True
            table_doc["meta"]["name"] = "  "

        self.document_store.write_documents(tableDocs, index="document")
        self.document_store_faiss.write_documents(tableDocs, index="document")
        return data

    def re_process_documents(self):
        """Rebuild the whole pipeline while holding the document lock."""
        with self.doc_lock:
            self.setup()

    def setup(self):
        """Create the document stores, index the knowledge base, build the pipeline.

        Assigns ``document_store``, ``document_store_faiss``,
        ``question_generator``, ``dpr_node``, ``trainer`` and finally
        ``pipeline``. Progress is printed to stdout.
        """
        print("SETTING UP PIPELINE")
        self.document_store = ElasticsearchDocumentStore(
            similarity="dot_product", host="elasticsearch", username="", password="", index="document")
        self.document_store_faiss = FAISSDocumentStore(
            index="document",
            faiss_index_factory_str="Flat",
            return_embedding=True,
            sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
        )
        processor, converter = self.write_as4_docs()
        table_data = self.write_table_docs(converter, processor)

        es_retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        print("SETTING UP DPR")
        dpr_retriever = DPRTrainingManager.get_current_retriever(
            self.document_store_faiss)
        print("SETTING UP EMBEDDINGS")
        embedding_retriever = EmbeddingRetriever(
            document_store=self.document_store_faiss,
            embedding_model="deepset/sentence_bert"
        )
        query_classifier = QueryClassifier()
        print("SETTING UP TABLE")
        table_retriever = TableRetriever(table_data)
        print("SETUP RETRIEVERS")
        self.question_generator = FurtherQuestionGenerator()
        print("UPDATING EMBEDDINGS")
        # Embed the freshly written FAISS documents with the DPR retriever.
        self.document_store_faiss.update_embeddings(dpr_retriever)
        print("UPDATED EMBEDDINGS")
        self.dpr_node = ContinualDPRNode(
            dpr_retriever, self.document_store_faiss)
        result = Result()
        self.trainer = DPRTrainingManager(
            self.document_store_faiss, self.dpr_node)
        print("SETUP COMPONENTS")
        # Graph: three retrievers run on the query, their results are merged,
        # the classifier routes to either the question generator (output_1)
        # or the table retriever (output_2), and both feed the Result node.
        pipeline = Pipeline()
        pipeline.add_node(component=es_retriever,
                          name="ESRetriever", inputs=["Query"])
        pipeline.add_node(component=self.dpr_node,
                          name="DPRRetriever", inputs=["Query"])
        pipeline.add_node(component=embedding_retriever,
                          name="EmbeddingRetriever", inputs=["Query"])
        pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=[
                          "DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
        pipeline.add_node(component=query_classifier,
                          name="QueryClassifier", inputs=["JoinResults"])
        pipeline.add_node(component=self.question_generator,
                          name="QnGenerator", inputs=["QueryClassifier.output_1"])
        pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=[
                          "QueryClassifier.output_2"])
        pipeline.add_node(component=result, name="Result", inputs=[
                          "QnGenerator", "TableRetriever"])
        self.pipeline = pipeline
        print("SETUP PIPELINE")

    def answer(self, question, history=None):
        """Run ``question`` through the pipeline and return its response.

        Args:
            question: Raw user question; it is passed through the question
                generator's ``question_parsing`` before the pipeline runs.
            history: Conversation history handed to the question generator.
                Defaults to a fresh empty dict on every call.

        Returns:
            ``""`` when the pipeline has not been set up yet; otherwise the
            first element if the pipeline returned a list, else the pipeline
            result itself.
        """
        # A `{}` default would be one dict shared by every call; create a new
        # one per call instead.
        if history is None:
            history = {}

        with self.doc_lock:
            if self.pipeline is None:
                return ""

            print(f"USING HISTORY: {history}")
            self.question_generator.history = history
            responses = self.pipeline.run(
                query=self.question_generator.question_parsing(question), top_k_retriever=20)
            if isinstance(responses, list):
                return responses[0]
            return responses

    def report(self, question):
        """Forward a reported question to the trainer.

        Returns:
            The trainer's ``processQuestion`` result, or ``[]`` when no
            trainer has been created yet.
        """
        print("Question reported:")
        if self.trainer is None:
            print("Trainer is missing!")
            return []
        return self.trainer.processQuestion(question)

    def processTrainingAction(self, question, choices, correct_num):
        """Forward a training action to the trainer.

        Returns:
            The trainer's ``processTrainingAction`` result, or ``0`` when no
            trainer has been created yet.
        """
        if self.trainer is None:
            print("Trainer is missing!")
            return 0
        return self.trainer.processTrainingAction(question, choices, correct_num)

Ejemplo n.º 6

0

Mostrar archivo

def tutorial12_lfqa():
    """Run the long-form question answering (LFQA) tutorial end to end.

    Document store:
    FAISS is a library for efficient similarity search on a cluster of dense
    vectors. The `FAISSDocumentStore` uses a SQL database (SQLite in-memory by
    default) under the hood to store the document text and other metadata.
    The text embeddings are indexed in a FAISS index that is later queried
    when searching for answers. The default flavour is "Flat"; "HNSW" gives
    faster search at the expense of some accuracy and is selected through the
    faiss_index_factory_str constructor argument.
    Guidance: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    """

    from haystack.document_store.faiss import FAISSDocumentStore

    store = FAISSDocumentStore(vector_dim=128,
                               faiss_index_factory_str="Flat")

    # Cleaning & indexing documents:
    # as in the previous tutorials, download, convert and index some Game of
    # Thrones articles into the document store.
    corpus_dir = "data/article_txt_got"
    archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=archive_url, output_dir=corpus_dir)

    article_dicts = convert_files_to_dicts(
        dir_path=corpus_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    store.write_documents(article_dicts)

    # Initialize retriever and reader/generator:
    # a RetriBERT-format EmbeddingRetriever is used, and update_embeddings
    # indexes the document embeddings in the FAISSDocumentStore.
    from haystack.retriever.dense import EmbeddingRetriever

    retriever = EmbeddingRetriever(
        document_store=store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert",
    )

    store.update_embeddings(retriever)

    # Before relying on the retriever, check empirically that a simple search
    # finds relevant documents.
    from haystack.utils import print_answers, print_documents
    from haystack.pipeline import DocumentSearchPipeline

    search_pipeline = DocumentSearchPipeline(retriever)
    search_result = search_pipeline.run(
        query="Tell me something about Arya Stark?",
        top_k_retriever=5,
    )
    print_documents(search_result, max_text_len=512)

    # As in previous tutorials, initialize the reader/generator. Here it is a
    # Seq2SeqGenerator with the yjernite/bart_eli5 model
    # (see: https://huggingface.co/yjernite/bart_eli5).
    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")

    # Pipeline:
    # a haystack Pipeline sticks the building blocks together into a search
    # pipeline. GenerativeQAPipeline is a predefined one that combines a
    # retriever and a reader/generator to answer questions.
    # More on pipelines: https://haystack.deepset.ai/docs/latest/pipelinesmd
    from haystack.pipeline import GenerativeQAPipeline
    qa_pipeline = GenerativeQAPipeline(generator, retriever)

    # Ask the questions.
    sample_queries = (
        "Why did Arya Stark's character get portrayed in a television adaptation?",
        "What kind of character does Arya Stark play?",
    )
    for query in sample_queries:
        outcome = qa_pipeline.run(query=query, top_k_retriever=1)
        print(f"Query: {query}")
        print(f"Answer: {outcome['answers'][0]}")
        print()

    # NOTE(review): the last query is run once more and its result discarded;
    # this looks like a leftover from a notebook cell -- confirm before
    # removing.
    qa_pipeline.run(query=sample_queries[-1], top_k_retriever=1)

Ejemplo n.º 7

0

Mostrar archivo

Archivo: haystack_preliminary_pipeline_v2.py Proyecto: rafilevy/virtual_agronomist

# Naive retriever based on tf * idf - default BM25, can be customised.
# NOTE: `document_store_ES` and `document_store_FAISS` are defined earlier in
# the file, outside this excerpt.
from haystack.retriever.sparse import ElasticsearchRetriever
es_retriever = ElasticsearchRetriever(document_store=document_store_ES)

# Alternative retriever - two BERT neural networks, one embedding the question
# and one embedding the documents
from haystack.retriever.dense import DensePassageRetriever
dpr_retriever = DensePassageRetriever(
    document_store=document_store_FAISS,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

# Update the embeddings of the documents in the FAISS store using the DPR retriever.
document_store_FAISS.update_embeddings(
    retriever=dpr_retriever)  # NOTE(review): original note read "possible training of dpr model" - confirm intent

# Alternative retriever - a single BERT model embeds both question and
# documents; may be better for similar documents (our case)
from haystack.retriever.dense import EmbeddingRetriever
embedding_retriever = EmbeddingRetriever(document_store=document_store_FAISS,
                                         embedding_model="deepset/sentence_bert")

# Reader to further scan with Hugging Face models
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
# reader = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2", use_gpu=True)

# Decide whether the answer should be retrieved from the tables or the general texts


class QueryClassifier:
    outgoing_edges = 2