Code Example #1
File: test_reader.py Project: shira07tech01/haystack
def test_context_window_size(test_docs_xs):
    # TODO parametrize window_size and farm/transformers reader using pytest
    docs = [
        Document.from_dict(d) if isinstance(d, dict) else d
        for d in test_docs_xs
    ]
    for window_size in [10, 15, 20]:
        farm_reader = FARMReader(
            model_name_or_path="distilbert-base-uncased-distilled-squad",
            num_processes=0,
            use_gpu=False,
            top_k_per_sample=5,
            no_ans_boost=None,
            context_window_size=window_size)
        prediction = farm_reader.predict(question="Who lives in Berlin?",
                                         documents=docs,
                                         top_k=5)
        for answer in prediction["answers"]:
            # If the extracted answer is larger than the context window, the context window is expanded.
            # If the extracted answer is odd in length, the resulting context window is one less than context_window_size
            # due to rounding (See FARM's QACandidate)
            # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
            if len(answer["answer"]) <= window_size:
                assert len(answer["context"]) in [window_size, window_size - 1]
            else:
                assert len(answer["answer"]) == len(answer["context"])
Code Example #2
def main():
    args = docopt(__doc__)
    data_dir = args["--data_dir"]
    if args["train"]:
        reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
        reader.train(data_dir=data_dir, train_filename=args["--train_file_name"],
                     dev_filename=args["--dev_file_name"], use_gpu=False, n_epochs=1,
                     save_dir=args["--save_dir"], dev_split=0.05)
    if args["test"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)
        print(reader.eval_on_file(data_dir, args["--eval_file_name"], 'cpu'))
    if args["cli"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)
        query_doc_list = []
        # One Document per text file (the original kept a commented-out
        # context.split(".") for splitting files into smaller passages).
        # A running counter keeps document ids unique; the original reused the
        # per-file index, so every document ended up with id "0".
        for doc_id, text_file in enumerate(glob.glob(data_dir + '/*.txt')):
            with open(text_file, "r") as f:
                context = f.read()
            query_doc_list.append(Document(id=str(doc_id), text=context))
        while True:
            question = input("CTRL C to exit >")
            prediction = reader.predict(question, query_doc_list)
            print("answer:>> ", prediction['answers'][0]['answer'])
            print("-----")
            print("context:>> ", prediction['answers'][0]['context'])
            print("-------------")
Code Example #3
def test_top_k(test_docs_xs):
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this

    docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
    farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", num_processes=0,
                             use_gpu=False, top_k_per_sample=4, no_ans_boost=None, top_k_per_candidate=4)
    for top_k in [2, 5, 10]:
        prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k)
        assert len(prediction["answers"]) == top_k
Code Example #4
def farm_distilbert():
    return FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        use_gpu=False,
        top_k_per_sample=5,
        num_processes=0
    )
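Functions like this one (and #6 and #13 below) read like pytest fixtures; a minimal sketch of declaring and consuming such a fixture, assuming pytest, a test_docs_xs fixture as in the tests above, and the pre-1.0 import path:

import pytest
from haystack.reader.farm import FARMReader  # import path assumed

@pytest.fixture
def farm_distilbert():
    return FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        use_gpu=False,
        top_k_per_sample=5,
        num_processes=0)

def test_reader_returns_answers(farm_distilbert, test_docs_xs):
    # pytest injects the fixture by parameter name
    prediction = farm_distilbert.predict(question="Who lives in Berlin?",
                                         documents=test_docs_xs, top_k=3)
    assert len(prediction["answers"]) > 0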
Code Example #5
File: haystack_api.py Project: Accuro-Lab/Data-ML
def feed_documents_to_model(model_name="deepset/roberta-base-squad2-covid"):
    """Feeds documents to model and returns a model ready to make predictions

    Parameters
    ----------
    model_name : str
        The path of the model to be selected from HuggingFace
        By default uses the pretrained version of roBERTa in squad2
        and covid articles

    Returns
    -------
    finder
        the model to use for predictions
    """

    # Initialize an in-memory Document Store
    document_store = InMemoryDocumentStore()
    # Load articles and format them as dictionaries
    articles = ret.get_data(MANIFEST, ARTICLES_FOLDER, [])
    dicts_textContent = process_documents(articles)
    # Store the dictionaries with article content in the Document Store
    document_store.write_documents(dicts_textContent)
    # The Retriever chooses the subset of documents that are relevant;
    # many techniques are possible: for dev purposes TfidfRetriever is faster
    retriever = TfidfRetriever(document_store=document_store)
    # The Reader provides the interface to the pretrained transformer models;
    # by default we're using RoBERTa
    reader = FARMReader(model_name_or_path=model_name, use_gpu=False)
    # The Finder combines Reader and Retriever to produce predictions
    finder = Finder(reader, retriever)

    return finder
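A usage sketch for the returned finder, following the get_answers calls used in the other examples on this page (the question text is illustrative):

finder = feed_documents_to_model()
prediction = finder.get_answers(question="What are the symptoms of COVID-19?",
                                top_k_retriever=10,
                                top_k_reader=5)
for res in prediction["answers"]:
    print(res["answer"], "|", res["context"])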
Code Example #6
def reader_without_normalized_scores():
    return FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        use_gpu=False,
        top_k_per_sample=5,
        num_processes=0,
        use_confidence_scores=False)
Code Example #7
def initFinder():
    """
    Function to initiate retriever, reader and finder
    Parameters
    ----------
    Returns
    -------
        finder (object): Haystack finder
    """
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
        embed_title=True,
        max_seq_len=256,
        batch_size=16,
        remove_sep_tok_from_untitled_passages=True)
    # Important:
    # Now that we have DPR initialized, we need to call update_embeddings() to iterate over all
    # previously indexed documents and update their embedding representation.
    # While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
    # At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
    document_store.update_embeddings(retriever)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=False)
    return Finder(reader, retriever)
Code Example #8
def init():
    ### Model values for Reader and Document Store
    global document_store, retriever, reader, finder
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
    retriever = ElasticsearchRetriever(document_store=document_store)
    reader = FARMReader(model_name_or_path='deepset/roberta-base-squad2-covid', use_gpu=False)
    finder = Finder(reader, retriever)
Code Example #9
File: main.py Project: ewgdg/my-page
    def load(self):
        if(self.finder and self.finder2):
            return
        if(not self.document_store2):
            self.document_store2 = FAISSDocumentStore.load(
                sql_url=sqlUrlFAQ, faiss_file_path='faiss2')  # save before load in preprocess
            self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)
        # else:  # reset session
        #     # self.document_store2.session.close()
        #     super(
        #         FAISSDocumentStore, self.document_store2).__init__(url=sqlUrlFAQ)
        if(not self.retriever2):
            self.retriever2 = EmbeddingRetriever(document_store=self.document_store2,
                                                 embedding_model="sentence_bert-saved", use_gpu=False)
        if(not self.finder2):
            self.finder2 = Finder(reader=None, retriever=self.retriever2)

        if(not self.document_store):
            self.document_store = SQLDocumentStore(url=sqlUrl)
            # FAISSDocumentStore.load(faiss_file_path='faiss1', sql_url=sqlUrl)
            self.initSql(url=sqlUrl, document_store=self.document_store)
        # else:  # reset session
        #     # self.document_store.session.close()
        #     super(
        #         FAISSDocumentStore, self.document_store).__init__(url=sqlUrl)
        # self.retriever = EmbeddingRetriever(  # reduce load by sharing the same retriever and setting the store on the fly?
        #     document_store=self.document_store, embedding_model="sentence_bert-saved", use_gpu=False) if not self.retriever else self.retriever
        if(not self.retriever):
            self.retriever = TfidfRetriever(document_store=self.document_store)
        self.reader = FARMReader(model_name_or_path=modelDir,
                                 use_gpu=False, no_ans_boost=0) if not self.reader else self.reader
        # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
        self.finder = Finder(
            self.reader, self.retriever) if not self.finder else self.finder
Code Example #10
def qna():
    """Return the n answers."""

    question = request.form['question']
    # index is the target index where queries need to be sent.
    index = request.form['index']

    # select the trained or untrained model
    mode = request.form['mode']

    # Initialize the Haystack Elasticsearch document store
    document_store = ElasticsearchDocumentStore(
        host=app.config["host"],
        username=app.config["username"],
        password=app.config["password"],
        index=index)

    if mode == 'trained':
        # based on the search mode, load the fine-tuned model
        reader = FARMReader(model_name_or_path=app.config["train_model"],
                            use_gpu=False)
    else:
        # based on the search mode, load the pretrained model
        reader = FARMReader(
            model_name_or_path="distilbert-base-uncased-distilled-squad",
            use_gpu=False)

    # Initialize the ElasticsearchRetriever
    retriever = ElasticsearchRetriever(document_store=document_store)
    # Finder sticks together reader and retriever
    # in a pipeline to answer our actual questions.
    finder = Finder(reader, retriever)

    # predict n answers
    n = int(request.form['n'])
    prediction = finder.get_answers(question=question,
                                    top_k_retriever=10,
                                    top_k_reader=n)
    answer = []
    for res in prediction['answers']:
        answer.append(res['answer'])

    return json.dumps({
        'status': 'success',
        'message': 'Processed successfully',
        'result': answer
    })
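A client-side sketch for exercising this endpoint with requests; the route path and port are assumptions (the route decorator is not shown), while the form fields mirror the request.form lookups above:

import requests

resp = requests.post(
    "http://localhost:5000/qna",  # route and port are assumptions
    data={"question": "Who lives in Berlin?",
          "index": "document",
          "mode": "trained",
          "n": 3})
print(resp.json()["result"])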
Code Example #11
File: conftest.py Project: gradjitta/haystack
def no_answer_reader(request):
    if request.param == "farm":
        return FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                          use_gpu=False, top_k_per_sample=5, no_ans_boost=0, num_processes=0)
    if request.param == "transformers":
        return TransformersReader(model="deepset/roberta-base-squad2",
                                  tokenizer="deepset/roberta-base-squad2",
                                  use_gpu=-1, n_best_per_passage=5)
Code Example #12
File: conftest.py Project: gradjitta/haystack
def reader(request):
    if request.param == "farm":
        return FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
                          use_gpu=False, top_k_per_sample=5, num_processes=0)
    if request.param == "transformers":
        return TransformersReader(model="distilbert-base-uncased-distilled-squad",
                                  tokenizer="distilbert-base-uncased",
                                  use_gpu=-1)
Code Example #13
def farm_roberta():
    return FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=False,
        top_k_per_sample=5,
        no_ans_boost=0,
        num_processes=0
    )
Code Example #14
File: test_reader.py Project: gradjitta/haystack
def test_top_k(test_docs_xs):
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this
    docs = []
    for d in test_docs_xs:
        doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"])
        docs.append(doc)
    farm_reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        num_processes=0,
        use_gpu=False,
        top_k_per_sample=4,
        no_ans_boost=None,
        top_k_per_candidate=4)
    for top_k in [2, 5, 10]:
        prediction = farm_reader.predict(question="Who lives in Berlin?",
                                         documents=docs,
                                         top_k=top_k)
        assert len(prediction["answers"]) == top_k
Code Example #15
    def __init__(self):
        self.document_store = ElasticsearchDocumentStore(host="localhost",
                                                         username="",
                                                         password="",
                                                         index="document")
        self.retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        self.reader = FARMReader(
            model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
        self.finder = Finder(self.reader, self.retriever)
        print('Ready')
Code Example #16
File: pre_process.py Project: chaozy/QASystem
def process(document_store):

    logger = logging.getLogger(__name__)
    # # Connect to Elasticsearch
    # document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
    #
    # # write the docs to the DB.
    # document_store.write_documents(file)
    # ## Initialize Retriever, Reader & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use simple but fast algorithms.
    # Elasticsearch's default BM25 algorithm is used here
    retriever = ElasticsearchRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=False,
                        context_window_size=500)

    # Some default pipelines that can be chosen from
    # Extractive QA
    # qa_pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

    # Document Search
    # doc_pipe = DocumentSearchPipeline(retriever=retriever)

    # Generative QA
    # doc_pipe = GenerativeQAPipeline(generator=rag_generator, retriever=retriever)

    # FAQ based QA
    # doc_pipe = FAQPipeline(retriever=retriever)

    # p = FAQPipeline(retriever=retriever)

    p = ExtractiveQAPipeline(reader, retriever)
    query_Handler.pipe = p
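After process() has run, other modules would presumably query through the stored pipeline; a minimal sketch, assuming query_Handler is importable and using the same run signature and print_answers helper as the tutorial snippets below:

from haystack.utils import print_answers  # import path assumed

prediction = query_Handler.pipe.run(query="Who lives in Berlin?",
                                    top_k_retriever=10,
                                    top_k_reader=3)
print_answers(prediction, details="minimal")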
Code Example #17
File: models.py Project: dimatrubca/ai-chatbot-api
    def __init__(self, id, add_sample_data=False):
        Model.__init__(self, id)

        doc_store = ElasticsearchDocumentStore(host=DB_HOST,
                                               port=DB_PORT,
                                               index=self.id)
        retriever = ElasticsearchRetriever(document_store=doc_store)

        reader = FARMReader(
            model_name_or_path=READER_MODEL_PATH,
            batch_size=BATCHSIZE,
            use_gpu=False,
            num_processes=MAX_PROCESSES,
        )
        self.finder = Finder(reader, retriever)

        if add_sample_data:
            add_sample_data_doc_qa(self)

        reader.save(directory=READER_MODEL_PATH)
        print("saved")
Code Example #18
File: run.py Project: ZapAutomation/DocumentAnalysis
def get_results(txt_files_location, use_gpu, questions_list, results_location):

    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    document_store.client.indices.delete(index='document',
                                                         ignore=[400, 404])

                    doc_dir = dirpath

                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)

                    document_store.write_documents(dicts)

                    retriever = ElasticsearchRetriever(
                        document_store=document_store)

                    reader = FARMReader(
                        model_name_or_path=
                        "elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)

                    finder = Finder(reader, retriever)

                    sys.stdout = open(
                        os.path.join(results_location,
                                     file_name[:-4] + "_results.txt"), "a+")

                    for i, question in enumerate(questions_list):

                        prediction = finder.get_answers(question=question,
                                                        top_k_retriever=10,
                                                        top_k_reader=1)

                        print("\n\n\nQuestion " + str(i + 1) + ":\n")
                        print(question + "\n")
                        print_answers(prediction, details="minimal")

                    sys.stdout.close()

    document_store.client.transport.close()
Code Example #19
    def qa(self, question, text_field):
        document_store = ElasticsearchDocumentStore(host=ES_HOST,
                                                    username=ES_USERNAME,
                                                    password=ES_PASSWORD,
                                                    index=self.ELASTIC_INDEX,
                                                    text_field=text_field)
        retriever = TfidfRetriever(document_store=document_store)

        reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                            use_gpu=False)

        finder = Finder(reader, retriever)
        prediction = finder.get_answers(question=question,
                                        top_k_retriever=1,
                                        top_k_reader=5)

        return prediction
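The returned prediction is a plain dict; a sketch of pulling out the answer fields used elsewhere on this page (searcher stands in for an instance of the surrounding class and is hypothetical):

prediction = searcher.qa(question="Who lives in Berlin?", text_field="text")
for ans in prediction["answers"]:
    print(ans["answer"], "|", ans["context"], "|", ans["score"])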
Code Example #20
def Find_answer(text_file_path, data_folder_path, symbol, question):
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    with open(text_file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    for i, line in enumerate(data.split(symbol)):
        with open(f'{data_folder_path}/data{i+1}.txt', 'w') as f:
            print(f'writing file no.{i+1}')
            f.write(line)

    test_dicts = convert_files_to_dicts(dir_path=data_folder_path,
                                        clean_func=clean_wiki_text,
                                        split_paragraphs=True)
    document_store.write_documents(test_dicts)
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    document_store.update_embeddings(retriever)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True,
                        context_window_size=300)

    pipe = ExtractiveQAPipeline(reader, retriever)

    prediction = pipe.run(query=question, top_k_retriever=10, top_k_reader=3)

    doc_with_ans = []
    for ans in prediction['answers']:
        if ans['context'] not in doc_with_ans:
            doc_with_ans.append(ans['context'])

    answer = ' '.join(doc_with_ans)

    return answer
Code Example #21
def tutorial2_finetune_a_model_on_your_data():
    # ## Create Training Data
    #
    # There are two ways to generate training data
    #
    # 1. **Annotation**: You can use the annotation tool (https://github.com/deepset-ai/haystack#labeling-tool) to label
    #                    your data, i.e. highlight answers to your questions in a document. The tool supports structuring
    #                    your workflow with organizations, projects, and users. The labels can be exported in SQuAD format,
    #                    which is compatible with training in Haystack.
    #
    # 2. **Feedback**:   For production systems, you can collect training data from direct user feedback via Haystack's
    #                    REST API interface. This includes a customizable user feedback API for providing feedback on the
    #                    answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data
    #                    for fine-tuning your model further.
    #
    #
    # ## Fine-tune your model
    #
    # Once you have collected training data, you can fine-tune your base models.
    # We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format).
    # We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer
    # Learning effects.

    # **Recommendation:** Run training on a GPU. To do so, change the `use_gpu` arguments below to `True`.

    reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        use_gpu=True)
    train_data = "data/squad20"
    # train_data = "PATH/TO_YOUR/TRAIN_DATA"
    reader.train(data_dir=train_data,
                 train_filename="dev-v2.0.json",
                 use_gpu=True,
                 n_epochs=1,
                 save_dir="my_model")

    # Saving the model happens automatically at the end of training into the `save_dir` you specified
    # However, you could also save a reader manually again via:
    reader.save(directory="my_model")

    # If you want to load it at a later point, just do:
    new_reader = FARMReader(model_name_or_path="my_model")
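For reference, the SQuAD-like format that reader.train expects looks roughly like this; a hand-written minimal sketch, not an excerpt of the downloaded dev-v2.0.json:

squad_like = {
    "version": "v2.0",
    "data": [{
        "title": "Berlin",
        "paragraphs": [{
            "context": "Carla lives in Berlin.",
            "qas": [{
                "id": "q1",
                "question": "Who lives in Berlin?",
                "is_impossible": False,
                "answers": [{"text": "Carla", "answer_start": 0}],
            }],
        }],
    }],
}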
Code Example #22
def create_app(config_name):
    app = Flask(__name__)
    app.config.from_object(config[config_name])
    host = config[config_name].ELASTIC_URL
    port = config[config_name].ELASTIC_PORT
    index = config[config_name].ELASTIC_INDEX

    doc_store = ElasticsearchDocumentStore(host=host,
                                           username='',
                                           password='',
                                           index=index)

    retriever = ElasticsearchRetriever(document_store=doc_store)
    model_name = "deepset/roberta-base-squad2"
    reader = FARMReader(model_name_or_path=model_name,
                        num_processes=0,
                        use_gpu=False)
    app.finder = Finder(reader, retriever)

    from app.main import main as main_blueprint
    app.register_blueprint(main_blueprint)

    return app
Code Example #23
File: model.py Project: VictorMadu/COVID-QA
# english_retriever
english_retriever = ElasticsearchRetriever(
    document_store=document_store,
    embedding_model=EMBEDDING_MODEL_PATH,
    gpu=USE_GPU,
    pooling_strategy=EMBEDDING_POOLING_STRATEGY,
    emb_extraction_layer=EMBEDDING_EXTRACTION_LAYER)

if READER_MODEL_PATH:
    # needed for extractive QA
    reader = FARMReader(
        model_name_or_path=str(READER_MODEL_PATH),
        batch_size=BATCHSIZE,
        use_gpu=USE_GPU,
        context_window_size=CONTEXT_WINDOW_SIZE,
        top_k_per_candidate=TOP_K_PER_CANDIDATE,
        no_ans_boost=NO_ANS_BOOST,
        max_processes=MAX_PROCESSES,
        max_seq_len=MAX_SEQ_LEN,
        doc_stride=DOC_STRIDE,
    )
else:
    # don't need one for pure FAQ matching
    reader = None

FINDERS = {
    1: Finder(reader=reader, retriever=retriever),
    2: Finder(reader=reader, retriever=english_retriever)
}

Code Example #24
    return document_store

@st.cache(allow_output_mutation=True)
def retriever():
    document_store = read_corpus()
    retriever = TfidfRetriever(document_store=document_store)
    return retriever

question = st.text_input('Input your question here:')

if st.button('Ask'):
    with st.spinner('Reading all the translations from all over Quran'):
        retriever = retriever()
        
        if not path.exists('data/mlm-temp'):
            reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", use_gpu=False)
            reader.save(directory='data/mlm-temp')
            st.info('Downloaded Fresh Model')
        else:
            reader = FARMReader(model_name_or_path="data/mlm-temp", use_gpu=False)
            st.info('Re-Used Model')
            
        finder = Finder(reader, retriever)
        
        prediction = finder.get_answers(question=question, top_k_retriever=10, top_k_reader=5)

        keys = ['answer', 'context', 'meta', 'probability', 'score']
        print(list(map(prediction.get, ['query'])))
        print("\n")
        answer_frame = pd.DataFrame.from_records([list(map(i.get, keys)) for i in prediction['answers']])
        answer_frame.columns = ['answer', 'reference', 'Surah', 'confidence', 'score']
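The DataFrame would then typically be rendered back into the app; a one-line sketch using Streamlit's built-in table widget:

st.table(answer_frame)  # or st.dataframe(answer_frame) for a scrollable view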
Code Example #25
# from haystack.retriever.dense import EmbeddingRetriever
# retriever = EmbeddingRetriever(document_store=document_store,
#                                embedding_model="deepset/sentence_bert",
#                                model_format="farm")

# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
# reader = TransformersReader(model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
# reader = FARMReader(model_name_or_path="twmkn9/albert-base-v2-squad2", use_gpu=True)
# reader = FARMReader(model_name_or_path="roberta-large", use_gpu=True)
# reader = FARMReader(model_name_or_path="csarron/mobilebert-uncased-squad-v2", use_gpu=True)
# reader = FARMReader(model_name_or_path="deepset/xlm-roberta-large-squad2", use_gpu=True)

# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
# reader = FARMReader(model_name_or_path="deepset/xlm-roberta-large-squad2", use_gpu=True)
reader = FARMReader(model_name_or_path="twmkn9/albert-base-v2-squad2",
                    use_gpu=True)
# reader = FARMReader(model_name_or_path="ktrapeznikov/albert-xlarge-v2-squad-v2", use_gpu=True)

finder = Finder(reader, retriever)

prediction = finder.get_answers(question="Who is the father of Arya Stark?",
                                top_k_retriever=40,
                                top_k_reader=5)

print_answers(prediction, details="minimal")

# print("\n\n")
# print(prediction)
Code Example #26
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", create_index=False)
# Add evaluation data to Elasticsearch database
if LAUNCH_ELASTICSEARCH:
    document_store.add_eval_data("../data/nq/nq_dev_subset.json")
else:
    logger.warning("Since we already have a running ES instance we should not index the same documents again."
                   "If you still want to do this call: 'document_store.add_eval_data('../data/nq/nq_dev_subset.json')' manually ")

# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Initialize Reader
reader = FARMReader("deepset/roberta-base-squad2")

# Initialize Finder which sticks together Reader and Retriever
finder = Finder(reader, retriever)


## Evaluate Retriever on its own
if eval_retriever_only:
    retriever_eval_results = retriever.eval()
    ## Retriever Recall is the proportion of questions for which the correct document containing the answer is
    ## among the retrieved documents
    print("Retriever Recall:", retriever_eval_results["recall"])
    ## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
    print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

# Evaluate Reader on its own
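The snippet cuts off here; in the matching evaluation tutorial the reader is scored the same way, roughly as below (a sketch; the eval signature and result keys are assumptions based on the same haystack generation):

if eval_reader_only:
    reader_eval_results = reader.eval(document_store=document_store, device="cpu")
    ## Reader Top-N-Accuracy is the proportion of questions where a gold answer appears in the top n predictions
    print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
    ## Reader Exact Match is the proportion of questions where the predicted answer matches a gold answer exactly
    print("Reader Exact Match:", reader_eval_results["EM"])
    ## Reader F1-Score measures token overlap between predicted and gold answers
    print("Reader F1-Score:", reader_eval_results["f1"])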
Code Example #27
### Retriever
retriever = DensePassageRetriever(document_store=document_store,
                                  embedding_model="dpr-bert-base-nq",
                                  do_lower_case=True,
                                  use_gpu=True)
# Important:
# Now that we have DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)

### Reader
# Load a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=True)

### Finder
# The Finder sticks together reader and retriever in a pipeline to answer our actual questions.
finder = Finder(reader, retriever)

### Voilà! Ask a question!
# You can configure how many candidates the reader and retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers.
prediction = finder.get_answers(question="Who is the father of Arya Stark?",
                                top_k_retriever=10,
                                top_k_reader=5)

# prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
# prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)
Code Example #28
write_documents_to_db(document_store=document_store,
                      document_dir=doc_dir,
                      clean_func=clean_wiki_text,
                      only_empty_db=True)

## Initialize Reader, Retriever & Finder

# A retriever identifies the k most promising chunks of text that might contain the answer to our question
# Retrievers use simple but fast algorithms, here: TF-IDF
retriever = TfidfRetriever(document_store=document_store)

# A reader scans the text chunks in detail and extracts the k best answers
# Readers use more powerful but slower deep learning models
# You can select a local model or any of the QA models published on Hugging Face's model hub (https://huggingface.co/models)
# here: a medium sized BERT QA model trained via FARM on SQuAD 2.0
reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2",
                    use_gpu=False)

# OR: use alternatively a reader from huggingface's transformers package (https://github.com/huggingface/transformers)
# reader = TransformersReader(model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

# The Finder sticks together reader and retriever in a pipeline to answer our actual questions
finder = Finder(reader, retriever)

## Voilà! Ask a question!
# You can configure how many candidates the reader and retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers.
prediction = finder.get_answers(question="Who is the father of Arya Stark?",
                                top_k_retriever=10,
                                top_k_reader=5)

#prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
Code Example #29
def tutorial3_basic_qa_pipeline_without_elasticsearch():
    # In-Memory Document Store
    document_store = InMemoryDocumentStore()

    # or, alternatively, SQLite Document Store
    # document_store = SQLDocumentStore(url="sqlite:///qa.db")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles on Game of Thrones, apply a basic cleaning function, and index
    # them in the DocumentStore.
    # Let's first get some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    # ## Initialize Retriever, Reader & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where
    # a given question could be answered.
    #
    # With InMemoryDocumentStore or SQLDocumentStore, you can use the TfidfRetriever. For more
    # retrievers, please refer to Tutorial 1.

    # An in-memory TfidfRetriever based on Pandas dataframes
    retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).

    # **Here:**                   a medium sized RoBERTa QA model using a Reader based on
    #                             FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):**  TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):**  e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                             "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:**                   You can adjust the model to return "no answer possible" with the no_ans_boost.
    #                             Higher values mean the model prefers "no answer possible".

    # #### FARMReader
    #
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # #### TransformersReader
    # Alternative:
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Code Example #30
def tutorial1_basic_qa_pipeline():
    logger = logging.getLogger(__name__)

    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of
    # `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.
    #
    # **Here:** We recommend Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval,
    # and vector storage for text embeddings.
    # **Alternatives:** If you are unable to set up an Elasticsearch instance, then follow Tutorial 3
    # for using SQL/InMemory document stores.
    # **Hint**:
    # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can
    # configure Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in
    # your environment (e.g., in Colab notebooks), then you can manually download and execute Elasticsearch from source.

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and add
    # them in Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance we should not index the same documents again. \n"
            "If you still want to do this call: document_store.write_documents(dicts) manually "
        )

    # ## Initialize Retriever, Reader & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use simple but fast algorithms.
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    # - Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
    # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of
    #   embeddings (e.g. created via Sentence-BERT)
    # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging

    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick prototypes
    # with SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean
    #           the model prefers "no answer possible"
    #
    # #### FARMReader

    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # #### TransformersReader

    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")