def tutorial2_finetune_a_model_on_your_data():
    """Fine-tune a pretrained QA reader on a SQuAD-format dataset.

    Creating training data
    ----------------------
    There are two ways to generate training data:

    1. **Annotation**: Use the annotation tool
       (https://github.com/deepset-ai/haystack#labeling-tool) to label your
       data, i.e. highlight answers to your questions in a document. The tool
       supports structuring your workflow with organizations, projects, and
       users. The labels can be exported in SQuAD format, which is compatible
       with training in Haystack.

    2. **Feedback**: For production systems, you can collect training data
       from direct user feedback via Haystack's REST API interface. This
       includes a customizable user feedback API for providing feedback on
       the answer returned by the API, and a feedback export endpoint to
       obtain the feedback data for fine-tuning your model further.

    Fine-tuning the model
    ---------------------
    Once training data has been collected, a reader is initialized as the
    base model and fine-tuned on the custom dataset (which should be in a
    SQuAD-like format). A base model that was already trained on SQuAD or a
    similar QA dataset is recommended, to benefit from transfer learning.

    Recommendation: run training on a GPU. Both ``use_gpu`` arguments below
    are set to ``True``; switch them to ``False`` if no GPU is available.
    """
    # Point this at "PATH/TO_YOUR/TRAIN_DATA" to train on your own data.
    squad_dir = "data/squad20"
    model_dir = "my_model"

    base_reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        use_gpu=True,
    )
    base_reader.train(
        data_dir=squad_dir,
        train_filename="dev-v2.0.json",
        use_gpu=True,
        n_epochs=1,
        save_dir=model_dir,
    )

    # Training already stores the model in `save_dir` when it finishes; the
    # explicit call below only demonstrates how to save a reader manually.
    base_reader.save(directory=model_dir)

    # Loading the fine-tuned model again later works like loading any model.
    reloaded_reader = FARMReader(model_name_or_path=model_dir)
Example #2
0
    def __init__(self, id, add_sample_data=False):
        """Set up the document store, retriever, reader and finder.

        Args:
            id: Identifier forwarded to ``Model.__init__``. ``self.id`` is
                afterwards used as the Elasticsearch index name (presumably
                set by ``Model.__init__`` -- confirm against the base class).
            add_sample_data: When truthy, ``add_sample_data_doc_qa`` is
                called with this instance once the finder exists.
        """
        Model.__init__(self, id)

        es_store = ElasticsearchDocumentStore(
            host=DB_HOST,
            port=DB_PORT,
            index=self.id,
        )
        es_retriever = ElasticsearchRetriever(document_store=es_store)

        qa_reader = FARMReader(
            model_name_or_path=READER_MODEL_PATH,
            batch_size=BATCHSIZE,
            use_gpu=False,
            num_processes=MAX_PROCESSES,
        )
        self.finder = Finder(qa_reader, es_retriever)

        if add_sample_data:
            add_sample_data_doc_qa(self)

        # Write the reader to the same location it was loaded from.
        qa_reader.save(directory=READER_MODEL_PATH)
        print("saved")
Example #3
0
@st.cache(allow_output_mutation=True)
def retriever():
    """Return a ``TfidfRetriever`` built on the store from ``read_corpus()``."""
    return TfidfRetriever(document_store=read_corpus())

# Streamlit front end: read a question, run retriever + reader over the
# corpus, and collect the answers into a DataFrame.
question = st.text_input('Input your question here:')

if st.button('Ask'):
    with st.spinner('Reading all the translations from all over Quran'):
        # NOTE: this rebinds the module-level name `retriever` from the
        # cached factory function to the retriever object it returns.
        retriever = retriever()

        # Keep a local copy of the reader model so later runs can load it
        # from 'data/mlm-temp' instead of fetching it by model name again.
        if not path.exists('data/mlm-temp'):
            reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", use_gpu=False)
            reader.save(directory='data/mlm-temp')
            st.info('Downloaded Fresh Model')
        else:
            reader = FARMReader(model_name_or_path="data/mlm-temp", use_gpu=False)
            st.info('Re-Used Model')

        finder = Finder(reader, retriever)

        prediction = finder.get_answers(question=question, top_k_retriever=10, top_k_reader=5)

        # Fields pulled from every answer dict, in output-column order.
        keys = ['answer', 'context', 'meta', 'probability', 'score']
        print([prediction.get('query')])
        print("\n")

        # Column names are passed to the constructor rather than assigned
        # afterwards: with an empty answer list the frame would otherwise
        # have zero columns and the assignment would raise a ValueError.
        answer_frame = pd.DataFrame.from_records(
            [[answer.get(key) for key in keys] for answer in prediction['answers']],
            columns=['answer', 'reference', 'Surah', 'confidence', 'score'],
        )
# 1. **Annotation**: You can use the annotation tool (https://github.com/deepset-ai/haystack#labeling-tool) to label
#                    your data, i.e. highlighting answers to your questions in a document. The tool supports structuring
#                    your workflow with organizations, projects, and users. The labels can be exported in SQuAD format,
#                    which is compatible with training in Haystack.
# 
# 2. **Feedback**:   For production systems, you can collect training data from direct user feedback via Haystack's
#                    REST API interface. This includes a customizable user feedback API for providing feedback on the
#                    answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data
#                    for fine-tuning your model further.
# 
# 
# ## Fine-tune your model
# 
# Once you have collected training data, you can fine-tune your base models.
# We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format).
# We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer
# Learning effects.

# **Recommendation**: Run training on a GPU. The `use_gpu` arguments below are already set to `True`;
# change them to `False` if no GPU is available.

# Start from a reader checkpoint that was already trained on SQuAD.
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=True,
)

# Directory holding the training file; point this at
# "PATH/TO_YOUR/TRAIN_DATA" to train on your own data.
train_data = "data/squad20"
reader.train(
    data_dir=train_data,
    train_filename="dev-v2.0.json",
    use_gpu=True,
    n_epochs=1,
    save_dir="my_model",
)

# Training already stores the model in `save_dir` when it finishes; the
# explicit call below only demonstrates how to save a reader manually.
reader.save(directory="my_model")

# Loading the fine-tuned model again later works like loading any model.
new_reader = FARMReader(model_name_or_path="my_model")
Example #5
0
#                   your workflow with organizations, projects, and users. The labels can be exported in SQuAD format
#                    that is compatible for training with Haystack.
#
# 2. **Feedback**:   For production systems, you can collect training data from direct user feedback via Haystack's
#                    REST API interface. This includes a customizable user feedback API for providing feedback on the
#                    answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data
#                    for fine-tuning your model further.
#
#
# ## Fine-tune your model
#
# Once you have collected training data, you can fine-tune your base models.
# We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format).
# We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer
# Learning effects.
# Base model to fine-tune; everything below runs on CPU (use_gpu=False).
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
)

# NOTE: the data and output locations are absolute, machine-specific paths;
# replace them with "PATH/TO_YOUR/TRAIN_DATA" and your own output directory.
reader.train(
    data_dir="/home/bulelani/Desktop/odin/odin/src_new/data/training",
    train_filename="answers.json",
    use_gpu=False,
    n_epochs=1,
    save_dir="/home/bulelani/Desktop/odin/saved_models",
)

# Training already stores the model in `save_dir` when it finishes; the
# explicit call below only demonstrates how to save a reader manually.
reader.save(directory="/home/bulelani/Desktop/odin/saved_models")

# If you want to load it at a later point, just do:
# new_reader = FARMReader(model_name_or_path="my_model")
Example #6
0
from haystack.reader.farm import FARMReader

# Base model to fine-tune; everything below runs on CPU (use_gpu=False).
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False,
)

# NOTE: absolute, machine-specific location of the training file; point
# this at "PATH/TO_YOUR/TRAIN_DATA" to train on your own data.
train_data = "/home/bulelani/Desktop/odin/odin/src_new/data/training"
reader.train(
    data_dir=train_data,
    train_filename="demo.json",
    use_gpu=False,
    n_epochs=100,
    save_dir="/home/bulelani/Desktop/odin/my_model",
)

# Training already stores the model in `save_dir` when it finishes; the
# explicit call below only demonstrates how to save a reader manually.
reader.save(directory="/home/bulelani/Desktop/odin/my_model")