def tutorial2_finetune_a_model_on_your_data():
    """Fine-tune a pre-trained QA reader on SQuAD-style data, save it, reload it.

    Create training data
    --------------------
    There are two ways to generate training data:

    1. Annotation: use the annotation tool
       (https://github.com/deepset-ai/haystack#labeling-tool) to label your
       data, i.e. highlight the answers to your questions in a document. The
       tool supports structuring your workflow with organizations, projects,
       and users. Labels can be exported in SQuAD format, which is compatible
       with training in Haystack.
    2. Feedback: for production systems, collect training data from direct
       user feedback via Haystack's REST API. It includes a customizable user
       feedback API for rating the returned answers, and a feedback export
       endpoint to obtain that data for further fine-tuning.

    Fine-tune your model
    --------------------
    Once training data is available, a base model can be fine-tuned on it.
    A reader is initialized from a base model and trained on a custom dataset
    (which should be in a SQuAD-like format). A base model that was already
    trained on SQuAD or a similar QA dataset is recommended, to benefit from
    transfer learning.
    """
    # Training on a GPU is recommended, so both `use_gpu` flags below are
    # True; switch them to False when no GPU is available.
    base_model = "distilbert-base-uncased-distilled-squad"
    model_dir = "my_model"

    reader = FARMReader(model_name_or_path=base_model, use_gpu=True)

    # Point this at "PATH/TO_YOUR/TRAIN_DATA" to train on your own dataset.
    squad_dir = "data/squad20"
    reader.train(
        data_dir=squad_dir,
        train_filename="dev-v2.0.json",
        use_gpu=True,
        n_epochs=1,
        save_dir=model_dir,
    )

    # The model is saved automatically into `save_dir` at the end of
    # training; this call only demonstrates saving a reader manually.
    reader.save(directory=model_dir)

    # Loading the fine-tuned model again at a later point works like this.
    new_reader = FARMReader(model_name_or_path=model_dir)
def __init__(self, id, add_sample_data=False):
    """Wire up an Elasticsearch-backed retriever and a FARM reader as a Finder.

    Args:
        id: Identifier forwarded to ``Model.__init__``.
        add_sample_data: When true, ``add_sample_data_doc_qa(self)`` is
            called after the finder has been created.
    """
    Model.__init__(self, id)

    # The document store index is named after self.id (presumably set by
    # Model.__init__ above -- confirm against the base class).
    store = ElasticsearchDocumentStore(
        host=DB_HOST,
        port=DB_PORT,
        index=self.id,
    )
    es_retriever = ElasticsearchRetriever(document_store=store)

    reader_options = {
        "model_name_or_path": READER_MODEL_PATH,
        "batch_size": BATCHSIZE,
        "use_gpu": False,
        "num_processes": MAX_PROCESSES,
    }
    farm_reader = FARMReader(**reader_options)

    self.finder = Finder(farm_reader, es_retriever)

    if add_sample_data:
        add_sample_data_doc_qa(self)

    # NOTE(review): the source formatting was collapsed, so it is assumed the
    # save below runs unconditionally rather than inside the `if` -- confirm.
    # The reader is written back to the same path it was loaded from.
    farm_reader.save(directory=READER_MODEL_PATH)
    print("saved")
@st.cache(allow_output_mutation=True)
def retriever():
    """Return a TfidfRetriever built on the document store from read_corpus()."""
    corpus_store = read_corpus()
    return TfidfRetriever(document_store=corpus_store)


question = st.text_input('Input your question here:')

# NOTE(review): the block nesting below was reconstructed from a
# whitespace-collapsed source; confirm what belongs inside the spinner.
if st.button('Ask'):
    with st.spinner('Reading all the translations from all over Quran'):
        # Rebinds the module-level name from the factory to its result.
        retriever = retriever()

        # Reuse the locally saved reader when present; otherwise fetch the
        # model by name and save it to 'data/mlm-temp' for the next run.
        if path.exists('data/mlm-temp'):
            reader = FARMReader(model_name_or_path="data/mlm-temp", use_gpu=False)
            st.info('Re-Used Model')
        else:
            reader = FARMReader(
                model_name_or_path="deepset/minilm-uncased-squad2",
                use_gpu=False,
            )
            reader.save(directory='data/mlm-temp')
            st.info('Downloaded Fresh Model')

        finder = Finder(reader, retriever)
        prediction = finder.get_answers(
            question=question,
            top_k_retriever=10,
            top_k_reader=5,
        )

    keys = ['answer', 'context', 'meta', 'probability', 'score']
    print([prediction.get('query')])
    print("\n")

    # One row per answer, with the fields taken in the order given by `keys`.
    answer_frame = pd.DataFrame.from_records(
        [[answer.get(key) for key in keys] for answer in prediction['answers']]
    )
    answer_frame.columns = ['answer', 'reference', 'Surah', 'confidence', 'score']
    # NOTE(review): this self-assignment leaves the column as it is; it looks
    # like a placeholder for a transformation of the meta values -- confirm.
    answer_frame['Surah'] = answer_frame['Surah']
# 1. **Annotation**: Use the annotation tool
#    (https://github.com/deepset-ai/haystack#labeling-tool) to label your data,
#    i.e. highlight the answers to your questions in a document. The tool
#    supports structuring your workflow with organizations, projects, and
#    users. Labels can be exported in SQuAD format, which is compatible with
#    training in Haystack.
#
# 2. **Feedback**: For production systems, collect training data from direct
#    user feedback via Haystack's REST API. It includes a customizable user
#    feedback API for rating the answers returned by the API, and a feedback
#    export endpoint to obtain that data for further fine-tuning.
#
#
# ## Fine-tune your model
#
# Once training data is available, a base model can be fine-tuned on it.
# A reader is initialized from a base model and trained on a custom dataset
# (which should be in a SQuAD-like format). A base model that was already
# trained on SQuAD or a similar QA dataset is recommended, to benefit from
# transfer learning.

# **Recommendation**: run training on a GPU. Both `use_gpu` arguments below
# are set to `True`; switch them to `False` when no GPU is available.
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=True,
)

# Point this at "PATH/TO_YOUR/TRAIN_DATA" to train on your own dataset.
train_data = "data/squad20"
reader.train(
    data_dir=train_data,
    train_filename="dev-v2.0.json",
    use_gpu=True,
    n_epochs=1,
    save_dir="my_model",
)

# The model is saved automatically into `save_dir` at the end of training;
# this call only demonstrates saving a reader manually.
reader.save(directory="my_model")

# Loading the fine-tuned model again at a later point works like this.
new_reader = FARMReader(model_name_or_path="my_model")
#    ...your workflow with organizations, projects, and users. Labels can be
#    exported in SQuAD format, which is compatible with training in Haystack.
#
# 2. **Feedback**: For production systems, collect training data from direct
#    user feedback via Haystack's REST API. It includes a customizable user
#    feedback API for rating the answers returned by the API, and a feedback
#    export endpoint to obtain that data for further fine-tuning.
#
#
# ## Fine-tune your model
#
# Once training data is available, a base model can be fine-tuned on it.
# A reader is initialized from a base model and trained on a custom dataset
# (which should be in a SQuAD-like format). A base model that was already
# trained on SQuAD or a similar QA dataset is recommended, to benefit from
# transfer learning.

# Both directories are absolute paths on the author's machine; adjust them
# (e.g. to "PATH/TO_YOUR/TRAIN_DATA") before running anywhere else.
training_dir = "/home/bulelani/Desktop/odin/odin/src_new/data/training"
saved_models_dir = "/home/bulelani/Desktop/odin/saved_models"

reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
)
reader.train(
    data_dir=training_dir,
    train_filename="answers.json",
    use_gpu=False,
    n_epochs=1,
    save_dir=saved_models_dir,
)

# The model is saved automatically into `save_dir` at the end of training;
# this call only demonstrates saving a reader manually.
reader.save(directory=saved_models_dir)

# To load the model at a later point, build a new reader from the saved
# directory: FARMReader(model_name_or_path=<saved directory>).
from haystack.reader.farm import FARMReader

# Fine-tune a SQuAD-distilled DistilBERT reader on a local dataset, on CPU.
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False,
)

# Absolute paths on the author's machine; replace `train_data` with
# "PATH/TO_YOUR/TRAIN_DATA" (and the model directory) to run elsewhere.
train_data = "/home/bulelani/Desktop/odin/odin/src_new/data/training"
model_dir = "/home/bulelani/Desktop/odin/my_model"

reader.train(
    data_dir=train_data,
    train_filename="demo.json",
    use_gpu=False,
    n_epochs=100,
    save_dir=model_dir,
)

# The model is saved automatically into `save_dir` at the end of training;
# this call only demonstrates saving a reader manually.
reader.save(directory=model_dir)