def main():
    """Command-line entry point: fine-tune, evaluate, or interactively query a reader.

    Arguments are parsed by docopt from the module docstring. The commands
    ``train``, ``test`` and ``cli`` are checked independently, in that order:

    * ``train`` fine-tunes "distilbert-base-uncased-distilled-squad" on
      ``--train_file_name`` / ``--dev_file_name`` found in ``--data_dir`` and
      saves the result to ``--save_dir``.
    * ``test`` loads the model from ``--save_dir`` and prints the result of
      evaluating it on ``--eval_file_name``.
    * ``cli`` loads the model from ``--save_dir``, reads every ``*.txt`` file
      in ``--data_dir`` as one document, and answers questions typed at the
      prompt until the user presses CTRL-C (or sends end-of-file).
    """
    args = docopt(__doc__)
    data_dir = args["--data_dir"]

    if args["train"]:
        reader = FARMReader(
            model_name_or_path="distilbert-base-uncased-distilled-squad",
            use_gpu=False,
        )
        reader.train(
            data_dir=data_dir,
            train_filename=args["--train_file_name"],
            dev_filename=args["--dev_file_name"],
            use_gpu=False,
            n_epochs=1,
            save_dir=args["--save_dir"],
            dev_split=0.05,
        )

    if args["test"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)
        # The third positional argument is passed through unchanged
        # (presumably the device name -- confirm against eval_on_file).
        print(reader.eval_on_file(data_dir, args["--eval_file_name"], 'cpu'))

    if args["cli"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)

        # One Document per text file. The id is a running counter so that
        # documents from different files never share the same id (the
        # previous per-file enumerate() gave every document the id "0").
        # Sorting makes the id assignment reproducible between runs.
        query_doc_list = []
        for text_file in sorted(glob.glob(data_dir + '/*.txt')):
            with open(text_file, "r") as f:
                context = f.read()
            query_doc_list.append(
                Document(id=str(len(query_doc_list)), text=context)
            )

        try:
            while True:
                question = input("CTRL C to exit >")
                prediction = reader.predict(question, query_doc_list)
                answers = prediction['answers']
                if not answers:
                    # Nothing to index into; report it instead of crashing.
                    print("answer:>>  (no answer found)")
                    print("-------------")
                    continue
                best = answers[0]
                print("answer:>> ", best['answer'])
                print("-----")
                print("context:>> ", best['context'])
                print("-------------")
        except (KeyboardInterrupt, EOFError):
            # The prompt promises CTRL-C exits: leave quietly, no traceback.
            print()
def tutorial2_finetune_a_model_on_your_data():
    """Fine-tune a SQuAD-pretrained reader on custom data, save it, and reload it."""
    # ## Create Training Data
    #
    # Training data can be produced in two ways:
    #
    # 1. **Annotation**: Label your data with the annotation tool
    #    (https://github.com/deepset-ai/haystack#labeling-tool), i.e. highlight the answers to your
    #    questions inside a document. The tool lets you structure the workflow with organizations,
    #    projects, and users. Labels can be exported in SQuAD format, which is compatible with
    #    training in Haystack.
    #
    # 2. **Feedback**: In production systems, training data can be collected from direct user
    #    feedback through Haystack's REST API interface. It includes a customizable user feedback
    #    API for rating the answers returned by the API, and a feedback export endpoint to obtain
    #    that data for further fine-tuning of your model.
    #
    #
    # ## Fine-tune your model
    #
    # With training data at hand, a base model can be fine-tuned. A reader is initialized as the
    # base model and then fine-tuned on the custom dataset (which should be in SQuAD-like format).
    # Starting from a model already trained on SQuAD or a similar QA dataset is recommended, to
    # benefit from transfer-learning effects.
    #
    # **Recommendation**: run training on a GPU. Both `use_gpu` arguments below are set to `True`.
    base_model = "distilbert-base-uncased-distilled-squad"
    # Point this at your own data instead, e.g. "PATH/TO_YOUR/TRAIN_DATA".
    squad_dir = "data/squad20"
    output_dir = "my_model"

    reader = FARMReader(model_name_or_path=base_model, use_gpu=True)
    reader.train(
        data_dir=squad_dir,
        train_filename="dev-v2.0.json",
        use_gpu=True,
        n_epochs=1,
        save_dir=output_dir,
    )

    # Training already saves the model into `save_dir` when it finishes;
    # the reader can nevertheless be saved again by hand:
    reader.save(directory=output_dir)

    # Loading the saved model at a later point works like this:
    reloaded_reader = FARMReader(model_name_or_path=output_dir)
from haystack.indexing.cleaning import clean_wiki_text from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http from haystack.reader.farm import FARMReader from haystack.retriever.tfidf import TfidfRetriever from haystack.utils import print_answers #### TRAINING ############# # Let's take a reader as a base model reader = FARMReader( model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False) # and fine-tune it on your own custom dataset (should be in SQuAD like format) train_data = "PATH/TO_YOUR/TRAIN_DATA" reader.train(data_dir=train_data, train_filename="train.json", use_gpu=False, n_epochs=1) #### Use it (same as in Tutorial 1) ############# ## Indexing & cleaning documents # Let's get the data (Game of thrones articles from wikipedia) doc_dir = "data/article_txt_got" s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip" fetch_archive_from_http(url=s3_url, output_dir=doc_dir) # Init Document store & write docs to it document_store = SQLDocumentStore(url="sqlite:///qa.db") write_documents_to_db(document_store=document_store, document_dir=doc_dir,
# 1. **Annotation**: You can use the annotation tool(https://github.com/deepset-ai/haystack#labeling-tool) to label # your data, i.e. highlighting answers to your questions in a document. The tool supports structuring # your workflow with organizations, projects, and users. The labels can be exported in SQuAD format # that is compatible for training with Haystack. # # 2. **Feedback**: For production systems, you can collect training data from direct user feedback via Haystack's # REST API interface. This includes a customizable user feedback API for providing feedback on the # answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data # for fine-tuning your model further. # # # ## Fine-tune your model # # Once you have collected training data, you can fine-tune your base models. # We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format). # We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer # Learning effects. #**Recommendation: Run training on a GPU. To do so change the `use_gpu` arguments below to `True` reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True) train_data = "data/squad20" # train_data = "PATH/TO_YOUR/TRAIN_DATA" reader.train(data_dir=train_data, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model") # Saving the model happens automatically at the end of training into the `save_dir` you specified # However, you could also save a reader manually again via: reader.save(directory="my_model") # If you want to load it at a later point, just do: new_reader = FARMReader(model_name_or_path="my_model")
from haystack import Finder
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever
from haystack.utils import print_answers

#### TRAINING #############
# Let's take a reader as a base model
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
# and fine-tune it on your own custom dataset (should be in SQuAD like format)
# NOTE: "PATH/TO_YOUR/TRAIN_DATA" is a placeholder; replace it with the directory
# that contains "train.json" before running.
train_data = "PATH/TO_YOUR/TRAIN_DATA"
# No `save_dir` is passed here, so where (or whether) the fine-tuned model is
# written is left to `train()`'s default -- TODO confirm.
reader.train(data_dir=train_data, train_filename="train.json", n_epochs=1)

#### Use it (same as in Tutorial 1) #############

## Indexing & cleaning documents
# Let's get the data (Game of thrones articles from wikipedia)
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Init Document store & write docs to it
# The store is backed by the local SQLite file "qa.db".
document_store = SQLDocumentStore(url="sqlite:///qa.db")
# `clean_wiki_text` is handed over as the cleaning function; `only_empty_db=True`
# presumably skips writing when the store already holds documents -- verify
# against write_documents_to_db.
write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
# your workflow with organizations, projects, and users. The labels can be exported in SQuAD format # that is compatible for training with Haystack. # # 2. **Feedback**: For production systems, you can collect training data from direct user feedback via Haystack's # REST API interface. This includes a customizable user feedback API for providing feedback on the # answer returned by the API. The API provides feedback export endpoint to obtain the feedback data # for fine-tuning your model further. # # # ## Fine-tune your model # # Once you have collected training data, you can fine-tune your base models. # We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format). # We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer # Learning effects. reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False) # train_data = "data" # train_data = "PATH/TO_YOUR/TRAIN_DATA" reader.train(data_dir="/home/bulelani/Desktop/odin/odin/src_new/data/training", train_filename="answers.json", use_gpu=False, n_epochs=1, save_dir="/home/bulelani/Desktop/odin/saved_models") # Saving the model happens automatically at the end of training into the `save_dir` you specified # However, you could also save a reader manually again via: reader.save(directory="/home/bulelani/Desktop/odin/saved_models") # If you want to load it at a later point, just do: # new_reader = FARMReader(model_name_or_path="my_model")
from haystack.reader.farm import FARMReader

# Fine-tunes the distilled SQuAD DistilBERT reader on "demo.json" for 100 epochs
# (without GPU) and stores the resulting model in `model_dir`.

# Directory containing the training file.
# To use other data, point this at e.g. "PATH/TO_YOUR/TRAIN_DATA".
train_data = "/home/bulelani/Desktop/odin/odin/src_new/data/training"
# Directory the fine-tuned model is written to.
model_dir = "/home/bulelani/Desktop/odin/my_model"

reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False,
)
reader.train(
    data_dir=train_data,
    train_filename="demo.json",
    use_gpu=False,
    n_epochs=100,
    save_dir=model_dir,
)

# Training already stores the model in `save_dir` once it finishes;
# the explicit save below writes it to the same directory again.
reader.save(directory=model_dir)
from haystack.reader.farm import FARMReader

# Fine-tunes the distilled SQuAD DistilBERT reader on the labels in
# "answers.json" and reports completion on stdout.

# Base model to start from (no GPU).
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False,
)

# Where the labels file answers.json is read from.
train_data = "/usr/src/app/data/squad20"
# Where the trained model is written to.
train_model = "/usr/src/app/data/train_model"

# 20 epochs, with `dev_split=0`.
train_options = {
    "data_dir": train_data,
    "train_filename": "answers.json",
    "n_epochs": 20,
    "dev_split": 0,
    "save_dir": train_model,
}
reader.train(**train_options)

print('Training successfully completed')