Python FARMReader.train Exemples, haystack.reader.farm.FARMReader.train Python Exemples

Exemple #1

0

Afficher le fichier

def main():
    """Run the sub-command (train / test / cli) selected on the command line.

    Options are parsed by docopt from the module docstring.

    * ``train`` fine-tunes a DistilBERT SQuAD reader on the files found in
      ``--data_dir`` and passes ``--save_dir`` to the trainer as save location.
    * ``test`` loads the reader from ``--save_dir`` and prints the result of
      evaluating it on ``--eval_file_name``.
    * ``cli`` loads every ``*.txt`` file in ``--data_dir`` as one document and
      answers questions typed at the prompt until the user presses CTRL-C
      (or closes stdin).
    """
    args = docopt(__doc__)
    data_dir = args["--data_dir"]

    if args["train"]:
        reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
        reader.train(
            data_dir=data_dir,
            train_filename=args["--train_file_name"],
            dev_filename=args["--dev_file_name"],
            use_gpu=False,
            n_epochs=1,
            save_dir=args["--save_dir"],
            dev_split=0.05,
        )

    if args["test"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)
        print(reader.eval_on_file(data_dir, args["--eval_file_name"], 'cpu'))

    if args["cli"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)
        query_doc_list = []
        # One Document per text file. The id comes from the file's position in
        # the (sorted, hence reproducible) listing so that every document gets
        # a distinct id instead of all of them sharing "0".
        for doc_id, text_file in enumerate(sorted(glob.glob(data_dir + '/*.txt'))):
            with open(text_file, "r") as f:
                query_doc_list.append(Document(id=str(doc_id), text=f.read()))

        while True:
            try:
                question = input("CTRL C to exit >")
            except (KeyboardInterrupt, EOFError):
                # The prompt advertises CTRL-C as the way out: leave quietly
                # rather than dumping a traceback on the user.
                print()
                break
            prediction = reader.predict(question, query_doc_list)
            answers = prediction['answers']
            if not answers:
                # Nothing to index into; report it and keep the session alive.
                print("answer:>> ", "(no answer found)")
                print("-------------")
                continue
            best = answers[0]
            print("answer:>> ", best['answer'])
            print("-----")
            print("context:>> ", best['context'])
            print("-------------")

Exemple #2

0

Afficher le fichier

Fichier : Tutorial2_Finetune_a_model_on_your_data.py Projet : venuraja79/haystack

def tutorial2_finetune_a_model_on_your_data():
    """Fine-tune a FARMReader on SQuAD-format data, save it, and reload it."""
    # ## Create Training Data
    #
    # There are two ways to generate training data
    #
    # 1. **Annotation**: You can use the annotation tool (https://github.com/deepset-ai/haystack#labeling-tool) to label
    # your data, i.e. highlighting answers to your questions in a document. The tool supports structuring
    # your workflow with organizations, projects, and users. The labels can be exported in SQuAD format
    # that is compatible for training with Haystack.
    #
    # 2. **Feedback**: For production systems, you can collect training data from direct user feedback via Haystack's
    # REST API interface. This includes a customizable user feedback API for providing feedback on the
    # answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data
    # for fine-tuning your model further.
    #
    #
    # ## Fine-tune your model
    #
    # Once you have collected training data, you can fine-tune your base models.
    # We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format).
    # We recommend using a base model that was already trained on SQuAD or a similar QA dataset, to benefit from
    # transfer-learning effects.

    # **Recommendation**: Run training on a GPU. Both `use_gpu` arguments below are already set to `True`.

    reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        use_gpu=True,
    )
    train_data = "data/squad20"
    # train_data = "PATH/TO_YOUR/TRAIN_DATA"
    reader.train(
        data_dir=train_data,
        train_filename="dev-v2.0.json",
        use_gpu=True,
        n_epochs=1,
        save_dir="my_model",
    )

    # Saving the model happens automatically at the end of training into the `save_dir` you specified
    # However, you could also save a reader manually again via:
    reader.save(directory="my_model")

    # If you want to load it at a later point, just do:
    new_reader = FARMReader(model_name_or_path="my_model")

Exemple #3

0

Afficher le fichier

Fichier : Tutorial2_Finetune_a_model_on_your_data.py Projet : zxlzr/haystack

# SQLDocumentStore is instantiated below, so it has to be imported as well.
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever
from haystack.utils import print_answers

#### TRAINING #############
# Let's take a reader as a base model
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False)

# and fine-tune it on your own custom dataset (should be in SQuAD-like format).
# `train_data` is a placeholder: point it at the directory holding train.json.
train_data = "PATH/TO_YOUR/TRAIN_DATA"
reader.train(data_dir=train_data,
             train_filename="train.json",
             use_gpu=False,
             n_epochs=1)

#### Use it (same as in Tutorial 1) #############

## Indexing & cleaning documents

# Let's get the data (Game of Thrones articles from Wikipedia)
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Init Document store & write docs to it
document_store = SQLDocumentStore(url="sqlite:///qa.db")
write_documents_to_db(document_store=document_store,
                      document_dir=doc_dir,

Exemple #4

0

Afficher le fichier

Fichier : Tutorial2_Finetune_a_model_on_your_data.py Projet : tcapilla/nttextractiveqa

# 1. **Annotation**: You can use the annotation tool(https://github.com/deepset-ai/haystack#labeling-tool) to label
# your data, i.e. highlighting answers to your questions in a document. The tool supports structuring
# your workflow with organizations, projects, and users. The labels can be exported in SQuAD format
# that is compatible for training with Haystack.
#
# 2. **Feedback**: For production systems, you can collect training data from direct user feedback via Haystack's
# REST API interface. This includes a customizable user feedback API for providing feedback on the
# answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data
# for fine-tuning your model further.
#
#
# ## Fine-tune your model
#
# Once you have collected training data, you can fine-tune your base models.
# We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format).
# We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer
# Learning effects.

# **Recommendation**: Run training on a GPU. To do so, keep the `use_gpu` arguments below set to `True`.

# Base reader that will be fine-tuned.
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=True,
)

# Directory containing the training file.
# For your own data, use "PATH/TO_YOUR/TRAIN_DATA" here instead.
train_data = "data/squad20"
reader.train(
    data_dir=train_data,
    train_filename="dev-v2.0.json",
    use_gpu=True,
    n_epochs=1,
    save_dir="my_model",
)

# The model is saved automatically into `save_dir` once training ends;
# an explicit save to the same directory is shown here as well.
reader.save(directory="my_model")

# Load the fine-tuned model again later from that directory.
new_reader = FARMReader(model_name_or_path="my_model")

Exemple #5

0

Afficher le fichier

from haystack import Finder
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever
from haystack.utils import print_answers

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# Base reader that will be fine-tuned.
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    use_gpu=False,
)

# Fine-tune on a custom dataset (expected in SQuAD-like format).
# `train_data` is a placeholder for the directory that holds train.json.
train_data = "PATH/TO_YOUR/TRAIN_DATA"
reader.train(
    data_dir=train_data,
    train_filename="train.json",
    n_epochs=1,
)


# ---------------------------------------------------------------------------
# Usage (same as in Tutorial 1): indexing & cleaning documents
# ---------------------------------------------------------------------------
# Download the Game of Thrones Wikipedia articles into `doc_dir`.
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)


# Create the SQLite-backed document store and write the downloaded files to it.
document_store = SQLDocumentStore(url="sqlite:///qa.db")
write_documents_to_db(
    document_store=document_store,
    document_dir=doc_dir,
    clean_func=clean_wiki_text,
    only_empty_db=True,
)

Exemple #6

0

Afficher le fichier

# your workflow with organizations, projects, and users. The labels can be exported in SQuAD format
# that is compatible for training with Haystack.
#
# 2. **Feedback**: For production systems, you can collect training data from direct user feedback via Haystack's
# REST API interface. This includes a customizable user feedback API for providing feedback on the
# answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data
# for fine-tuning your model further.
#
#
# ## Fine-tune your model
#
# Once you have collected training data, you can fine-tune your base models.
# We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format).
# We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer
# Learning effects.
# Base reader that will be fine-tuned.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
)

# Training reads answers.json from the data directory below and writes the
# resulting model to `save_dir`. Both are machine-specific absolute paths;
# replace them with your own locations (e.g. "PATH/TO_YOUR/TRAIN_DATA").
reader.train(
    data_dir="/home/bulelani/Desktop/odin/odin/src_new/data/training",
    train_filename="answers.json",
    use_gpu=False,
    n_epochs=1,
    save_dir="/home/bulelani/Desktop/odin/saved_models",
)

# The model is saved automatically into `save_dir` once training ends;
# an explicit save to the same directory is shown here as well.
reader.save(directory="/home/bulelani/Desktop/odin/saved_models")

# To load the model at a later point, construct a new FARMReader with
# model_name_or_path set to the directory it was saved in.

Exemple #7

0

Afficher le fichier

from haystack.reader.farm import FARMReader

# Base reader that will be fine-tuned.
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)

# Directory holding demo.json. This is a machine-specific absolute path;
# replace it with "PATH/TO_YOUR/TRAIN_DATA" for your own setup.
train_data = "/home/bulelani/Desktop/odin/odin/src_new/data/training"
reader.train(
    data_dir=train_data,
    train_filename="demo.json",
    use_gpu=False,
    n_epochs=100,
    save_dir="/home/bulelani/Desktop/odin/my_model",
)

# The model is saved automatically into `save_dir` once training ends;
# an explicit save to the same directory is shown here as well.
reader.save(directory="/home/bulelani/Desktop/odin/my_model")

Exemple #8

0

Afficher le fichier

from haystack.reader.farm import FARMReader

# Directory that contains the labelled answers.json file (training input).
train_data = "/usr/src/app/data/squad20"
# Directory the trained model is written to (training output).
train_model = "/usr/src/app/data/train_model"

# Base reader that will be fine-tuned, running on CPU.
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)

# Fine-tune for 20 epochs; dev_split=0 is passed so no dev split is requested.
reader.train(
    data_dir=train_data,
    train_filename="answers.json",
    n_epochs=20,
    dev_split=0,
    save_dir=train_model,
)

print('Training successfully completed')