class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline.

    Parameters
    ----------
    reader: str (path to .joblib) or .joblib object of an instance of BertQA
        (BERT model with sklearn wrapper), optional
    retriever: "bm25" or "tfidf"
        The type of retriever
    retrieve_by_doc: bool (default: False)
        Whether the retriever ranks whole documents or individual paragraphs.
    kwargs: kwargs for BertQA(), BertProcessor(), TfidfRetriever() and BM25Retriever()
        Please check documentation for these classes

    Examples
    --------
    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit_retriever(df=df)
    >>> prediction = qa_pipeline.predict(query='When BNP Paribas was created?')

    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit_retriever(df=df)
    >>> prediction = qa_pipeline.predict(query='When BNP Paribas was created?')
    """

    def __init__(self, reader=None, retriever="bm25", retrieve_by_doc=False, **kwargs):

        if retriever not in RETRIEVERS:
            raise ValueError(
                "You provided a type of retriever that is not supported. "
                + "Please provide a retriever in the following list: "
                + str(list(RETRIEVERS.keys())))

        retriever_class = RETRIEVERS[retriever]

        # Separate kwargs by destination class
        kwargs_bertqa = {
            key: value
            for key, value in kwargs.items()
            if key in BertQA.__init__.__code__.co_varnames
        }

        kwargs_processor = {
            key: value
            for key, value in kwargs.items()
            if key in BertProcessor.__init__.__code__.co_varnames
        }

        kwargs_retriever = {
            key: value
            for key, value in kwargs.items()
            if key in retriever_class.__init__.__code__.co_varnames
        }

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True, **kwargs_processor)
        self.processor_predict = BertProcessor(is_training=False, **kwargs_processor)
        self.retriever = retriever_class(**kwargs_retriever)
        self.retrieve_by_doc = retrieve_by_doc

        if torch.cuda.is_available():
            self.cuda()

    def fit_retriever(self, df: pd.DataFrame = None):
        """
        Fit the QAPipeline retriever to a corpus of documents in a dataframe.

        Parameters
        ----------
        df: pandas.DataFrame
            Dataframe with the following columns: "title", "paragraphs"
        """

        if self.retrieve_by_doc:
            self.metadata = df
            self.metadata["content"] = self.metadata["paragraphs"].apply(
                lambda x: " ".join(x))
        else:
            self.metadata = self._expand_paragraphs(df)

        self.retriever.fit(self.metadata)
        return self

    def fit_reader(self, data=None):
        """
        Fit the QAPipeline reader to an annotated dataset.

        Parameters
        ----------
        data: dict or str (path to json file)
            Annotated dataset in SQuAD-like format for Reader training
        """

        train_examples, train_features = self.processor_train.fit_transform(data)
        self.reader.fit(X=(train_examples, train_features))
        return self

    def predict(
        self,
        query: str = None,
        n_predictions: int = None,
        retriever_score_weight: float = 0.35,
        return_all_preds: bool = False,
    ):
        """
        Compute the prediction of an answer to a question.

        Parameters
        ----------
        query: str
            Question to perform a prediction on
        n_predictions: int or None (default: None)
            Number of returned predictions. If None, only one prediction is returned.
        retriever_score_weight: float (default: 0.35)
            Weight of the retriever score in the final score used for prediction.
            Given the retriever score and the reader's average of start and end
            logits, the final score used for ranking is:

            final_score = retriever_score_weight * retriever_score
                          + (1 - retriever_score_weight) * reader_avg_logit

        return_all_preds: boolean (default: False)
            Whether to return the list of all predictions made by the Reader or not

        Returns
        -------
        if return_all_preds is False:
            prediction: tuple (answer, title, paragraph, score/logit)

        if return_all_preds is True:
            List of dictionaries with all metadata of all answers output by the
            Reader given the question.
        """

        if not isinstance(query, str):
            raise TypeError(
                "The input is not a string. Please provide a string as input.")
        if not (n_predictions is None
                or (isinstance(n_predictions, int) and n_predictions > 0)):
            raise TypeError("n_predictions should be a positive integer or None")

        best_idx_scores = self.retriever.predict(query)
        squad_examples = generate_squad_examples(
            question=query,
            best_idx_scores=best_idx_scores,
            metadata=self.metadata,
            retrieve_by_doc=self.retrieve_by_doc,
        )
        examples, features = self.processor_predict.fit_transform(X=squad_examples)
        prediction = self.reader.predict(
            X=(examples, features),
            n_predictions=n_predictions,
            retriever_score_weight=retriever_score_weight,
            return_all_preds=return_all_preds,
        )
        return prediction

    def to(self, device):
        """ Send reader to CPU if device=='cpu' or to GPU if device=='cuda' """
        if device not in ("cpu", "cuda"):
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")
        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        """ Send reader to CPU """
        self.reader.model.cpu()
        self.reader.device = torch.device("cpu")
        return self

    def cuda(self):
        """ Send reader to GPU """
        self.reader.model.cuda()
        self.reader.device = torch.device("cuda")
        return self

    def dump_reader(self, filename):
        """ Dump reader model to a .joblib object """
        joblib.dump(self.reader, filename)

    @staticmethod
    def _expand_paragraphs(df):
        # Snippet taken from: https://stackoverflow.com/a/48532692/11514226
        lst_col = "paragraphs"
        df = pd.DataFrame({
            col: np.repeat(df[col].values, df[lst_col].str.len())
            for col in df.columns.drop(lst_col)
        }).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]
        df["content"] = df["paragraphs"]
        return df.drop("paragraphs", axis=1)
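# --- Usage sketch (not part of the class above) ----------------------------------
# A minimal example of driving the pipeline defined above, assuming a pre-trained
# reader .joblib is available locally (the file name is the one used in the class
# docstring) and using a tiny illustrative corpus; output file names are arbitrary.
import pandas as pd
from cdqa.pipeline import QAPipeline

df = pd.DataFrame({
    "title": ["BNP Paribas"],
    "paragraphs": [[
        "BNP Paribas was created in 2000 from the merger of BNP and Paribas.",
        "The bank is headquartered in Paris.",
    ]],
})

qa_pipeline = QAPipeline(reader="bert_qa_squad_vCPU-sklearn.joblib")
qa_pipeline.fit_retriever(df=df)

# Single best answer: (answer, title, paragraph, score/logit)
prediction = qa_pipeline.predict(query="When BNP Paribas was created?")

# Top-2 answers instead of one
top2 = qa_pipeline.predict(query="When BNP Paribas was created?", n_predictions=2)

# Persist the reader for later reuse
qa_pipeline.dump_reader("bert_qa_reader.joblib")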
import os

import joblib
import torch

from cdqa.pipeline import QAPipeline
from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_model, download_squad

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print('Available devices ', torch.cuda.device_count())
# print('Current cuda device ', torch.cuda.current_device())
# print(torch.cuda.get_device_name(device))

train_processor = BertProcessor(bert_model="bert-base-multilingual-cased",
                                do_lower_case=False,
                                is_training=True,
                                max_seq_length=384)
train_examples, train_features = train_processor.fit_transform(
    X='./dataset/KorQuAD_v1.0_train.json')
# eval_examples, eval_features = train_processor.transform(X='./dataset/KorQuAD_v1.0_dev.json')

reader = BertQA(
    # multilingual-cased is required for Korean to train properly
    # (F1 score was poor without it)
    bert_model='bert-base-multilingual-cased',
    train_batch_size=12,
    learning_rate=3e-5,
    num_train_epochs=5,
    do_lower_case=False,
    output_dir='models')

reader.fit(X=(train_examples, train_features))
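# --- Optional continuation (a sketch, not part of the original script) -----------
# The QAPipeline import at the top is otherwise unused; the trained reader object
# can be passed to it directly. The tiny Korean corpus and question below are
# purely illustrative, and the keyword names (df=, query=) follow the newer
# QAPipeline API shown elsewhere in this collection (older versions use X=).
import pandas as pd

df = pd.DataFrame({
    "title": ["서울"],
    "paragraphs": [["서울은 대한민국의 수도이다."]],  # "Seoul is the capital of South Korea."
})

qa_pipeline = QAPipeline(reader=reader)
qa_pipeline.fit_retriever(df=df)
prediction = qa_pipeline.predict(query="대한민국의 수도는 어디인가?")  # "What is the capital of South Korea?"
print(prediction)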
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 2 10:11:58 2019

@author: Chacrew
"""
# Requires the cdqa package: pip install cdqa

import os
import torch
import joblib

from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_squad

train_processor = BertProcessor(do_lower_case=True, is_training=True, n_jobs=-1)
train_examples, train_features = train_processor.fit_transform(
    X='KorQuAD_v1.0_train.json')

reader = BertQA(train_batch_size=12,
                learning_rate=3e-5,
                num_train_epochs=2,
                do_lower_case=True,
                output_dir='/save')

reader.fit(X=(train_examples, train_features))

reader.model.to('cpu')
reader.device = torch.device('cpu')
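# Possible next step (a sketch, not in the original script): persist the CPU reader
# with joblib so it can later be loaded into a QAPipeline; the file name is
# illustrative, and reader.output_dir is the '/save' directory set above.
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa_korquad.joblib'))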
import os
import time

import joblib
import torch

from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_squad

start_time = time.time()

print(" \n\n Download SQuAD datasets")
download_squad(dir='./data')
print("--- %s seconds ---" % (time.time() - start_time))

print(" \n\n Preprocess SQuAD 1.1 examples")
train_processor = BertProcessor(do_lower_case=True, is_training=True)
# train_examples, train_features = train_processor.fit_transform(X='./data/SQuAD_1.1/train-v1.1.json')
train_examples, train_features = train_processor.fit_transform(
    X='./data/irs/train-v1.1.json')
print("--- %s seconds ---" % (time.time() - start_time))

print(" \n\n Train the model")
reader = BertQA(
    train_batch_size=1,  # train_batch_size=12,
    learning_rate=3e-5,
    num_train_epochs=2,
    do_lower_case=True,
    output_dir='models')
reader.fit(X=(train_examples, train_features))
print("--- %s seconds ---" % (time.time() - start_time))
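# --- Optional continuation (a sketch, not part of the original script) -----------
# Move the trained reader to CPU and dump it with joblib, mirroring the pattern of
# the other training scripts in this collection, and time the step like the rest
# of this script does. The output file name is illustrative.
print(" \n\n Save the model")
reader.model.to('cpu')
reader.device = torch.device('cpu')
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa.joblib'))
print("--- %s seconds ---" % (time.time() - start_time))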
class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline.

    Parameters
    ----------
    reader: str (path to .joblib) or .joblib object of an instance of BertQA
        (BERT model with sklearn wrapper), optional
    retriever: "bm25" or "tfidf"
        The type of retriever
    retrieve_by_doc: bool (default: False)
        Whether the retriever ranks whole documents or individual paragraphs.
    kwargs: kwargs for BertQA(), BertProcessor(), TfidfRetriever() and BM25Retriever()
        Please check documentation for these classes

    Examples
    --------
    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')
    """

    def __init__(self, reader=None, retriever="bm25", retrieve_by_doc=False, **kwargs):

        if retriever not in RETRIEVERS:
            raise ValueError(
                "You provided a type of retriever that is not supported. "
                + "Please provide a retriever in the following list: "
                + str(list(RETRIEVERS.keys())))

        retriever_class = RETRIEVERS[retriever]

        # Separate kwargs by destination class
        kwargs_bertqa = {
            key: value
            for key, value in kwargs.items()
            if key in BertQA.__init__.__code__.co_varnames
        }

        kwargs_processor = {
            key: value
            for key, value in kwargs.items()
            if key in BertProcessor.__init__.__code__.co_varnames
        }

        kwargs_retriever = {
            key: value
            for key, value in kwargs.items()
            if key in retriever_class.__init__.__code__.co_varnames
        }

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True, **kwargs_processor)
        self.processor_predict = BertProcessor(is_training=False, **kwargs_processor)
        self.retriever = retriever_class(**kwargs_retriever)
        self.retrieve_by_doc = retrieve_by_doc

    def fit_retriever(self, X=None, y=None):
        """
        Fit the QAPipeline retriever to a corpus of documents in a dataframe.

        Parameters
        ----------
        X: pandas.DataFrame
            Dataframe with the following columns: "title", "paragraphs"
        """

        if self.retrieve_by_doc:
            self.metadata = X
            self.metadata["content"] = self.metadata["paragraphs"].apply(
                lambda x: " ".join(x))
        else:
            self.metadata = self._expand_paragraphs(X)

        self.retriever.fit(self.metadata["content"])
        return self

    def fit_reader(self, X=None, y=None):
        """
        Fit the QAPipeline reader to an annotated dataset.

        Parameters
        ----------
        X: dict or str (path to json file)
            Annotated dataset in SQuAD-like format for Reader training
        """

        train_examples, train_features = self.processor_train.fit_transform(X)
        self.reader.fit(X=(train_examples, train_features))
        return self

    def predict(self, X=None, return_logit=False, n_predictions=None):
        """
        Compute the prediction of an answer to a question.

        Parameters
        ----------
        X: str or list of strings
            Sample (question) or list of samples to perform a prediction on
        return_logit: boolean (default: False)
            Whether to return the logit of the best answer or not.
        n_predictions: int or None (default: None)
            Number of returned predictions. If None, only one prediction is returned.

        Returns
        -------
        If X is a str
            prediction: tuple (answer, title, paragraph)
        If X is a list of strings
            predictions: list of tuples (answer, title, paragraph)

        If return_logit is True, each prediction tuple will have the following
        structure: (answer, title, paragraph, best logit)
        """

        if isinstance(X, str):
            closest_docs_indices = self.retriever.predict(X, metadata=self.metadata)
            squad_examples = generate_squad_examples(
                question=X,
                closest_docs_indices=closest_docs_indices,
                metadata=self.metadata,
                retrieve_by_doc=self.retrieve_by_doc,
            )
            examples, features = self.processor_predict.fit_transform(X=squad_examples)
            prediction = self.reader.predict((examples, features),
                                             return_logit, n_predictions)
            return prediction

        elif isinstance(X, list):
            predictions = []
            for query in X:
                closest_docs_indices = self.retriever.predict(
                    query, metadata=self.metadata)
                squad_examples = generate_squad_examples(
                    question=query,
                    closest_docs_indices=closest_docs_indices,
                    metadata=self.metadata,
                    retrieve_by_doc=self.retrieve_by_doc,
                )
                examples, features = self.processor_predict.fit_transform(
                    X=squad_examples)
                pred = self.reader.predict((examples, features),
                                           return_logit, n_predictions)
                predictions.append(pred)
            return predictions

        else:
            raise TypeError(
                "The input is not a string or a list. "
                "Please provide a string or a list of strings as input")

    def to(self, device):
        """ Send reader to CPU if device=='cpu' or to GPU if device=='cuda' """
        if device not in ("cpu", "cuda"):
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")
        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        """ Send reader to CPU """
        self.reader.model.cpu()
        self.reader.device = torch.device("cpu")
        return self

    def cuda(self):
        """ Send reader to GPU """
        self.reader.model.cuda()
        self.reader.device = torch.device("cuda")
        return self

    def dump_reader(self, filename):
        """ Dump reader model to a .joblib object """
        joblib.dump(self.reader, filename)

    @staticmethod
    def _expand_paragraphs(df):
        # Snippet taken from: https://stackoverflow.com/a/48532692/11514226
        lst_col = "paragraphs"
        df = pd.DataFrame({
            col: np.repeat(df[col].values, df[lst_col].str.len())
            for col in df.columns.drop(lst_col)
        }).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]
        df["content"] = df["paragraphs"]
        return df.drop("paragraphs", axis=1)
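# --- Usage sketch for this (older) API (not part of the class above) -------------
# Assumes a pre-trained reader .joblib is available locally (file name taken from
# the class docstring) and uses a tiny illustrative corpus. Note that this version
# takes X= for both fit_retriever() and predict(), and predict() also accepts a
# list of questions.
import pandas as pd
from cdqa.pipeline import QAPipeline

df = pd.DataFrame({
    "title": ["BNP Paribas"],
    "paragraphs": [["BNP Paribas was created in 2000 from the merger of BNP and Paribas."]],
})

qa_pipeline = QAPipeline(reader="bert_qa_squad_vCPU-sklearn.joblib")
qa_pipeline.fit_retriever(X=df)

# Single question -> (answer, title, paragraph); with return_logit=True the best
# logit is appended to the tuple.
single = qa_pipeline.predict(X="When BNP Paribas was created?", return_logit=True)

# List of questions -> list of prediction tuples
batch = qa_pipeline.predict(X=[
    "When BNP Paribas was created?",
    "What was merged to create BNP Paribas?",
])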
import os
import torch
import joblib

from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_squad

dataroot = "D:/datasets/sqad/"
download_squad(dir=dataroot)

train_processor = BertProcessor(do_lower_case=True, is_training=True)
train_examples, train_features = train_processor.fit_transform(
    X=os.path.join(dataroot, "SQuAD_1.1", "train-v1.1.json"))

reader = BertQA(train_batch_size=12,
                learning_rate=3e-5,
                num_train_epochs=2,
                do_lower_case=True,
                output_dir='models')

reader.fit(X=(train_examples, train_features))

reader.model.to('cpu')
reader.device = torch.device('cpu')

joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa.joblib'))
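# Quick sanity check (a sketch, not in the original script): the dumped reader can
# be reloaded with joblib and either used directly or passed to QAPipeline(reader=...)
# as shown in the pipeline docstrings above.
reloaded = joblib.load(os.path.join(reader.output_dir, 'bert_qa.joblib'))
print(type(reloaded).__name__, reloaded.device)  # expected: BertQA cpu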