Example #1
    def __init__(self,
                 reader=None,
                 retriever="bm25",
                 retrieve_by_doc=False,
                 **kwargs):

        if retriever not in RETRIEVERS:
            raise ValueError(
                "You provided a type of retriever that is not supported. " +
                "Please provide a retriver in the following list: " +
                str(list(RETRIEVERS.keys())))

        retriever_class = RETRIEVERS[retriever]

        # Separating kwargs
        kwargs_bertqa = {
            key: value
            for key, value in kwargs.items()
            if key in BertQA.__init__.__code__.co_varnames
        }

        kwargs_processor = {
            key: value
            for key, value in kwargs.items()
            if key in BertProcessor.__init__.__code__.co_varnames
        }

        kwargs_retriever = {
            key: value
            for key, value in kwargs.items()
            if key in retriever_class.__init__.__code__.co_varnames
        }

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True,
                                             **kwargs_processor)

        self.processor_predict = BertProcessor(is_training=False,
                                               **kwargs_processor)

        self.retriever = retriever_class(**kwargs_retriever)

        self.retrieve_by_doc = retrieve_by_doc

        if torch.cuda.is_available():
            self.cuda()
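
The constructor above routes one shared **kwargs dict to three different components by inspecting each class's __init__.__code__.co_varnames. A minimal, self-contained sketch of that pattern, using toy stand-in classes rather than the real cdQA ones:

# Toy sketch of the kwargs-routing idiom used in QAPipeline.__init__.
# `Tokenizer` and `Index` are illustrative stand-ins, not cdQA classes.
class Tokenizer:
    def __init__(self, do_lower_case=True, max_seq_length=384):
        self.do_lower_case = do_lower_case
        self.max_seq_length = max_seq_length

class Index:
    def __init__(self, top_n=10):
        self.top_n = top_n

def split_kwargs(cls, kwargs):
    # co_varnames lists the parameter names of cls.__init__, so only the
    # keyword arguments that __init__ accepts are kept for that class.
    return {k: v for k, v in kwargs.items()
            if k in cls.__init__.__code__.co_varnames}

kwargs = {"do_lower_case": False, "top_n": 5, "unused_option": 1}
tokenizer = Tokenizer(**split_kwargs(Tokenizer, kwargs))
index = Index(**split_kwargs(Index, kwargs))
print(tokenizer.do_lower_case, index.top_n)  # False 5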
Example #2
class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline

    Parameters
    ----------
    reader: str (path to a .joblib file) or an instance of BertQA (BERT model with a scikit-learn wrapper), optional

    retriever: "bm25" or "tfidf"
        The type of retriever

    retrieve_by_doc: bool (default: False)
        Whether the retriever ranks by documents (True) or by paragraphs (False).

    kwargs: kwargs for BertQA(), BertProcessor(), TfidfRetriever() and BM25Retriever()
        Please check the documentation of these classes

    Examples
    --------
    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    """
    def __init__(self,
                 reader=None,
                 retriever="bm25",
                 retrieve_by_doc=False,
                 **kwargs):

        if retriever not in RETRIEVERS:
            raise ValueError(
                "You provided a type of retriever that is not supported. " +
                "Please provide a retriver in the following list: " +
                str(list(RETRIEVERS.keys())))

        retriever_class = RETRIEVERS[retriever]

        # Separating kwargs
        kwargs_bertqa = {
            key: value
            for key, value in kwargs.items()
            if key in BertQA.__init__.__code__.co_varnames
        }

        kwargs_processor = {
            key: value
            for key, value in kwargs.items()
            if key in BertProcessor.__init__.__code__.co_varnames
        }

        kwargs_retriever = {
            key: value
            for key, value in kwargs.items()
            if key in retriever_class.__init__.__code__.co_varnames
        }

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True,
                                             **kwargs_processor)

        self.processor_predict = BertProcessor(is_training=False,
                                               **kwargs_processor)

        self.retriever = retriever_class(**kwargs_retriever)

        self.retrieve_by_doc = retrieve_by_doc

        if torch.cuda.is_available():
            self.cuda()

    def fit_retriever(self, df: pd.DataFrame = None):
        """ Fit the QAPipeline retriever to a list of documents in a dataframe.
        Parameters
        ----------
        df: pandas.Dataframe
            Dataframe with the following columns: "title", "paragraphs"
        """

        if self.retrieve_by_doc:
            self.metadata = df
            self.metadata["content"] = self.metadata["paragraphs"].apply(
                lambda x: " ".join(x))
        else:
            self.metadata = self._expand_paragraphs(df)

        self.retriever.fit(self.metadata)

        return self

    def fit_reader(self, data=None):
        """ Fit the QAPipeline retriever to a list of documents in a dataframe.

        Parameters
        ----------
        data: dict or str (path to a JSON file)
            Annotated dataset in SQuAD-like format for Reader training

        """

        train_examples, train_features = self.processor_train.fit_transform(
            data)
        self.reader.fit(X=(train_examples, train_features))

        return self

    def predict(
        self,
        query: str = None,
        n_predictions: int = None,
        retriever_score_weight: float = 0.35,
        return_all_preds: bool = False,
    ):
        """ Compute prediction of an answer to a question

        Parameters
        ----------
        query: str
            Sample (question) to perform a prediction on

        n_predictions: int or None (default: None)
            Number of returned predictions. If None, only one prediction is returned

        retriever_score_weight: float (default: 0.35).
            The weight of retriever score in the final score used for prediction.
            Given retriever score and reader average of start and end logits, the final score used for ranking is:

            final_score = retriever_score_weight * retriever_score + (1 - retriever_score_weight) * (reader_avg_logit)

        return_all_preds: boolean (default: False)
            Whether to return the list of all predictions made by the Reader or not

        Returns
        -------
        if return_all_preds is False:
            prediction: tuple (answer, title, paragraph, score/logit)

        if return_all_preds is True:
            predictions: list of dictionaries with all metadata of all answers output by
            the Reader given the question.

        """

        if not isinstance(query, str):
            raise TypeError(
                "The input is not a string. Please provide a string as input.")
        if not (n_predictions is None or
                (isinstance(n_predictions, int) and n_predictions >= 1)):
            raise TypeError(
                "n_predictions should be a positive integer or None")
        best_idx_scores = self.retriever.predict(query)
        squad_examples = generate_squad_examples(
            question=query,
            best_idx_scores=best_idx_scores,
            metadata=self.metadata,
            retrieve_by_doc=self.retrieve_by_doc,
        )
        examples, features = self.processor_predict.fit_transform(
            X=squad_examples)
        prediction = self.reader.predict(
            X=(examples, features),
            n_predictions=n_predictions,
            retriever_score_weight=retriever_score_weight,
            return_all_preds=return_all_preds,
        )
        return prediction

    def to(self, device):
        """ Send reader to CPU if device=='cpu' or to GPU if device=='cuda'
        """
        if device not in ("cpu", "cuda"):
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")

        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        """ Send reader to CPU
        """
        self.reader.model.cpu()
        self.reader.device = torch.device("cpu")
        return self

    def cuda(self):
        """ Send reader to GPU
        """
        self.reader.model.cuda()
        self.reader.device = torch.device("cuda")
        return self

    def dump_reader(self, filename):
        """ Dump reader model to a .joblib object
        """
        joblib.dump(self.reader, filename)

    @staticmethod
    def _expand_paragraphs(df):
        # Snippet taken from: https://stackoverflow.com/a/48532692/11514226
        lst_col = "paragraphs"
        df = pd.DataFrame({
            col: np.repeat(df[col].values, df[lst_col].str.len())
            for col in df.columns.drop(lst_col)
        }).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]
        df["content"] = df["paragraphs"]
        return df.drop("paragraphs", axis=1)
Example #3
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model
import torch
import joblib
import os

from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_squad

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print ('Available devices ', torch.cuda.device_count())
# print ('Current cuda device ', torch.cuda.current_device())
# print(torch.cuda.get_device_name(device))

train_processor = BertProcessor(bert_model="bert-base-multilingual-cased",
                                do_lower_case=False,
                                is_training=True,
                                max_seq_length=384)
train_examples, train_features = train_processor.fit_transform(
    X='./dataset/KorQuAD_v1.0_train.json')
# eval_examples, eval_features = train_processor.transform(X='./dataset/KorQuAD_v1.0_dev.json')

reader = BertQA(
    # Must use the multilingual-cased model for Korean to train well
    # (without it, the F1 score did not come out).
    bert_model='bert-base-multilingual-cased',
    train_batch_size=12,
    learning_rate=3e-5,
    num_train_epochs=5,
    do_lower_case=False,
    output_dir='models')

reader.fit(X=(train_examples, train_features))
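
A plausible follow-up, mirroring Examples 4 and 7: move the trained reader to CPU, persist it with joblib, and plug it into a QAPipeline. The file name and the corpus DataFrame df are placeholders, and the keyword expected by predict (query= or X=) depends on the cdQA version:

# Persist the trained Korean reader and reuse it in the full pipeline.
# The file name and the DataFrame `df` are placeholders for illustration.
reader.model.to('cpu')
reader.device = torch.device('cpu')
reader_path = os.path.join(reader.output_dir, 'bert_qa_korquad.joblib')
joblib.dump(reader, reader_path)

qa_pipeline = QAPipeline(reader=reader_path)
# qa_pipeline.fit_retriever(df)  # df: DataFrame with "title" and "paragraphs" columns
# prediction = qa_pipeline.predict(query='...')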
Example #4
# -*- coding: utf-8 -*-
"""
Created on Wed Oct  2 10:11:58 2019

@author: Chacrew
"""
# pip install cdqa

import os
import torch
import joblib
from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_squad


train_processor = BertProcessor(do_lower_case=True, is_training=True, n_jobs=-1)


train_examples, train_features = train_processor.fit_transform(X='KorQuAD_v1.0_train.json')

reader = BertQA(train_batch_size=12,
                learning_rate=3e-5,
                num_train_epochs=2,
                do_lower_case=True,
                output_dir='/save')

reader.fit(X=(train_examples, train_features))

reader.model.to('cpu')
reader.device = torch.device('cpu')
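
Since the reader has already been moved to CPU, it can be persisted the same way Example 7 does (the file name is illustrative):

# Save the trained reader inside output_dir ('/save' above) for later reuse.
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa_korquad.joblib'))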
Example #5
import os
import torch
import joblib
from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_squad
import time

start_time = time.time()

print(" \n\n Download SQuAD datasets")
download_squad(dir='./data')
print("--- %s seconds ---" % (time.time() - start_time))

print(" \n\n Preprocess SQuAD 1.1 examples")
train_processor = BertProcessor(do_lower_case=True, is_training=True)
# train_examples, train_features = train_processor.fit_transform(X='./data/SQuAD_1.1/train-v1.1.json')
train_examples, train_features = train_processor.fit_transform(
    X='./data/irs/train-v1.1.json')
print("--- %s seconds ---" % (time.time() - start_time))

print(" \n\n Train the model")
reader = BertQA(
    train_batch_size=1,
    # train_batch_size=12,
    learning_rate=3e-5,
    num_train_epochs=2,
    do_lower_case=True,
    output_dir='models')
reader.fit(X=(train_examples, train_features))
print("--- %s seconds ---" % (time.time() - start_time))
Example #6
class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline

    Parameters
    ----------
    metadata: pandas.DataFrame
        dataframe containing your corpus of documents metadata
        header should be of format: title, paragraphs.
    reader: str (path to a .joblib file) or an instance of BertQA (BERT model with a scikit-learn wrapper), optional
    retrieve_by_doc: bool (default: False)
        Whether the retriever ranks by documents (True) or by paragraphs (False).
    kwargs: kwargs for BertQA(), BertProcessor(), TfidfRetriever() and BM25Retriever()
        Please check the documentation of these classes


    Examples
    --------
    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    """
    def __init__(self,
                 reader=None,
                 retriever="bm25",
                 retrieve_by_doc=False,
                 **kwargs):

        if retriever not in RETRIEVERS:
            raise ValueError(
                "You provided a type of retriever that is not supported. " +
                "Please provide a retriver in the following list: " +
                str(list(RETRIEVERS.keys())))

        retriever_class = RETRIEVERS[retriever]

        # Separating kwargs
        kwargs_bertqa = {
            key: value
            for key, value in kwargs.items()
            if key in BertQA.__init__.__code__.co_varnames
        }

        kwargs_processor = {
            key: value
            for key, value in kwargs.items()
            if key in BertProcessor.__init__.__code__.co_varnames
        }

        kwargs_retriever = {
            key: value
            for key, value in kwargs.items()
            if key in retriever_class.__init__.__code__.co_varnames
        }

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True,
                                             **kwargs_processor)

        self.processor_predict = BertProcessor(is_training=False,
                                               **kwargs_processor)

        self.retriever = retriever_class(**kwargs_retriever)

        self.retrieve_by_doc = retrieve_by_doc

    def fit_retriever(self, X=None, y=None):
        """ Fit the QAPipeline retriever to a list of documents in a dataframe.
         Parameters
        ----------
        X: pandas.Dataframe
            Dataframe with the following columns: "title", "paragraphs"
        """

        if self.retrieve_by_doc:
            self.metadata = X
            self.metadata["content"] = self.metadata["paragraphs"].apply(
                lambda x: " ".join(x))
        else:
            self.metadata = self._expand_paragraphs(X)

        self.retriever.fit(self.metadata["content"])

        return self

    def fit_reader(self, X=None, y=None):
        """ Fit the QAPipeline retriever to a list of documents in a dataframe.

        Parameters
        ----------
        X: pandas.Dataframe
            Dataframe with the following columns: "title", "paragraphs"

        """

        train_examples, train_features = self.processor_train.fit_transform(X)
        self.reader.fit(X=(train_examples, train_features))

        return self

    def predict(self, X=None, return_logit=False, n_predictions=None):
        """ Compute prediction of an answer to a question

        Parameters
        ----------
        X: str or list of strings
            Sample (question) or list of samples to perform a prediction on

        return_logit: boolean (default: False)
            Whether to return the logit of the best answer or not

        Returns
        -------
        If X is a str
        prediction: tuple (answer, title, paragraph)

        If X is a list of strings
        predictions: list of tuples (answer, title, paragraph)

        If return_logit is True, each prediction tuple will have the following
        structure: (answer, title, paragraph, best logit)

        """
        if isinstance(X, str):
            closest_docs_indices = self.retriever.predict(
                X, metadata=self.metadata)
            squad_examples = generate_squad_examples(
                question=X,
                closest_docs_indices=closest_docs_indices,
                metadata=self.metadata,
                retrieve_by_doc=self.retrieve_by_doc,
            )
            examples, features = self.processor_predict.fit_transform(
                X=squad_examples)
            prediction = self.reader.predict((examples, features),
                                             return_logit, n_predictions)
            return prediction

        elif isinstance(X, list):
            predictions = []
            for query in X:
                closest_docs_indices = self.retriever.predict(
                    query, metadata=self.metadata)
                squad_examples = generate_squad_examples(
                    question=query,
                    closest_docs_indices=closest_docs_indices,
                    metadata=self.metadata,
                )
                examples, features = self.processor_predict.fit_transform(
                    X=squad_examples)
                pred = self.reader.predict((examples, features), return_logit,
                                           n_predictions)
                predictions.append(pred)

            return predictions

        else:
            raise TypeError("The input is not a string or a list. \
                            Please provide a string or a list of strings as input"
                            )

    def to(self, device):
        """ Send reader to CPU if device=='cpu' or to GPU if device=='cuda'
        """
        if device not in ("cpu", "cuda"):
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")

        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        """ Send reader to CPU
        """
        self.reader.model.cpu()
        self.reader.device = torch.device("cpu")
        return self

    def cuda(self):
        """ Send reader to GPU
        """
        self.reader.model.cuda()
        self.reader.device = torch.device("cuda")
        return self

    def dump_reader(self, filename):
        """ Dump reader model to a .joblib object
        """
        joblib.dump(self.reader, filename)

    @staticmethod
    def _expand_paragraphs(df):
        # Snippet taken from: https://stackoverflow.com/a/48532692/11514226
        lst_col = "paragraphs"
        df = pd.DataFrame({
            col: np.repeat(df[col].values, df[lst_col].str.len())
            for col in df.columns.drop(lst_col)
        }).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]
        df["content"] = df["paragraphs"]
        return df.drop("paragraphs", axis=1)
Example #7
import os
import torch
import joblib
from cdqa.reader import BertProcessor, BertQA
from cdqa.utils.download import download_squad

dataroot = "D:/datasets/sqad/"
download_squad(dir=dataroot)

train_processor = BertProcessor(do_lower_case=True, is_training=True)
train_examples, train_features = train_processor.fit_transform(
    X=os.path.join(dataroot, "SQuAD_1.1", "train-v1.1.json"))

reader = BertQA(train_batch_size=12,
                learning_rate=3e-5,
                num_train_epochs=2,
                do_lower_case=True,
                output_dir='models')

reader.fit(X=(train_examples, train_features))

reader.model.to('cpu')
reader.device = torch.device('cpu')

joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa.joblib'))
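
To reuse the dumped model later, it can be reloaded with joblib and moved back to GPU for inference if one is available, following the same device-handling pattern as the pipeline classes above:

# Reload the dumped reader; send it back to GPU for inference when possible.
reader = joblib.load(os.path.join('models', 'bert_qa.joblib'))
if torch.cuda.is_available():
    reader.model.cuda()
    reader.device = torch.device('cuda')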