Example #1
0
class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline

    Parameters
    ----------
    reader: str (path to .joblib) or .joblib object of an instance of BertQA (BERT model with sklearn wrapper), optional
        When omitted, a fresh BertQA reader is created from the relevant kwargs.
    kwargs: kwargs for BertQA(), BertProcessor() and TfidfRetriever()
        Please check documentation for these classes

    Attributes
    ----------
    metadata: pandas.DataFrame
        Set by `fit`. A copy of the corpus dataframe (columns: title,
        paragraphs) with an added 'content' column where each document's
        paragraphs are joined into a single string.

    Examples
    --------
    >>> from cdqa.pipeline.qa_pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    >>> from cdqa.pipeline.qa_pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    """

    @staticmethod
    def _select_kwargs(component, kwargs):
        # Keep only the kwargs that the component's __init__ accepts, so one
        # shared **kwargs can be routed to several constructors.
        accepted = component.__init__.__code__.co_varnames
        return {key: value for key, value in kwargs.items() if key in accepted}

    def __init__(self, reader=None, **kwargs):

        # Separate the shared kwargs per component.
        kwargs_bertqa = self._select_kwargs(BertQA, kwargs)
        kwargs_processor = self._select_kwargs(BertProcessor, kwargs)
        kwargs_retriever = self._select_kwargs(TfidfRetriever, kwargs)

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            # A string is interpreted as a path to a serialized BertQA model.
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True,
                                             **kwargs_processor)

        self.processor_predict = BertProcessor(is_training=False,
                                               **kwargs_processor)

        self.retriever = TfidfRetriever(**kwargs_retriever)

    def fit(self, X=None, y=None):
        """ Fit the QAPipeline retriever to a list of documents in a dataframe.

        Parameters
        ----------
        X: pandas.Dataframe
            Dataframe with the following columns: "title", "paragraphs"
        y: ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """

        # Work on a copy so the caller's dataframe is not mutated in place.
        self.metadata = X.copy()
        self.metadata['content'] = self.metadata['paragraphs'].apply(
            lambda x: ' '.join(x))
        self.retriever.fit(self.metadata['content'])

        return self

    def fit_reader(self, X=None, y=None):
        """Train the reader (BertQA instance) of QAPipeline object

        Parameters
        ----------
        X: str
            path to json file in SQuAD format

        Returns
        -------
        self
        """

        train_examples, train_features = self.processor_train.fit_transform(X)
        self.reader.fit(X=(train_examples, train_features))

        return self

    def _predict_single(self, question):
        # Retrieve the closest documents for the question, convert them to
        # SQuAD-style examples, and let the reader extract the answer.
        closest_docs_indices = self.retriever.predict(
            question, metadata=self.metadata)
        squad_examples = generate_squad_examples(
            question=question,
            closest_docs_indices=closest_docs_indices,
            metadata=self.metadata)
        examples, features = self.processor_predict.fit_transform(
            X=squad_examples)
        return self.reader.predict((examples, features))

    def predict(self, X=None):
        """ Compute prediction of an answer to a question

        Parameters
        ----------
        X: str or list of strings
            Sample (question) or list of samples to perform a prediction on

        Returns
        -------
        If X is str
        prediction: tuple (answer, title, paragraph)

        If X is list of strings
        predictions: list of tuples (answer, title, paragraph)

        Raises
        ------
        TypeError
            If X is neither a string nor a list.

        """
        if isinstance(X, str):
            return self._predict_single(X)

        elif isinstance(X, list):
            return [self._predict_single(query) for query in X]

        else:
            raise TypeError("The input is not a string or a list. "
                            "Please provide a string or a list of strings "
                            "as input")

    def to(self, device):
        ''' Send reader to CPU if device=='cpu' or to GPU if device=='cuda'
        '''
        if device not in ('cpu', 'cuda'):
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")

        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        ''' Send reader to CPU
        '''
        self.reader.model.cpu()
        self.reader.device = torch.device('cpu')
        return self

    def cuda(self):
        ''' Send reader to GPU
        '''
        self.reader.model.cuda()
        self.reader.device = torch.device('cuda')
        return self
Example #2
0
class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline

    Parameters
    ----------
    reader: str (path to .joblib) or .joblib object of an instance of BertQA (BERT model with sklearn wrapper), optional
        When omitted, a fresh BertQA reader is created from the relevant kwargs.
    retrieve_by_doc: bool (default: True). If True the Retriever ranks whole
        documents; if False it ranks individual paragraphs.
    kwargs: kwargs for BertQA(), BertProcessor() and TfidfRetriever()
        Please check documentation for these classes

    Attributes
    ----------
    metadata: pandas.DataFrame
        Set by `fit_retriever`; corpus with a 'content' column used by the
        TF-IDF retriever (one row per document or per paragraph depending on
        `retrieve_by_doc`).

    Examples
    --------
    >>> from cdqa.pipeline.qa_pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    >>> from cdqa.pipeline.qa_pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit_retriever(X=df)
    >>> prediction = qa_pipeline.predict(X='When BNP Paribas was created?')

    """

    @staticmethod
    def _select_kwargs(component, kwargs):
        # Keep only the kwargs that the component's __init__ accepts, so one
        # shared **kwargs can be routed to several constructors.
        accepted = component.__init__.__code__.co_varnames
        return {key: value for key, value in kwargs.items() if key in accepted}

    def __init__(self, reader=None, retrieve_by_doc=True, **kwargs):

        # Separate the shared kwargs per component.
        kwargs_bertqa = self._select_kwargs(BertQA, kwargs)
        kwargs_processor = self._select_kwargs(BertProcessor, kwargs)
        kwargs_retriever = self._select_kwargs(TfidfRetriever, kwargs)

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            # A string is interpreted as a path to a serialized BertQA model.
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True,
                                             **kwargs_processor)

        self.processor_predict = BertProcessor(is_training=False,
                                               **kwargs_processor)

        self.retriever = TfidfRetriever(**kwargs_retriever)

        self.retrieve_by_doc = retrieve_by_doc

    def fit_retriever(self, X=None, y=None):
        """ Fit the QAPipeline retriever to a list of documents in a dataframe.

        Parameters
        ----------
        X: pandas.Dataframe
            Dataframe with the following columns: "title", "paragraphs"
        y: ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """

        if self.retrieve_by_doc:
            # Work on a copy so the caller's dataframe is not mutated.
            self.metadata = X.copy()
            self.metadata["content"] = self.metadata["paragraphs"].apply(
                lambda x: " ".join(x))
        else:
            # One row per paragraph instead of one row per document.
            self.metadata = self._expand_paragraphs(X)

        self.retriever.fit(self.metadata["content"])

        return self

    def fit_reader(self, X=None, y=None):
        """ Train the reader (BertQA instance) of the QAPipeline object.

        Parameters
        ----------
        X: str
            path to json file in SQuAD format

        Returns
        -------
        self
        """

        train_examples, train_features = self.processor_train.fit_transform(X)
        self.reader.fit(X=(train_examples, train_features))

        return self

    def _predict_single(self, question, return_logit, n_predictions):
        # Retrieve the closest documents for the question, convert them to
        # SQuAD-style examples, and let the reader extract the answer.
        # `retrieve_by_doc` is always forwarded so single and batch
        # predictions use the same retrieval granularity.
        closest_docs_indices = self.retriever.predict(
            question, metadata=self.metadata)
        squad_examples = generate_squad_examples(
            question=question,
            closest_docs_indices=closest_docs_indices,
            metadata=self.metadata,
            retrieve_by_doc=self.retrieve_by_doc)
        examples, features = self.processor_predict.fit_transform(
            X=squad_examples)
        return self.reader.predict((examples, features), return_logit,
                                   n_predictions)

    def predict(self, X=None, return_logit=False, n_predictions=None):
        """ Compute prediction of an answer to a question

        Parameters
        ----------
        X: str or list of strings
            Sample (question) or list of samples to perform a prediction on

        return_logit: boolean
            Whether to return logit of best answer or not. Default: False

        n_predictions: int or None
            Forwarded to the reader; number of predictions to return.

        Returns
        -------
        If X is str
        prediction: tuple (answer, title, paragraph)

        If X is list of strings
        predictions: list of tuples (answer, title, paragraph)

        If return_logit is True, each prediction tuple will have the following
        structure: (answer, title, paragraph, best logit)

        Raises
        ------
        TypeError
            If X is neither a string nor a list.

        """
        if isinstance(X, str):
            return self._predict_single(X, return_logit, n_predictions)

        elif isinstance(X, list):
            return [
                self._predict_single(query, return_logit, n_predictions)
                for query in X
            ]

        else:
            raise TypeError("The input is not a string or a list. "
                            "Please provide a string or a list of strings "
                            "as input")

    def to(self, device):
        """ Send reader to CPU if device=='cpu' or to GPU if device=='cuda'
        """
        if device not in ("cpu", "cuda"):
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")

        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        """ Send reader to CPU
        """
        self.reader.model.cpu()
        self.reader.device = torch.device("cpu")
        return self

    def cuda(self):
        """ Send reader to GPU
        """
        self.reader.model.cuda()
        self.reader.device = torch.device("cuda")
        return self

    def dump_reader(self, filename):
        """ Dump reader model to a .joblib object
        """
        joblib.dump(self.reader, filename)

    @staticmethod
    def _expand_paragraphs(df):
        # Explode the list-valued 'paragraphs' column into one row per
        # paragraph, repeating the other columns for each row.
        # Snippet taken from: https://stackoverflow.com/a/48532692/11514226
        lst_col = "paragraphs"
        df = pd.DataFrame({
            col: np.repeat(df[col].values, df[lst_col].str.len())
            for col in df.columns.drop(lst_col)
        }).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]
        df["content"] = df["paragraphs"]
        return df.drop("paragraphs", axis=1)
Example #3
0
# Export the corpus dataframe (df_X, defined earlier in the notebook) to a
# SQuAD-v1.1-format json file for annotation.
from cdqa.utils.converters import df2squad
json_data = df2squad(df=df_X, squad_version='v1.1', output_dir='.', filename='qna_tim_ferriss')
# From there we can use https://github.com/cdqa-suite/cdQA-annotator to create a supervised problem

#%% [markdown]
# ### Fine-Tuning

#%%
# Fine-tune Bert model with squad v1.1 custom data set of Tim Ferriss questions
import os
import torch
# NOTE(review): `sklearn.externals.joblib` is deprecated (removed in
# scikit-learn 0.23) — prefer `import joblib` directly.
from sklearn.externals import joblib
from cdqa.reader.bertqa_sklearn import BertProcessor, BertQA

# Convert the annotated SQuAD json into BERT training examples/features.
train_processor = BertProcessor(do_lower_case=True, is_training=True)
train_examples, train_features = train_processor.fit_transform(X='cdqa-v1.1-tim_qna.json')

# Reader hyperparameters: 2 epochs at lr=3e-5, batches of 12; checkpoints
# are written to the 'models' directory.
reader = BertQA(train_batch_size=12,
                learning_rate=3e-5,
                num_train_epochs=2,
                do_lower_case=True,
                output_dir='models')

reader.fit(X=(train_examples, train_features))

# Output fine-tuned model: move weights to CPU first so the joblib dump can
# be loaded on machines without a GPU.
reader.model.to('cpu')
reader.device = torch.device('cpu')
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_tim_qa_vCPU.joblib'))

#%% [markdown]
Example #4
0
# Train a BertQA reader on the official SQuAD v1.1 training set and save
# both a GPU and a CPU version of the fitted model.
import os
import torch
# NOTE(review): `sklearn.externals.joblib` is deprecated (removed in
# scikit-learn 0.23) — prefer `import joblib` directly.
from sklearn.externals import joblib
from cdqa.reader.bertqa_sklearn import BertProcessor, BertQA

# pre-process examples
train_processor = BertProcessor(do_lower_case=True, is_training=True)
train_examples, train_features = train_processor.fit_transform(
    X='data/train-v1.1.json')

# train the model (2 epochs, lr=3e-5, batch size 12, no mixed precision;
# checkpoints go to the 'models' directory)
reader = BertQA(train_batch_size=12,
                learning_rate=3e-5,
                num_train_epochs=2,
                do_lower_case=True,
                fp16=False,
                output_dir='models')

reader.fit(X=(train_examples, train_features))

# save GPU version locally
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa_vGPU.joblib'))

# send current reader model to CPU so the second dump loads on CPU-only hosts
reader.model.to('cpu')
reader.device = torch.device('cpu')

# save CPU it locally
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa_vCPU.joblib'))