Ejemplo n.º 1
0
    def test_regular_encode(self):
        """Test that encoding is done properly.

        Encodes two short sentences with ``self.test_tokenizer`` to a fixed
        length of 20 and compares the result against hard-coded token ids.
        """
        test_input = ["this is a test", "so is this"]
        len_encoding = 20
        encoded_input = regular_encode(test_input, self.test_tokenizer, len_encoding)
        # Every expected row is exactly ``len_encoding`` ids long: it starts
        # with 0, the sentence ends with 2, and the rest is filled with 1
        # (presumably the tokenizer's start/end/pad ids -- TODO confirm).
        expected_encoded_input = np.array([[0, 9226, 16, 10, 1296, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                                           [0, 2527, 16, 42, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

        # assert_array_equal rejects shape mismatches instead of broadcasting
        # and reports which elements differ, unlike assertTrue(... .all()).
        np.testing.assert_array_equal(encoded_input, expected_encoded_input)
def make_predictions(df: pd.DataFrame,
                     model,
                     model_name: str,
                     max_len: int = 512,
                     method: str = "multiclass"):
    """
    Make predictions using trained model and data to predict on.

    The prediction columns are written into ``df`` itself, which is also
    returned.

    :param df: Pandas DataFrame containing data to predict on; must have
        columns named "text1" and "text2" holding the two claims as strings
    :param model: end-to-end trained Transformer model (its ``predict`` method is called)
    :param model_name: name of model to be loaded by Transformer to get proper tokenizer
    :param max_len: max length of string to be encoded
    :param method: "multiclass" or "binary"--describes setting for prediction outputs
    :return: Pandas DataFrame augmented with predictions made using trained model
    :raises ValueError: if ``method`` is neither "multiclass" nor "binary"
    """
    # Validate up front so a typo in ``method`` fails before the tokenizer is
    # loaded and inference is run.
    if method not in ("multiclass", "binary"):
        raise ValueError(
            f"{method} not a valid method type. Must be \"multiclass\" or \"binary\""
        )

    # First insert the CLS and SEP tokens.
    # Iterate over the column values in row order rather than df.loc[i, ...]:
    # label lookups by 0..n-1 break (or misalign with the positional
    # prediction assignment below) when df has a non-default index.
    inputs = [
        '[CLS]' + text1 + '[SEP]' + text2
        for text1, text2 in zip(df['text1'], df['text2'])
    ]

    # Then make predictions
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoded_inputs = regular_encode(inputs, tokenizer, maxlen=max_len)
    predictions = model.predict(encoded_inputs)

    if method == "multiclass":
        # NOTE(review): the column order (0 = neutral, 1 = entailment,
        # 2 = contradiction) was flagged as unverified by the original
        # author -- confirm against the label encoding used in training.
        df['predicted_con'] = predictions[:, 2]
        df['predicted_ent'] = predictions[:, 1]
        df['predicted_neu'] = predictions[:, 0]
        # Calculate predicted class as the max predicted label.
        # Assign the replaced Series back instead of calling
        # replace(inplace=True) on df.predicted_class, which is a chained
        # in-place update and does not reliably modify df.
        df['predicted_class'] = df[[
            'predicted_con', 'predicted_ent', 'predicted_neu'
        ]].idxmax(axis=1).replace({
            'predicted_con': 'contradiction',
            'predicted_ent': 'entailment',
            'predicted_neu': 'neutral'
        })
    else:
        # Use item assignment: attribute-style assignment (df.predicted_con = ...)
        # does not create a new column, so the predictions would be lost.
        df['predicted_con'] = predictions[:, 0]

    return df
Ejemplo n.º 3
0
def make_predictions(df: pd.DataFrame,
                     model,
                     model_name: str,
                     max_len: int = 512,
                     multi_class: bool = True):
    """
    Make predictions using trained model and data to predict on.

    The prediction columns are written into ``df`` itself, which is also
    returned.

    :param df: Pandas DataFrame containing data to predict on; must have
        columns named "text1" and "text2" holding the two claims as strings
    :param model: end-to-end trained Transformer model (its ``predict`` method is called)
    :param model_name: name of model to be loaded by Transformer to get proper tokenizer
    :param max_len: max length of string to be encoded
    :param multi_class: True for the multiclass setting (one input per row);
        False for the binary setting, where each row is encoded three times
        with "CON", "ENT" and "NEU" appended as auxiliary text
    :return: Pandas DataFrame augmented with predictions made using trained model
    """
    # First insert the CLS and SEP tokens.
    # Iterate over the column values in row order rather than df.loc[i, ...]:
    # label lookups by 0..n-1 break (or misalign with the positional
    # prediction assignment below) when df has a non-default index.
    pair_inputs = [
        '[CLS]' + text1 + '[SEP]' + text2
        for text1, text2 in zip(df['text1'], df['text2'])
    ]

    if multi_class:
        inputs = pair_inputs
    else:
        # Add the category info (CON, ENT, NEU) as auxiliary text at the end.
        # The label is the outer loop so the inputs are laid out as all CON
        # rows, then all ENT rows, then all NEU rows; the slicing of the
        # predictions below relies on this order.
        inputs = [
            pair + '[SEP]' + label
            for label in ('CON', 'ENT', 'NEU')
            for pair in pair_inputs
        ]

    # Then make predictions
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoded_inputs = regular_encode(inputs,
                                    tokenizer,
                                    maxlen=max_len,
                                    multi_class=multi_class)
    predictions = model.predict(encoded_inputs)

    if multi_class:
        # NOTE(review): assumes prediction columns are ordered
        # 0 = neutral, 1 = entailment, 2 = contradiction -- confirm against
        # the label encoding used in training.
        df['predicted_con'] = predictions[:, 2]
        df['predicted_ent'] = predictions[:, 1]
        df['predicted_neu'] = predictions[:, 0]
    else:
        # Note: For the binary method using auxiliary input, after retrieving the prediction probability
        # for each class, we structure the prediction output dataframe in the same format
        # as the multiclass method.
        # NOTE(review): assumes model.predict returns one score per input, in
        # input order -- confirm the output shape.
        n_rows = len(df)
        df['predicted_con'] = predictions[:n_rows]
        df['predicted_ent'] = predictions[n_rows:2 * n_rows]
        df['predicted_neu'] = predictions[2 * n_rows:]

    # Calculate predicted class as the max predicted label.
    # Assign the replaced Series back instead of calling replace(inplace=True)
    # on df.predicted_class, which is a chained in-place update and does not
    # reliably modify df.
    df['predicted_class'] = df[[
        'predicted_con', 'predicted_ent', 'predicted_neu'
    ]].idxmax(axis=1).replace({
        'predicted_con': 'contradiction',
        'predicted_ent': 'entailment',
        'predicted_neu': 'neutral'
    })

    return df