def test_regular_encode(self):
    """Check regular_encode output for two short sentences at a fixed length of 20."""
    len_encoding = 20
    sentences = ["this is a test", "so is this"]

    # Each expected row is its leading token ids followed by 1s out to
    # len_encoding (presumably 1 is the tokenizer's padding id -- confirm).
    leading_ids = [
        [0, 9226, 16, 10, 1296, 2],
        [0, 2527, 16, 42, 2],
    ]
    expected_encoded_input = np.array(
        [ids + [1] * (len_encoding - len(ids)) for ids in leading_ids])

    encoded_input = regular_encode(sentences, self.test_tokenizer, len_encoding)

    elementwise_match = encoded_input == expected_encoded_input
    self.assertTrue(elementwise_match.all())
def make_predictions(df: pd.DataFrame,
                     model,
                     model_name: str,
                     max_len: int = 512,
                     method: str = "multiclass"):
    """
    Make predictions using trained model and data to predict on.

    :param df: Pandas DataFrame containing data to predict on. It must have columns
        named "text1" and "text2" for the two claims; their values are concatenated
        with string markers, so they must be strings. The frame is modified in place.
    :param model: end-to-end trained Transformer model (only its ``predict`` method is used)
    :param model_name: name of model to be loaded by Transformer to get proper tokenizer
    :param max_len: max length of string to be encoded
    :param method: "multiclass" or "binary"--describes setting for prediction outputs
    :return: the same Pandas DataFrame augmented with predictions made using trained model.
        For "multiclass": columns predicted_con, predicted_ent, predicted_neu and
        predicted_class. For "binary": column predicted_con only.
    :raises ValueError: if ``method`` is neither "multiclass" nor "binary"
    """
    # Reject a bad method before paying for the tokenizer load and model inference.
    if method not in ("multiclass", "binary"):
        raise ValueError(
            f"{method} not a valid method type. Must be \"multiclass\" or \"binary\""
        )

    # First insert the CLS and SEP tokens.
    # Iterate over the column values directly rather than df.loc[i, ...] so the
    # function also works when df does not carry a default 0..n-1 index.
    inputs = [
        '[CLS]' + text1 + '[SEP]' + text2
        for text1, text2 in zip(df['text1'], df['text2'])
    ]

    # Then make predictions
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoded_inputs = regular_encode(inputs, tokenizer, maxlen=max_len)
    predictions = model.predict(encoded_inputs)

    if method == "multiclass":
        # NOTE(review): the column order (0 = neutral, 1 = entailment,
        # 2 = contradiction) was flagged as unverified by the original author;
        # confirm it against the label encoding used at training time.
        df['predicted_con'] = predictions[:, 2]
        df['predicted_ent'] = predictions[:, 1]
        df['predicted_neu'] = predictions[:, 0]

        # Calculate predicted class as the max predicted label. The result is
        # assigned back explicitly: an in-place replace on a Series pulled out
        # of the frame is not guaranteed to update the frame itself.
        score_columns = ['predicted_con', 'predicted_ent', 'predicted_neu']
        df['predicted_class'] = df[score_columns].idxmax(axis=1).replace(
            to_replace={
                'predicted_con': 'contradiction',
                'predicted_ent': 'entailment',
                'predicted_neu': 'neutral'
            })
    else:
        # Item assignment is required here: attribute assignment
        # (df.predicted_con = ...) does not create a new column.
        df['predicted_con'] = predictions[:, 0]

    return df
def make_predictions(df: pd.DataFrame,
                     model,
                     model_name: str,
                     max_len: int = 512,
                     multi_class: bool = True):
    """
    Make predictions using trained model and data to predict on.

    :param df: Pandas DataFrame containing data to predict on. It must have columns
        named "text1" and "text2" for the two claims; their values are concatenated
        with string markers, so they must be strings. The frame is modified in place.
    :param model: end-to-end trained Transformer model (only its ``predict`` method is used)
    :param model_name: name of model to be loaded by Transformer to get proper tokenizer
    :param max_len: max length of string to be encoded
    :param multi_class: if True, each claim pair is encoded once and the three class
        scores are read from the prediction columns. If False, the binary method with
        auxiliary input is used: each pair is encoded three times, once per category
        tag (CON, ENT, NEU), and one score is read per encoded input.
    :return: the same Pandas DataFrame augmented with columns predicted_con,
        predicted_ent, predicted_neu and predicted_class
    """
    # First insert the CLS and SEP tokens.
    # Iterate over the column values directly rather than df.loc[i, ...] so the
    # function also works when df does not carry a default 0..n-1 index.
    claim_pairs = [
        '[CLS]' + text1 + '[SEP]' + text2
        for text1, text2 in zip(df['text1'], df['text2'])
    ]

    if multi_class:
        inputs = claim_pairs
    else:
        # Add the category info (CON, ENT, NEU) as auxiliary text at the end.
        # The tag is the outer loop, so the inputs are laid out as all CON rows,
        # then all ENT rows, then all NEU rows; the slicing below relies on that.
        inputs = [
            pair + '[SEP]' + tag
            for tag in ('CON', 'ENT', 'NEU')
            for pair in claim_pairs
        ]

    # Then make predictions
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoded_inputs = regular_encode(inputs,
                                    tokenizer,
                                    maxlen=max_len,
                                    multi_class=multi_class)
    predictions = model.predict(encoded_inputs)

    if multi_class:
        # NOTE(review): assumes prediction columns are ordered
        # 0 = neutral, 1 = entailment, 2 = contradiction -- confirm against
        # the label encoding used at training time.
        df['predicted_con'] = predictions[:, 2]
        df['predicted_ent'] = predictions[:, 1]
        df['predicted_neu'] = predictions[:, 0]
    else:
        # Note: For the binary method using auxiliary input, after retrieving the
        # prediction probability for each class, we structure the prediction output
        # dataframe in the same format as the multiclass method.
        # NOTE(review): assumes predictions holds one score per encoded input, in
        # input order -- confirm the model's output shape.
        n_rows = len(df)
        df['predicted_con'] = predictions[0:n_rows]
        df['predicted_ent'] = predictions[n_rows:(2 * n_rows)]
        df['predicted_neu'] = predictions[(2 * n_rows):]

    # Calculate predicted class as the max predicted label. The result is
    # assigned back explicitly: an in-place replace on a Series pulled out of
    # the frame is not guaranteed to update the frame itself.
    score_columns = ['predicted_con', 'predicted_ent', 'predicted_neu']
    df['predicted_class'] = df[score_columns].idxmax(axis=1).replace(
        to_replace={
            'predicted_con': 'contradiction',
            'predicted_ent': 'entailment',
            'predicted_neu': 'neutral'
        })

    return df