def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
    """
    Evaluate the trained model on held-out data.

    Returns the accuracy averaged over batches (each batch weighted
    equally, regardless of its size).

    :param test_data: test data to evaluate the model on
    :type test_data: cudf.Series
    :param labels: label for each element of test_data
    :type labels: cudf.Series
    :param max_seq_len: limit on the tokenized sequence length; shorter
        sentences are padded with 0s, longer ones truncated to max_seq_len
    :type max_seq_len: int
    :param batch_size: batch size
    :type batch_size: int

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
    >>> sc.evaluate_model(emails_test, labels_test)
    """
    # Inference mode: disables dropout etc. for deterministic scoring.
    self._model.eval()

    eval_gdf = cudf.DataFrame()
    eval_gdf["text"] = test_data
    eval_gdf["label"] = labels

    loader = DataLoader(Dataset(eval_gdf), batchsize=batch_size)

    total_accuracy = 0
    num_batches = 0
    for chunk in loader.get_chunks():
        input_ids, attention_masks = self._bert_uncased_tokenize(
            chunk["text"], max_seq_len)
        batch_labels = torch.tensor(chunk["label"].to_numpy())

        # No gradients needed for evaluation.
        with torch.no_grad():
            scores = self._model(input_ids,
                                 token_type_ids=None,
                                 attention_mask=attention_masks)[0]

        # Hand logits and labels to cuPy (zero-copy via DLPack) so the
        # accuracy helper can operate on device arrays.
        scores = scores.type(torch.DoubleTensor).to(self._device)
        scores = cupy.fromDlpack(to_dlpack(scores))
        true_ids = batch_labels.type(torch.IntTensor).to(self._device)
        true_ids = cupy.fromDlpack(to_dlpack(true_ids))

        total_accuracy += self._flatten_accuracy(scores, true_ids)
        num_batches += 1

    return float(total_accuracy / num_batches)
def predict(self, input_data, max_seq_len=128, batch_size=32, threshold=0.5):
    """
    Predict the class for each input text with the trained model.

    :param input_data: input text data for prediction
    :type input_data: cudf.Series
    :param max_seq_len: limit on the tokenized sequence length; shorter
        sentences are padded with 0s, longer ones truncated to max_seq_len
    :type max_seq_len: int
    :param batch_size: batch size
    :type batch_size: int
    :param threshold: results with probabilities higher than this will be
        labeled as positive
    :type threshold: float
    :return: predictions, probabilities: predictions are labels (0 or 1)
        based on minimum threshold
    :rtype: cudf.Series, cudf.Series

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
    >>> sc.train_model(emails_train, labels_train)
    >>> predictions = sc.predict(emails_test, threshold=0.8)
    """
    infer_gdf = cudf.DataFrame()
    infer_gdf["text"] = input_data

    loader = DataLoader(Dataset(infer_gdf), batchsize=batch_size)

    predictions = cudf.Series()
    probabilities = cudf.Series()

    # Inference mode: disables dropout etc.
    self._model.eval()
    for chunk in loader.get_chunks():
        input_ids, attention_masks = self._bert_uncased_tokenize(
            chunk["text"], max_seq_len)

        with torch.no_grad():
            scores = self._model(input_ids,
                                 token_type_ids=None,
                                 attention_mask=attention_masks)[0]

        # Probability of the positive class (column 1 of the logits),
        # thresholded to a boolean prediction.
        pos_probs = torch.sigmoid(scores[:, 1])
        pos_preds = pos_probs.ge(threshold)

        # DLPack hand-off keeps the results on-device while converting
        # the torch tensors into cudf Series.
        probabilities = probabilities.append(cudf.io.from_dlpack(to_dlpack(pos_probs)))
        predictions = predictions.append(cudf.io.from_dlpack(to_dlpack(pos_preds)))

    return predictions, probabilities
def train_model(
    self,
    train_data,
    labels,
    learning_rate=3e-5,
    max_seq_len=128,
    batch_size=32,
    epochs=5,
):
    """
    Train the classifier.

    :param train_data: text data for training
    :type train_data: cudf.Series
    :param labels: labels for each element in train_data
    :type labels: cudf.Series
    :param learning_rate: learning rate
    :type learning_rate: float
    :param max_seq_len: Limits the length of the sequence returned by
        tokenizer. If tokenized sentence is shorter than max_seq_len,
        output will be padded with 0s. If the tokenized sentence is longer
        than max_seq_len it will be truncated to max_seq_len.
    :type max_seq_len: int
    :param batch_size: batch size
    :type batch_size: int
    :param epochs: number of training epochs, default is 5
    :type epochs: int

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
    >>> sc.train_model(emails_train, labels_train)
    """
    train_gdf = cudf.DataFrame()
    train_gdf["text"] = train_data
    train_gdf["label"] = labels

    train_dataset = Dataset(train_gdf)
    train_dataloader = DataLoader(train_dataset, batchsize=batch_size)

    self._config_optimizer(learning_rate)
    self._model.train()  # Enable training mode
    # Tokenizer is (re)built here so training always uses the current
    # hash vocabulary at self._hashpath.
    self._tokenizer = SubwordTokenizer(self._hashpath, do_lower_case=True)

    for _ in trange(epochs, desc="Epoch"):
        tr_loss = 0  # Running loss for this epoch
        nb_tr_steps = 0  # Batch count, used to average the epoch loss
        for df in train_dataloader.get_chunks():
            b_input_ids, b_input_mask = self._bert_uncased_tokenize(
                df["text"], max_seq_len)
            b_labels = torch.tensor(df["label"].to_numpy())

            self._optimizer.zero_grad()  # Clear out the gradients
            # Forward pass; passing labels makes the model return the loss
            # as the first output.
            loss = self._model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask,
                               labels=b_labels)[0]
            # .sum() collapses a per-device loss vector (e.g. under
            # DataParallel) to a scalar before backprop.
            loss.sum().backward()
            self._optimizer.step()  # update parameters

            tr_loss += loss.sum().item()  # get a numeric value
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))
    # NOTE(review): this chunk begins mid-expression — the opening of the
    # `test_df = cudf.DataFrame({...` literal (and the start of its "domain"
    # column) lies outside this view; confirm against the full file.
    "bankmobile.com",
],
    # Expected label per domain, same order as the "domain" column above.
    "type": [1, 1, 0, 1],
})

# Expected first chunk when the 4-row frame is split by the test batch size.
expected_part_df1 = cudf.DataFrame({
    "domain": [
        "studytour.com.tw",
        "cnn.com",
    ],
    "type": [1, 1],
})

# Expected second (final) chunk.
expected_part_df2 = cudf.DataFrame({
    "domain": [
        "bakercityherald.com",
        "bankmobile.com",
    ],
    "type": [0, 1],
})

# Module-level fixtures shared by the test below.
dataset = Dataset(test_df)
dataloader = DataLoader(dataset, batchsize=test_batchsize)


def test_get_chunks():
    # DataLoader.get_chunks() should split the 4-row frame into exactly two
    # batches whose contents (ignoring the index) match the expected frames.
    df_parts = []
    for df_part in dataloader.get_chunks():
        df_parts.append(df_part)
    assert len(df_parts) == 2
    assert df_parts[0].reset_index(drop=True).equals(expected_part_df1)
    assert df_parts[1].reset_index(drop=True).equals(expected_part_df2)