def evaluate_model(
    self,
    emails,
    labels,
    max_num_sentences=1000000,
    max_num_chars=100000000,
    max_rows_tensor=1000000,
    max_seq_len=128,
    batch_size=32,
):
    """
    Score the trained BERT model on a labelled set of emails.

    The emails are tokenized, run through ``self._evaluate_testset`` in
    sequential batches, and the arg-max of each output row is compared
    with the supplied labels.

    :param emails: dataframe where each row contains one column holding email text
    :type emails: cudf.DataFrame
    :param labels: series holding the label for each row of ``emails``
    :type labels: cudf.Series
    :param max_num_sentences: maximum number of sentences to be encoded by tokenizer in one batch
    :type max_num_sentences: int
    :param max_num_chars: maximum number of characters passed to tokenizer
    :type max_num_chars: int
    :param max_rows_tensor: maximum number of rows in a tokenizer output tensor
    :type max_rows_tensor: int
    :param max_seq_len: Limits the length of the sequence returned by tokenizer.
        Shorter tokenized sentences are padded with 0s, longer ones are
        truncated to max_seq_len.
    :type max_seq_len: int
    :param batch_size: number of rows per evaluation batch
    :type batch_size: int
    :return: value returned by ``accuracy_score(true_labels, predicted_labels)``

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
    >>> phish_detect.evaluate_model(emails_test, labels_test)
    """
    # Third value returned by the tokenizer (metadata) is not needed here.
    token_ids, attention_masks, _ = tokenizer.tokenize_df(
        emails,
        self._hashpath,
        max_sequence_length=max_seq_len,
        max_num_sentences=max_num_sentences,
        max_num_chars=max_num_chars,
        max_rows_tensor=max_rows_tensor,
        do_truncate=True,
    )
    label_tensor = torch.tensor(labels.to_array())

    # Sequential sampling keeps outputs in the same order as the labels.
    dataset = TensorDataset(token_ids, attention_masks, label_tensor)
    loader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=batch_size,
    )

    tests, true_labels = self._evaluate_testset(loader)

    # Both results are nested one level (presumably one entry per batch --
    # confirm against _evaluate_testset); flatten them to one entry per row.
    flat_outputs = []
    for sublist in tests:
        flat_outputs.extend(sublist)
    predicted = np.argmax(flat_outputs, axis=1).flatten()

    flat_true_labels = []
    for sublist in true_labels:
        flat_true_labels.extend(sublist)

    return accuracy_score(flat_true_labels, predicted)
def test_tokenize_df(tmpdir, input_sentence):
    """Compare ``tokenizer.tokenize_df`` output with the expected tensors.

    The input sentence is written to a temporary file, read back with
    ``cudf.read_csv`` and tokenized with default arguments. Nothing is
    checked when CUDA is unavailable: the test simply returns.
    """
    if not torch.cuda.is_available():
        return

    expected_tokens, expected_masks, expected_metadata = get_expected()

    sentence_file = tmpdir.mkdir("tmp_test_tokenizer").join("test1.txt")
    sentence_file.write(input_sentence)
    # Sanity check that the fixture text round-trips through the file.
    assert sentence_file.read() == input_sentence

    frame = cudf.read_csv(sentence_file, header=None)
    actual_tokens, actual_masks, actual_metadata = tokenizer.tokenize_df(frame)

    comparisons = (
        (actual_tokens, expected_tokens),
        (actual_masks, expected_masks),
        (actual_metadata, expected_metadata),
    )
    for actual, expected in comparisons:
        assert actual.equal(expected)
def train_model(
    self,
    emails,
    labels,
    max_num_sentences=1000000,
    max_num_chars=100000000,
    max_rows_tensor=1000000,
    learning_rate=3e-5,
    max_seq_len=128,
    batch_size=32,
    epochs=5,
):
    """
    Train the classifier

    The data is split 80/20 into training and validation sets (fixed
    ``random_state=2``), tokenized, wrapped in dataloaders and handed to
    ``self._train``. The resulting model is stored on ``self._model``.
    The caller's ``emails`` dataframe is not modified.

    :param emails: dataframe where each row contains one column holding email text
    :type emails: cudf.DataFrame
    :param labels: series holding labels for each row in email dataframe
    :type labels: cudf.Series
    :param max_num_sentences: maximum number of sentences to be encoded by tokenizer in one batch
    :type max_num_sentences: int
    :param max_num_chars: maximum number of characters passed to tokenizer
    :type max_num_chars: int
    :param max_rows_tensor: maximum number of rows in a tokenizer output tensor
    :type max_rows_tensor: int
    :param learning_rate: learning rate
    :type learning_rate: float
    :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is longer than max_seq_len it will be truncated to max_seq_len.
    :type max_seq_len: int
    :param batch_size: batch size
    :type batch_size: int
    :param epochs: number of training epochs, default is 5
    :type epochs: int

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
    >>> phish_detect.train_model(emails_train, labels_train)
    """
    # Attach the labels to a copy: assigning into the caller's dataframe
    # would leave a stray "label" column on it, which would then be fed to
    # the tokenizer if the same frame is later passed to predict/evaluate.
    emails = emails.copy()
    emails["label"] = labels
    train_emails, validation_emails, train_labels, validation_labels = train_test_split(
        emails, 'label', train_size=0.8, random_state=2)

    # Tokenize training and validation
    train_inputs, train_masks, _ = tokenizer.tokenize_df(
        train_emails,
        self._hashpath,
        max_sequence_length=max_seq_len,
        max_num_sentences=max_num_sentences,
        max_num_chars=max_num_chars,
        max_rows_tensor=max_rows_tensor,
        do_truncate=True,
    )
    validation_inputs, validation_masks, _ = tokenizer.tokenize_df(
        validation_emails,
        self._hashpath,
        max_sequence_length=max_seq_len,
        max_num_sentences=max_num_sentences,
        max_num_chars=max_num_chars,
        max_rows_tensor=max_rows_tensor,
        do_truncate=True,
    )

    # convert labels to tensors
    train_labels = torch.tensor(train_labels.to_array())
    validation_labels = torch.tensor(validation_labels.to_array())

    # create dataloaders: training batches are shuffled, validation batches
    # keep their original order
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    self._config_optimizer(learning_rate)
    self._model = self._train(train_dataloader, validation_dataloader, self._model, epochs)
def predict(
    self,
    emails,
    max_num_sentences=1000000,
    max_num_chars=100000000,
    max_rows_tensor=1000000,
    max_seq_len=128,
    batch_size=32,
):
    """
    Predict the class with the trained model

    :param emails: dataframe where each row contains one column holding email text
    :type emails: cudf.DataFrame
    :param max_num_sentences: maximum number of sentences to be encoded by tokenizer in one batch
    :type max_num_sentences: int
    :param max_num_chars: maximum number of characters passed to tokenizer
    :type max_num_chars: int
    :param max_rows_tensor: maximum number of rows in a tokenizer output tensor
    :type max_rows_tensor: int
    :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is longer than max_seq_len it will be truncated to max_seq_len.
    :type max_seq_len: int
    :param batch_size: batch size
    :type batch_size: int
    :return: predictions: index of the highest-scoring class for each email, in input order
    :rtype: cudf.Series

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
    >>> phish_detect.train_model(emails_train, labels_train)
    >>> predictions = phish_detect.predict(new_emails_df)
    """
    predict_inputs, predict_masks, _ = tokenizer.tokenize_df(
        emails,
        self._hashpath,
        max_sequence_length=max_seq_len,
        max_num_sentences=max_num_sentences,
        max_num_chars=max_num_chars,
        max_rows_tensor=max_rows_tensor,
        do_truncate=True,
    )
    predict_inputs = predict_inputs.type(torch.LongTensor)

    # Sequential sampling keeps predictions aligned with the input rows.
    predict_data = TensorDataset(predict_inputs, predict_masks)
    predict_sampler = SequentialSampler(predict_data)
    predict_dataloader = DataLoader(predict_data, sampler=predict_sampler, batch_size=batch_size)

    self._model.eval()

    results = []
    for batch in predict_dataloader:
        b_input_ids, b_input_mask = (t.to(self._device) for t in batch)
        with torch.no_grad():
            # First element of the model output holds the logits.
            logits = self._model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
            )[0]
        results.append(logits.detach().cpu().numpy())

    # Stack the per-batch logit arrays into one (rows, classes) array and
    # take the highest-scoring class per row; this avoids materialising a
    # Python list with one array object per email.
    preds = np.concatenate(results, axis=0).argmax(axis=1)
    return cudf.Series(preds.tolist())