def get_predictions(model, data_reader, data_path):
    """Run `model` over the dataset at `data_path`, batching to avoid memory issues.

    Returns the gold labels, the predicted labels, and the full label set.
    """
    predictor = TextClassifierPredictor(model=model, dataset_reader=data_reader)
    data = list(data_reader.read(data_path))
    size = len(data)
    bound = 4000  # maximum number of instances per prediction batch
    preds = []
    if size > bound:
        times = size // bound
        print(f"Set is too big; total size: {size}. "
              f"Batching {times} times.")
        for i in range(times):
            print(f"Lower: {bound * i}, Upper: {bound * (i + 1)}")
            preds += predictor.predict_batch_instance(data[bound * i:bound * (i + 1)])
        # Handle the final partial batch, if any.
        if size - bound * times > 0:
            print(f"Lower: {bound * times}, Upper: {size}")
            preds += predictor.predict_batch_instance(data[bound * times:])
    else:
        preds = predictor.predict_batch_instance(data)
    # Map predicted class indices back to label strings via the model's vocabulary.
    labelmap = predictor._model.vocab.get_index_to_token_vocabulary('labels')
    predictions = [labelmap[np.argmax(lst['probs'])] for lst in preds]
    actuals = [str(i['label'].label) for i in data]
    labels = list(labelmap.values())
    return actuals, predictions, labels

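# Minimal usage sketch for the helper above. The model/reader objects and the test
# path are placeholders, not taken from the snippet; the returned lists plug
# directly into scikit-learn's report.
from sklearn.metrics import classification_report

actuals, predictions, labels = get_predictions(model, data_reader, 'data/test.jsonl')
print(classification_report(actuals, predictions, labels=labels))
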
def __init__(self):
    self.root_path = '/home/cym/jwtech_sci_bert'
    self.sentence_predictor = SentenceTaggerPredictor.from_path(
        os.path.join(self.root_path, 'modelsave_ner/model.tar.gz'))
    self.relation_predictor = TextClassifierPredictor.from_path(
        os.path.join(self.root_path, 'modelsave_rel/model.tar.gz'),
        predictor_name='text_classifier')

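# Hedged usage sketch: both predictors expose `predict(sentence=...)` through
# AllenNLP's base Predictor. The owning class name and the example sentence
# below are placeholders, not from the snippet above.
extractor = Extractor()  # hypothetical class containing the __init__ above
sentence = 'SciBERT improves biomedical NER.'
tags = extractor.sentence_predictor.predict(sentence=sentence)['tags']
rel_probs = extractor.relation_predictor.predict(sentence=sentence)['probs']
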
class ClassifierPredictor:
    """Thin wrapper that turns raw strings into class-probability arrays."""

    def __init__(self, model: Classifier) -> None:
        self.model = model
        self.reader = ClassificationReader(skip_start_end=True)
        self.predictor = TextClassifierPredictor(self.model, self.reader)

    def predict(self, sequences: List[str]) -> np.ndarray:
        # Run each sequence through the predictor and stack the probabilities.
        probs = [self.predictor.predict(seq)['probs'] for seq in sequences]
        return np.array(probs)

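# Usage sketch for the wrapper above; `model` is assumed to be a trained Classifier
# and the input strings are illustrative only.
clf = ClassifierPredictor(model)
probs = clf.predict(["what a great movie", "utterly disappointing"])
pred_classes = probs.argmax(axis=-1)  # one class index per input sequence
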
def test_interpret_fails_when_embedding_layer_not_found(self):
    inputs = {"sentence": "It was the ending that I hated"}

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
    model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
    predictor = TextClassifierPredictor(model, TextClassificationJsonReader())

    interpreter = SmoothGradient(predictor)
    with raises(RuntimeError):
        interpreter.saliency_interpret_from_json(inputs)

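# For contrast, a hedged sketch of the passing case: on a model whose embedding
# layer is discoverable, the interpreter returns one normalized gradient score per
# input token, keyed per instance, roughly:
# {"instance_1": {"grad_input_1": [0.12, 0.05, ...]}}
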
def test_interpret_fails_when_embedding_layer_not_found(self):
    inputs = {"sentence": "I always write unit tests for my code."}

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
    model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
    predictor = TextClassifierPredictor(model, TextClassificationJsonReader())

    hotflipper = Hotflip(predictor)
    with raises(RuntimeError):
        hotflipper.initialize()

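# Hedged sketch of Hotflip's happy path (AllenNLP's public API; assumes a real
# model whose token embedding layer can be found, so initialize() succeeds):
# hotflipper.initialize()  # builds the embedding matrix used to search for flips
# attack = hotflipper.attack_from_json(inputs,
#                                      input_field_to_attack="tokens",
#                                      grad_input_field="grad_input_1")
# attack["final"] holds the flipped tokens; attack["outputs"] the new model output.
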
# Simple LSTM
if simple_lstm:
    EMBEDDING_DIM = 128
    HIDDEN_DIM = 128

    reader = StanfordSentimentTreeBankDatasetReader()
    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')
    test_dataset = reader.read('data/stanfordSentimentTreebank/trees/test.txt')

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmClassifier(word_embeddings, lstm, vocab)

    with open("models/simple_LSTM_sentiment_classifier.th", 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = TextClassifierPredictor(model, dataset_reader=reader)
    test_results = predictor.predict_batch_instance(test_dataset)

# ELMo LSTM
if elmo_lstm:
    elmo_embedding_dim = 256
    HIDDEN_DIM = 128

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = StanfordSentimentTreeBankDatasetReader(token_indexers={'tokens': elmo_token_indexer})
    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')
    test_dataset = reader.read('data/stanfordSentimentTreebank/trees/test.txt')

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    options_file = 'data/elmo/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'data/elmo/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
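    # Hedged continuation of the ELMo branch: the snippet stops at the embedder, so
    # the lines below mirror the simple-LSTM setup above; the model file name is a
    # placeholder, not from the original.
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))
    model = LstmClassifier(word_embeddings, lstm, vocab)
    with open("models/elmo_LSTM_sentiment_classifier.th", 'rb') as f:
        model.load_state_dict(torch.load(f))
    predictor = TextClassifierPredictor(model, dataset_reader=reader)
    test_results = predictor.predict_batch_instance(test_dataset)
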
# Training
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_data,
                  validation_dataset=validation_data,
                  patience=2,  # early stopping if validation is stuck for 2 epochs
                  num_epochs=10,  # kept small for a first try; increase later
                  cuda_device=cuda_device)
trainer.train()

# Predictor: TextClassifierPredictor needs the trained model and the dataset
# reader used during training; it cannot be constructed with no arguments.
pre_example = 'Good morning'
predictor = TextClassifierPredictor(model, dataset_reader=reader)
pre = predictor.predict(pre_example)

# Save the model
with open('./tmp/classifier_biattention_model.th', 'wb') as f:
    torch.save(model.state_dict(), f)
# Save the vocabulary
vocab.save_to_files('./tmp/vocabulary')

# Reload the model
# vocab2 = Vocabulary.from_files('./tmp/vocabulary')
# model2 = BiattentiveClassificationNetwork(word_embeddings, encoder, vocab2)
# with open('./tmp/classifier_biattention_model.th', 'rb') as f:
#     model2.load_state_dict(torch.load(f))
# if cuda_device > -1:

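# Hedged sketch finishing the commented reload above (assumes that block is run
# and that `reader`, `pre_example`, and `cuda_device` are still in scope):
# if cuda_device > -1:
#     model2.cuda(cuda_device)
# predictor2 = TextClassifierPredictor(model2, dataset_reader=reader)
# print(predictor2.predict(pre_example)['probs'])  # should match pre['probs']
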
def _get_classifier_from_args(vocab: Vocabulary, path: str):
    with open(path) as file:
        args = json.load(file)
    num_classes = args['num_classes']
    return get_classification_model(vocab, int(num_classes))


if __name__ == '__main__':
    args = parser.parse_args()

    class_reader = ClassificationReader(skip_start_end=True)
    class_vocab = Vocabulary.from_files(Path(args.classifier_path) / 'vocab')
    class_model = _get_classifier_from_args(class_vocab, Path(args.classifier_path) / 'args.json')
    load_weights(class_model, Path(args.classifier_path) / 'best.th')

    predictor = TextClassifierPredictor(class_model, class_reader)
    max_tokens = args.max_tokens or class_vocab.get_vocab_size('tokens')
    attacker = HotFlipFixed(predictor, max_tokens=max_tokens)
    attacker.initialize()

    data = pd.read_csv(args.csv_path)
    sequences = data['sequences'].tolist()[:args.sample]
    labels = data['labels'].tolist()[:args.sample]

    results_path = Path(args.results_path) / datetime.now().strftime('%Y%m%d_%H%M%S')
    results_path.mkdir(exist_ok=True, parents=True)
    path_to_results_file = results_path / 'results.csv'
    dump_metrics(results_path / 'args.json', args.__dict__)

    with open(path_to_results_file, 'w', newline='') as csv_write:
        fieldnames = list(AttackerOutput.__annotations__.keys())
        writer = csv.DictWriter(csv_write, fieldnames=fieldnames)
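        # Hedged continuation: the attack loop itself is not in the snippet.
        # Assuming the custom HotFlipFixed keeps AllenNLP's attack_from_json
        # interface and that its result carries the AttackerOutput fields, the
        # loop might proceed roughly like this:
        writer.writeheader()
        for seq, label in zip(sequences, labels):
            attack = attacker.attack_from_json({"sentence": seq})  # assumed API
            writer.writerow({name: attack.get(name) for name in fieldnames})  # assumed shape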