def __init__(self, model, encoder, class_names, tokenize_on_space,
             use_unk=False, unk='<unk>', exp_th=0.95, seed=1):
    """
    Store the model/encoder and build an ``AnchorText`` explainer.

    :param model: stored unchanged as ``self.model``
    :param encoder: stored unchanged as ``self.encoder``
    :param class_names: passed through to ``anchor_text.AnchorText``
    :param tokenize_on_space: if truthy, the spaCy pipeline gets a bare
        ``Tokenizer(vocab)``; otherwise the pipeline's default tokenizer
    :param use_unk: If False, replaces words by similar words instead of UNKs
    :param unk: the symbol to use for unknown words
    :param exp_th: stored as ``self.threshold``
    :param seed: seed handed to ``np.random.seed`` (global NumPy RNG)
    """
    self.model, self.encoder = model, encoder
    self.use_unk, self.unk = use_unk, unk
    self.threshold = exp_th

    # need to install this spacy module separately to enable word similarity
    pipeline = spacy.load("en_core_web_lg")
    pipeline.tokenizer = (
        Tokenizer(pipeline.vocab)
        if tokenize_on_space
        else pipeline.Defaults.create_tokenizer(pipeline)
    )
    self.nlp = pipeline

    # Seed before the explainer is constructed, as in the original order.
    np.random.seed(seed)
    self.explainer = anchor_text.AnchorText(
        pipeline,
        class_names,
        use_unk_distribution=use_unk,
        mask_string=unk,
    )
def main():
    """Fine-tune BERT for a given task with given parameters.

    Parses the command line, validates the arguments, and then runs every
    requested mode in this order: train&eval restarts (``--do_train_eval``),
    training (``--do_train``, optionally ``--do_save``), evaluation
    (``--do_eval``), cross-validation (``--do_cross_val``) and the
    LIME/Anchor visualization (``--do_visualization``).

    Raises:
        ValueError: if no mode flag is set, the task name is unknown,
            ``gradient_accumulation_steps`` is < 1, or the output directory
            is non-empty and ``--overwrite_output_dir`` was not given.
    """
    # Define all parameters, using argparse/Command Line Interface.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    def add_args():
        """Add all possible options and defaults to the parser."""
        # Hyperparameters of BERT
        # Parameters often changed
        parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                            help="Bert pre-trained model selected in the list: bert-base-uncased, "
                                 "bert-large-uncased, bert-base-cased, bert-large-cased, "
                                 "bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese.")
        parser.add_argument("--max_seq_length", default=128, type=int,
                            help="The maximum total input sequence length after WordPiece tokenization. \n"
                                 "Sequences longer than this will be truncated, and sequences shorter \n"
                                 "than this will be padded.")
        parser.add_argument("--train_batch_size", default=16, type=int,
                            help="Total batch size for training.")
        parser.add_argument("--learning_rate", default=2e-5, type=float,
                            help="The initial learning rate for Adam.")
        parser.add_argument("--num_train_epochs", default=3.0, type=float,
                            help="Total number of training epochs to perform.")
        parser.add_argument("--do_lower_case", action='store_true',
                            help="Set this flag if you are using an uncased model.")
        # Parameters usually unchanged
        parser.add_argument("--warmup_proportion", default=0.1, type=float,
                            help="Proportion of training to perform linear learning rate warmup for. "
                                 "E.g., 0.1 = 10%% of training.")
        parser.add_argument("--eval_batch_size", default=8, type=int,
                            help="Total batch size for eval.")
        # Parameters of the task
        parser.add_argument("--task_name", default="node", type=str,
                            help="The name of the task to train. One of node, political-as, "
                                 "political-ru, political-asu, agreement, node-ext, political-as-topics,"
                                 "political-ru-topics, political-asu-topics, agreement-topics")
        parser.add_argument("--input_to_use", type=str, default="both",
                            help="Which input to use. One of both, org, response, response-org.")
        # Parameters for reproduction
        parser.add_argument('--seed', type=int, default=42,
                            help="random seed for initialization")
        parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                            help="Number of updates steps to accumulate before performing a backward/update pass.")
        # Parameters for where to save/load data
        parser.add_argument("--data_dir", default="../data", type=str,
                            help="The input data dir. Should contain the .tsv file (or other data files) for the task.")
        parser.add_argument("--output_dir", default="run", type=str,
                            help="The output directory where the model predictions and checkpoints will be written.")
        parser.add_argument("--cache_dir", default="", type=str,
                            help="Where do you want to store the pre-trained models downloaded from s3")
        parser.add_argument('--overwrite_output_dir', action='store_true',
                            help="Overwrite the content of the output directory")
        # Parameters to decide what to do (train, test, crossval, save the model)
        parser.add_argument("--do_train", action='store_true',
                            help="Whether to run training.")
        parser.add_argument("--do_eval", action='store_true',
                            help="Whether to run eval on the dev set.")
        parser.add_argument("--do_train_eval", action='store_true',
                            help="Whether to run training and eval.")
        parser.add_argument('--n_times', type=int, default=10,
                            help="Number of restarts for every parameter setting in train&eval mode")
        parser.add_argument("--do_cross_val", action='store_true',
                            help="Whether to run cross-validation.")
        parser.add_argument("--do_save", action='store_true',
                            help="Whether to save the resulting model.")
        parser.add_argument("--do_visualization", action='store_true',
                            help="Whether to run visualization.")
        # Additional parameters
        parser.add_argument("--no_cuda", action='store_true',
                            help="Whether not to use CUDA when available")
        parser.add_argument('--log_level', type=str, default="info",
                            help="Verbosity of logging output. One of info or warn.")

    # Add all parameters to the parser and parse them.
    add_args()
    args = parser.parse_args()

    # Set up the device given the CLI arguments.
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device = device
    task_name = args.task_name.lower()

    # Prepare the logging.
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.log_level == "info" else logging.WARN)
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    # Check the arguments and fail if the arguments are invalid.
    # The task name is validated BEFORE it is used to index `processors`;
    # previously the lookup came first, so an unknown task raised KeyError
    # and this ValueError could never be reached.
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    if not args.do_train and not args.do_eval and not args.do_cross_val and not args.do_visualization \
            and not args.do_train_eval:
        raise ValueError("At least one of `do_train`, `do_eval` `do_cross_val` "
                         "or `do_visualization` or 'do_train_eval` must be True.")
    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. "
                         "Use the --overwrite_output_dir option.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Set up the task-dependent state.
    processor = processors[task_name](args.input_to_use)
    label_list = processor.get_labels()
    num_labels = len(label_list)
    global_step = 0  # optimizer steps over ALL training runs (tensorboard x-axis)
    tr_loss = 0      # summed training loss of the most recent training run
    tr_steps = 0     # optimizer steps of the most recent training run
    tb_writer = SummaryWriter()

    # Calculate the train_batch_size if gradient accumulation is used
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    def set_seeds(seed):
        """Seed python, numpy and torch (plus all GPUs if present)."""
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(seed)

    def load_model():
        """Load a fresh pre-trained classification model onto the device."""
        fresh = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
        fresh.to(device)
        return fresh

    # Set all seeds for reproducibility
    set_seeds(args.seed)

    def get_features_examples(mode):
        """Returns the features and examples of train or test mode.

        ``mode`` is one of "train", "dev" or "cross_val"; any other value
        raises ValueError. For "cross_val" six parallel lists are returned.
        """

        def convert(split, modus, exs):
            """Converts the examples or load them from cache."""
            cache_dir = os.path.join(args.data_dir, 'cache')
            cached_features_file = os.path.join(
                cache_dir,
                '{0}_{1}_{2}_{3}_{4}_{5}'.format(
                    modus,
                    list(filter(None, args.bert_model.split('/'))).pop(),
                    str(args.max_seq_length),
                    str(task_name),
                    str(args.input_to_use),
                    split))
            # Try to load the cached features.
            # NOTE(security): pickle executes arbitrary code on load; the
            # cache directory must only contain files written by this script.
            try:
                with open(cached_features_file, "rb") as reader:
                    fs = pickle.load(reader)
            # Creates and cache the features.
            except FileNotFoundError:
                os.makedirs(cache_dir, exist_ok=True)
                fs = convert_examples_to_features(
                    exs, label_list, args.max_seq_length, tokenizer)
                # Log the split's own modus ("train"/"dev"); the outer `mode`
                # would read "cross_val" for every cross-validation split.
                logger.info('Saving {0} features into cached file {1}'.format(modus, cached_features_file))
                with open(cached_features_file, "wb") as writer:
                    pickle.dump(fs, writer)
            return fs

        # Return the features, examples and dataframes depending on the mode.
        if mode == "train":
            train_ex, df = processor.get_train_examples(args.data_dir)
            return convert("X", mode, train_ex), train_ex, df
        elif mode == "dev":
            dev_ex, df = processor.get_dev_examples(args.data_dir)
            return convert("X", mode, dev_ex), dev_ex, df
        elif mode == "cross_val":
            data = processor.get_splits(args.data_dir)
            train_f_list, train_e_list, train_df_list, test_f_list, test_e_list, test_df_list = \
                ([] for _ in range(6))
            for i, (train_ex, train_df, test_ex, test_df) in enumerate(data):
                train_e_list.append(train_ex)
                train_df_list.append(train_df)
                test_e_list.append(test_ex)
                test_df_list.append(test_df)
                # Create features from the examples
                train_f_list.append(convert(i, "train", train_ex))
                test_f_list.append(convert(i, "dev", test_ex))
            return train_f_list, train_e_list, train_df_list, test_f_list, test_e_list, test_df_list
        else:
            raise ValueError("Invalid feature mode.")

    def create_tensor_dataset(exfeatures):
        """Creates a TensorDataset out of the features."""
        all_input_ids = torch.tensor([f.input_ids for f in exfeatures], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in exfeatures], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in exfeatures], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in exfeatures], dtype=torch.long)
        return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    def do_training(train_fs, train_exs):
        """Runs BERT fine-tuning on the current model."""
        # Allows to write to the enclosed counters. `tr_loss` must be nonlocal
        # too: assigning it here without the declaration created a local, so
        # do_eval always saw the outer value 0.
        nonlocal global_step, tr_loss, tr_steps
        tr_loss = 0
        tr_steps = 0

        # Create the batched training data out of the features.
        train_data = create_tensor_dataset(train_fs)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Calculate the number of optimization steps.
        num_train_optimization_steps = \
            len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        # Log some information about the training.
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_exs))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        # Set the model to training mode and train for X epochs.
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            # Iterate over all batches.
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # Get the Logits and calculate the loss.
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
                loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))

                # Scale the loss in gradient accumulation mode.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                # Calculate the gradients.
                loss.backward()
                tr_loss += loss.item()

                # Update the weights every gradient_accumulation_steps steps.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    tr_steps += 1
                    tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', loss.item(), global_step)

    def do_save():
        """Saves the current model, tokenizer and arguments."""
        model_to_save = model.module if hasattr(model, 'module') else model

        # Using the predefined names, we can load using `from_pretrained`.
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        # Save the trained model, configuration and tokenizer
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Save the training arguments together with the trained model.
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)

    def do_eval(eval_features, eval_examples):
        """Do evaluation on the current model.

        Returns ``(result, preds)``: the metrics/settings dict and the
        arg-max class index per example. ``result['loss']`` is the mean
        training loss per optimizer step of the most recent training run,
        or None when no optimizer step has been taken.
        """
        # Log some information.
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)

        # Get the eval data and create a sequential dataloader.
        eval_data = create_tensor_dataset(eval_features)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Set the model to eval mode (disable dropout)
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        # Collect per-batch arrays and concatenate once at the end instead of
        # re-copying the growing array with np.append on every batch.
        logit_chunks = []
        label_chunks = []

        # Iterate over the evaluation data.
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            # Forward pass with deactivated autograd engine.
            with torch.no_grad():
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

            # Calculate eval loss.
            tmp_eval_loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            logit_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(label_ids.detach().cpu().numpy())

        # Calculate the mean loss and get all predictions.
        eval_loss = eval_loss / nb_eval_steps
        loss = tr_loss / tr_steps if tr_steps else None
        preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
        out_label_ids = np.concatenate(label_chunks, axis=0)

        # Compute the metrics for the given task
        result = compute_metrics(task_name, preds, out_label_ids)

        # Save additional information in the result dict.
        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss
        # Save all settings for external evaluation
        result['_task'] = task_name
        result['_input_mode'] = args.input_to_use
        result['_learning_rate'] = args.learning_rate
        result['_bert-model'] = args.bert_model
        result['_batch_size'] = args.train_batch_size
        result['_warmup'] = args.warmup_proportion
        result['_num_epochs'] = args.num_train_epochs
        result['_seq_len'] = args.max_seq_length
        result['_seed'] = args.seed
        result['_gradient_acc'] = args.gradient_accumulation_steps
        return result, preds

    def save_results(result_list, pred_list):
        """Saves the results and the predictions.

        Appends a readable summary to ``eval_results.txt`` in the output
        directory and appends rows to ``eval_results.tsv`` /
        ``eval_preds.csv`` in the PARENT of the output directory.
        """
        # Save the results in a text file.
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results *****")
            for i, result_dict in enumerate(result_list):
                logger.info("Run %i", i)
                writer.write("Run %i\n" % i)
                for key in sorted(result_dict.keys()):
                    # Keys starting with "_" are settings, not metrics.
                    if not key.startswith("_"):
                        logger.info(" %s = %s", key, str(result_dict[key]))
                        writer.write("%s = %s\n" % (key, str(result_dict[key])))

        # Save the results and predictions in csv and tsv files.
        output_csv_file = os.path.join(args.output_dir, "../eval_results.tsv")
        output_preds_file = os.path.join(args.output_dir, "../eval_preds.csv")
        df_res = pd.DataFrame(result_list)
        df_preds = pd.DataFrame(pred_list)
        df_preds['run'] = '{0}_{1}_{2}_{3}'.format(
            args.bert_model, args.num_train_epochs, args.train_batch_size, args.learning_rate)

        # If the files do not exist, create them with headers.
        if not os.path.exists(output_csv_file):
            df_res.to_csv(output_csv_file, encoding='utf-8', sep='\t', index=False)
            df_preds.to_csv(output_preds_file, encoding='utf-8', index=False)
        # If the files already exist, just append to them without headers.
        else:
            df_res.to_csv(output_csv_file, mode='a', encoding='utf-8', sep='\t', index=False, header=False)
            df_preds.to_csv(output_preds_file, mode='a', encoding='utf-8', index=False, header=False)

    # Load the tokenizer and the model.
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = load_model()

    # Train and test .
    if args.do_train_eval:
        # Get the train and test features only once.
        train_features, train_examples, _ = get_features_examples("train")
        test_features, test_examples, _ = get_features_examples("dev")

        # Repeat N times.
        for i in range(args.n_times):
            # Train.
            do_training(train_features, train_examples)
            # Eval.
            result, preds = do_eval(test_features, test_examples)
            # Save the results.
            save_results([result], [preds])

            # Reset and new seeds.
            if i + 1 < args.n_times:
                args.seed += 1
                set_seeds(args.seed)
                # Reset model.
                model = load_model()

    # Training
    if args.do_train:
        # Get the train features.
        features, examples, df = get_features_examples("train")
        # Train.
        do_training(features, examples)
        # Save the model if wanted.
        if args.do_save:
            do_save()

    # Evaluation.
    if args.do_eval:
        # Get the dev features.
        features, examples, df = get_features_examples("dev")
        # Evaluate.
        result, preds = do_eval(features, examples)
        # Save the results.
        save_results([result], [preds])

    # CrossVal.
    if args.do_cross_val:
        # Get the data for all splits
        train_f_l, train_e_l, train_df_l, test_f_l, test_e_l, test_df_l = get_features_examples("cross_val")

        # Iterate over all splits
        for train_features, train_examples, test_features, test_examples in zip(
                train_f_l, train_e_l, test_f_l, test_e_l):
            # Reset model.
            model = load_model()
            # Train.
            do_training(train_features, train_examples)
            # Eval.
            result, preds = do_eval(test_features, test_examples)
            # Save results.
            save_results([result], [preds])

    # Visualization.
    if args.do_visualization:
        # Additional imports needed for the visualizations.
        import spacy
        from skorch import NeuralNetClassifier
        from sklearn.pipeline import make_pipeline
        from run_classifier_dataset_utils import InputExample
        from anchor import anchor_text
        from lime.lime_text import LimeTextExplainer

        # Example sentences.
        raw_text_1 = ("But Mr. Nixon did n't say a word that was ever publicly recorded . Even more incredible , "
                      "he did n't say a word when the Communists took power in Cuba - not 4 miles off their shores , "
                      "but only 90 miles off our shores . Mr. Nixon saw what was happening in Cuba .")
        raw_text_2 = ("Cordoba House is no act of tolerance, but of excess/arrogance. Building this structure on the "
                      "edge of the battlefield created by radical Islamists is not a celebration of "
                      "religious pluralism and mutual tolerance; it is a political statement of shocking arrogance "
                      "and hypocrisy.")
        raw_text_3 = "Are not right no does he alcohol child china play"
        raw_text_list = [raw_text_1, raw_text_2, raw_text_3]

        class BertConverter:
            """Pipeline-Class to convert text to the input format of BERT."""

            def transform(self, X, y=None, **fit_params):
                """Transforms a list of strings to a list of BERT inputs."""
                # The label is a placeholder required by InputExample.
                exs = [InputExample(guid=None, text_a=text, text_b=None, label="attack") for text in X]
                visu_features = convert_examples_to_features(exs, label_list, args.max_seq_length, tokenizer)
                all_input_ids = torch.tensor([f.input_ids for f in visu_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in visu_features], dtype=torch.long)
                all_segment_ids = torch.tensor([f.segment_ids for f in visu_features], dtype=torch.long)
                return [all_input_ids, all_segment_ids, all_input_mask]

            def fit(self, X, y=None, **fit_params):
                """No-op fit so the class can be used inside a pipeline."""
                return self

        class MyBERT(torch.nn.Module):
            """Class to wrap the current BERT model."""

            def __init__(self):
                super(MyBERT, self).__init__()
                self.model = model

            def forward(self, X):
                """Apply a softmax function to the output of the BERT model."""
                return torch.nn.functional.softmax(self.model(*X), dim=1)

        # Creates a NeuralNetClassifier.
        if device == torch.device('cuda'):
            net = NeuralNetClassifier(MyBERT, device='cuda', max_epochs=0, lr=0.0, train_split=None)
        else:
            net = NeuralNetClassifier(MyBERT, max_epochs=0, lr=0.0, train_split=None)

        # Set up the pipeline.
        c = make_pipeline(BertConverter(), net)
        # To initialize the pipeline (does not train, because epochs=0).
        c.fit(raw_text_list, y=torch.zeros(len(raw_text_list), dtype=torch.long))

        # Print the predictions and probabilities for the example texts.
        print(c.predict_proba(raw_text_list))

        # Creates the LimeTextExplainer.
        # bow=True to replace all occurrences of a string at once.
        explainer = LimeTextExplainer(class_names=processor.get_labels(), bow=False, mask_string="[UNK]")

        # Explain the first example in the list and save the result using LIME.
        idx = 0
        exp = explainer.explain_instance(raw_text_list[idx], c.predict_proba)
        print('Document id: %d' % idx)
        print('Probability(support) =', c.predict_proba([raw_text_list[idx]])[0, 1])
        print('True class: %s' % "None")
        print(exp.as_list())
        exp.save_to_file(os.path.join(args.output_dir, "lime.html"))

        # Explain the first example using the ANCHOR explainer and save the result.
        nlp = spacy.load("en_core_web_sm")
        explainer2 = anchor_text.AnchorText(nlp, processor.get_labels(), use_unk_distribution=True)
        exp2 = explainer2.explain_instance(raw_text_list[idx], c.predict, threshold=0.95, use_proba=True)
        # Predict once and reuse the class index. `1 - index` assumes exactly
        # two classes, as in the original code.
        predicted_idx = c.predict([raw_text_list[idx]])[0]
        pred = explainer2.class_names[predicted_idx]
        alternative = explainer2.class_names[1 - predicted_idx]
        print('Anchor: %s' % (' AND '.join(exp2.names())))
        print('Precision: %.2f\n' % exp2.precision())
        print('Examples where anchor applies and model predicts %s:\n' % pred)
        print('\n'.join([x[0] for x in exp2.examples(only_same_prediction=True)]))
        print('Examples where anchor applies and model predicts %s:\n' % alternative)
        print('\n'.join([x[0] for x in exp2.examples(only_different_prediction=True)]))
        exp2.save_to_file(os.path.join(args.output_dir, "anchor.html"))

    # Flush pending tensorboard events before the process exits.
    tb_writer.close()
# In[91]:


# this is the requested function by Anchors!
#@timeit
def predict_text(text):
    """Vectorize ``text`` with the module-level ``vectorizer`` and return ``model.predict`` of it."""
    vectorized = vectorizer.transform(text)
    predictions = model.predict(vectorized)
    return predictions


# In[57]:


# build explanator
explanator = anchor_text.AnchorText(
    nlp,
    ["negative", "positive"],
    use_unk_distribution=False,
)


# In[55]:


# smoke-test the prediction function on a single short review
predict_text(["Good film"])


# In[85]:


# first 30 entries of the training data are the ones to be explained
explain_sample = train[:30]


# In[25]:
# train deepmatcher model = dm.MatchingModel(attr_summarizer='hybrid') model.load_state('da_dm.pth') #model.run_train(trainLab, validationLab, best_save_path='da_dm.pth', epochs=15) # evaluate deepmatcher on test data eval = model.run_eval(testLab) # transform test data to feed it to anchors test_df = pd.read_csv(datadir + '/merged_test.csv') pairs_str_test = pairs_to_string(test_df,'ltable_','rtable_') # create anchors text explainer instance class_names = ["non-matching","matching"] nlp = spacy.load('en_core_web_lg') explainer = anchor_text.AnchorText(nlp, class_names, mask_string='', use_unk_distribution=True, use_bert=False) verbose = False e_values = {0: [''], 1: ['']} threshold = 51 print(f'using {len(pairs_str_test)} test samples') for t_i in pairs_str_test: try: if len(e_values[0]) >= threshold and len(e_values[1]) >= threshold: print('finished!') break # perform prediction on test instance fn_result = predict_fn([t_i]) result_key = fn_result[0] if len(e_values[result_key]) < threshold: pred = explainer.class_names[result_key]
with open(in_file, 'r') as f: file_data = f.read() # In[11]: parser = PlaintextParser.from_file(in_file, Tokenizer(LANGUAGE)) summarizer = TextRankSummarizer(Stemmer(LANGUAGE)) summarizer.stop_words = get_stop_words('slovak') helper = _summarizer.AbstractSummarizer() # In[36]: explanator = anchor_text.AnchorText(nlp, ['negative', 'positive'], use_unk_distribution=True) # In[13]: # define a decorator to log execusion time # inspired by https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d def timeit(method): def timed(*args, **kw): timed.calls += 1 ts = time.time() result = method(*args, **kw) te = time.time() timed.time_taken += (te - ts) * 1000 return result
from anchor import anchor_text
from anecdotes_utils import anecdotes_predict_anchor, anecdotes_labels, get_merged_instance, anecdotes_exp_dir
import spacy
import time

# Index of the anecdote to explain.
instance_idx = 0

nlp = spacy.load('en_core_web_lg')
#nlp = spacy.load('en_core_web_sm')
#FOR BERT but throws error when using
explainer = anchor_text.AnchorText(nlp, anecdotes_labels, use_unk_distribution=False, use_bert=True)

# BERT limits to 512 tokens, anchor implementation does not take this into account
# so anecdotes are truncated before being explained
start_time = time.time()
exp = explainer.explain_instance(get_merged_instance(instance_idx, truncate=True),
                                 anecdotes_predict_anchor,
                                 threshold=0.9)
end_time = time.time()

exp.save_to_file(anecdotes_exp_dir + "anchor0.html")

# Interpolate the elapsed seconds into the message. The original passed the
# format string and the value as two separate print() arguments, so the
# literal "%f" was printed followed by the raw number.
print('done in :%f' % (end_time - start_time))
input_tensor = torch.stack( [tensor_txt_inputs, tensor_attention_masks], dim=1) output = net(input_tensor) #predicted = torch.argmax(output.data, 1) res[i] = output.numpy()[0] i = ++i print(res) return res nlp = spacy.load( '/home/julien/Documents/stage/anchor/datasets/en_core_web_lg-2.2.5/en_core_web_lg/en_core_web_lg-2.2.5' ) #nlp = spacy.load('/udd/jdelauna/Documents/anchor/datasets/fr_core_news_sm-2.2.5/fr_core_news_sm/fr_core_news_sm-2.2.5') explainer = anchor_text.AnchorText(nlp, ['true news', 'fake news'], use_unk_distribution=True) np.random.seed(1) #text = 'This is a good book . I will learn a lot in this book . Maybe one day I will be an expert in such a domain .' #text = "DUBAI, April 19 (Reuters) - Singapore-based Lloyd’s of London insurer, Global Specialty Brokers (GSB), said on Monday.It had suspended flights to Hong Kong from Qatar Airways." #text = "DUBAI , April 19 (Reuters) - Singapore - based Lloyd of London insurer , Global Specialty Brokers (GSB) , said on Monday . China destroys France ." #text = "WARSAW (Reuters) - Three new cases of the new coronavirus have been diagnosed in Poland - one man in a critical condition, and two suspected cases - the Health Ministry said on Friday.In January, a 35-year-old Iraqi man died in Poland after suffering severe respiratory infection, possibly caused by the novel coronavirus, also known as NCoV.Authorities are still trying to determine the extent of any relationship between the man, who was admitted to hospital with respiratory illness and died last month, and other possible victims in the country.As in other parts of the world, some foreign universities and medical schools have cancelled conferences or seminars due to NCoV cases in different countries, as has Poland’s health ministry.There have been no reported cases of the novel coronavirus in Poland.NCoV is a virus from the same family as the SARS virus which killed around 800 people worldwide in 2002 and 2003. 
Scientists believe it may have circulated before the world had developed the ability to detect it through human-to-human transmission." text = "DUBAI, April 19 (Reuters) - Singapore-based Lloyd’s of London insurer, Global Specialty Brokers (GSB), said on Monday it had suspended flights to Hong Kong from Qatar Airways after low demand since a state crackdown on fundraising by activist investors.“Since the implementation of Hong Kong regulations, low demand for our services from Qatar Airways has led us to suspend its operations,” GSB said in a statement.Hong Kong has tightened regulations on shareholder activism, including curbs on companies bringing in external directors, and launched a review of such matters after a wave of activist campaigns last year.The rules also require companies to publish a list of companies that have been conducting a financial or administrative audit for up to three years, citing concerns over the preparation of the financial statements of such companies.GSB offered alternative services to Qatar Airways, such as trading claims, claims management and reinsurance, through Lloyd’s of London in Hong Kong. It did not reveal how many passengers it had earned from services for Qatar Airways.GSB had offered “well over” 20 flights per month to Hong Kong from Doha since 2015, but with limited demand, the insurer said.The Qatar Airways spokesman said its policy is to not comment on media reports." text = text.encode('utf-8') text = str(text) #text = "We are going to extend this new method and prevent China from attacking France ." #text = "This is a good book ." pred = explainer.class_names[predict_antoine(text)] alternative = explainer.class_names[1 - predict_antoine(text)]
def _collect_stanzas(corpora, label_dict, num_classes):
    """Flatten poem corpora into parallel stanza / label / language lists.

    :param corpora: sequence of poem lists; the position of a list in this
        sequence becomes the language id of all of its stanzas
    :param label_dict: maps a label string to its class index
    :param num_classes: number of distinct class indices
    :return: tuple ``(stanzas, labels, lang)`` of equally long lists holding
        the stanza text, its most frequent class index and its language id
    """
    stanzas, labels, lang = [], [], []
    for lang_id, poems in enumerate(corpora):
        for poem in poems:
            # The first element of a poem is skipped (presumably metadata
            # such as a title -- confirm against readPoems).
            for stanza in poem[1:]:
                if not stanza:
                    # An empty stanza would add a label and a language id
                    # without a text and misalign every later entry.
                    continue
                text_parts = []
                votes = [0] * num_classes
                for line in stanza:
                    text_parts.append(line[0])
                    # Columns 1 and (optionally) 2 carry the labels; several
                    # labels in one column are joined by " --- ".
                    for field in line[1:3]:
                        for label in field.split(" --- "):
                            votes[label_dict[label]] += 1
                stanzas.append(" ".join(text_parts))
                # np.argmax resolves ties in favour of the lowest class index.
                labels.append(np.argmax(votes))
                lang.append(lang_id)
    return stanzas, labels, lang


def _build_classifier(hidden_units, learning_rate, input_dim, num_classes):
    """Build and compile the one-hidden-layer softmax classifier.

    :param hidden_units: width of the hidden ReLU layer
    :param learning_rate: learning rate handed to the Adam optimizer
    :param input_dim: dimensionality of the sentence embeddings
    :param num_classes: number of output classes
    :return: the compiled Keras model
    """
    classifier = Sequential()
    classifier.add(
        Dense(hidden_units,
              input_dim=input_dim,
              kernel_initializer="uniform",
              activation="relu"))
    classifier.add(
        Dense(num_classes, activation="softmax",
              kernel_initializer="uniform"))
    classifier.compile(loss="categorical_crossentropy",
                       optimizer=Adam(learning_rate=learning_rate),
                       metrics=["categorical_accuracy"])
    return classifier


def _plot_language_statistics(labels, lang, bar_labels):
    """Show a stacked bar chart of stanza counts per class and language.

    :param labels: class index of every stanza
    :param lang: language id of every stanza (0 English, 1 German, 2 Chinese)
    :param bar_labels: one axis label per class index
    """
    df = pd.DataFrame({"labels": labels, "languages": lang})
    class_ids = list(range(len(bar_labels)))

    def counts_for(lang_id):
        counts = df.loc[df["languages"] == lang_id, "labels"].value_counts()
        # Align on the full class range so that classes missing in one
        # language count as 0 and all three series have the same length.
        return counts.reindex(class_ids, fill_value=0).values

    ger_values = counts_for(1)
    en_values = counts_for(0)
    ch_values = counts_for(2)

    width = 0.5
    _, ax = plt.subplots()
    plt.grid(zorder=0, alpha=0.7)
    ax.bar(bar_labels, ger_values, width, label='German')
    ax.bar(bar_labels, en_values, width, bottom=ger_values, label='English')
    ax.bar(bar_labels,
           ch_values,
           width,
           bottom=en_values + ger_values,
           label='Chinese')
    ax.set_ylabel('Number of stanzas', fontsize=18)
    ax.legend(prop={'size': 18})
    ax.tick_params(axis='both', which='major', labelsize=18)
    plt.xticks(rotation=16)
    plt.show()


def _print_lime_explanation(exp):
    """Print the word weights of the two top labels of a LIME explanation.

    :param exp: explanation returned by ``LimeTextExplainer.explain_instance``
    :return: the labels available in the explanation
    """
    top_labs = exp.available_labels()
    for lab in top_labs[:2]:
        print("Explanation for class {}".format(lab))
        print('\n'.join(map(str, exp.as_list(label=lab))))
    return top_labs


def main(TRAIN=False,
         TUNING=False,
         ANCHOR=False,
         LIME=True,
         STATISTICS=False,
         PROTODASH=False):
    """Classify poem stanzas by emotion and explain the classifier.

    Stanzas of the English, German and Chinese corpora are embedded with a
    multilingual sentence encoder, split 75 % / 12.5 % / 12.5 % into
    train / dev / test and fed to a small feed-forward classifier.

    :param TRAIN: train the final classifier and evaluate it on the test set
    :param TUNING: grid-search learning rate, epochs and hidden units on the
        dev set; the best configuration found is then used for training
    :param ANCHOR: explain the example stanzas with Anchor
    :param LIME: explain the example stanzas with LIME
    :param STATISTICS: plot the class distribution per language
    :param PROTODASH: list ProtoDash prototypes for the example stanzas
    """
    # read poems using simplereader
    poems_english = readPoems('tsv/english.tsv')
    poems_german = readPoems('tsv/emotion.german.tsv')
    poems_chinese = readPoems('tsv/chinese.tsv')
    print(len(poems_english))
    print(len(poems_german))
    print(len(poems_chinese))

    # Both spellings of the compound labels map to the same class index.
    label_dict = {
        'Sadness': 0,
        'Humor': 1,
        'Suspense': 2,
        'Nostalgia': 3,
        'Uneasiness': 4,
        'Annoyance': 5,
        'Awe / Sublime': 6,
        'Awe/Sublime': 6,
        'Vitality': 7,
        'Beauty / Joy': 8,
        'Beauty/Joy': 8
    }
    class_names = [
        'Sadness', 'Humor', 'Suspense', 'Nostalgia', 'Uneasiness',
        'Annoyance', 'Awe / Sublime', 'Vitality', 'Beauty / Joy'
    ]
    num_classes = len(class_names)
    language_names = ("English", "German", "Chinese")

    # Language ids follow the corpus order: 0 English, 1 German, 2 Chinese.
    stanzas, stanza_labels, lang = _collect_stanzas(
        (poems_english, poems_german, poems_chinese), label_dict,
        num_classes)

    # plot dataset statistics
    if STATISTICS:
        # One axis label per class index (label_dict has duplicate spellings).
        bar_labels = [None] * num_classes
        for name, class_id in label_dict.items():
            bar_labels[class_id] = name.replace(" ", "")
        _plot_language_statistics(stanza_labels, lang, bar_labels)

    # Fix the one-hot width so it always matches the classifier's output
    # layer, even if the highest class index never occurs in the data.
    one_hot_labels = to_categorical(stanza_labels, num_classes=num_classes)

    # analyze distribution of labels in dataset
    print(pd.DataFrame({"labels": stanza_labels})['labels'].value_counts())

    # use pretrained multilingual model to encode sentences
    model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
    encoded = model.encode(stanzas)

    # Shuffle (embedding, one-hot label, original stanza index) triples
    # together so that the original index of every sample stays known.
    all_data = [(encoded[i], one_hot_labels[i], i)
                for i in range(len(encoded))]
    random.shuffle(all_data)
    embeddings = [emb for emb, _, _ in all_data]
    shuffled_labels = [lab for _, lab, _ in all_data]
    indices = [orig_idx for _, _, orig_idx in all_data]

    train_end = int(0.75 * len(embeddings))
    dev_end = int(0.875 * len(embeddings))
    train_data = np.array(embeddings[:train_end])
    train_labels = np.array(shuffled_labels[:train_end])
    dev_data = np.array(embeddings[train_end:dev_end])
    dev_labels = np.array(shuffled_labels[train_end:dev_end])
    test_data = np.array(embeddings[dev_end:])
    test_labels = np.array(shuffled_labels[dev_end:])
    test_indices = indices[dev_end:]
    embedding_dim = train_data.shape[1]

    # (learning rate, epochs, hidden units) used when no tuning is run.
    best_config = (0.01, 7, 150)

    # Hyperparameter Tuning
    if TUNING:
        learning_rates = [0.001, 0.01, 0.1]
        epochs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        middle_nodes = [20, 50, 100, 150, 200]
        best_acc = 0
        tuned_config = None
        for lr, epoch, middle_node in itertools.product(
                learning_rates, epochs, middle_nodes):
            print("Training with following hyperparameters:", lr, epoch,
                  middle_node)
            candidate = _build_classifier(middle_node, lr, embedding_dim,
                                          num_classes)
            candidate.fit(train_data, train_labels, epochs=epoch, verbose=1)
            print("evaluating on dev set...")
            (loss, accuracy) = candidate.evaluate(dev_data,
                                                  dev_labels,
                                                  verbose=1)
            print("loss: {:.4f}, accuracy: {:.4f}%".format(
                loss, accuracy * 100))
            if accuracy > best_acc:
                best_acc = accuracy
                tuned_config = (lr, epoch, middle_node)
        print(tuned_config)
        if tuned_config is not None:
            # Use the tuned result instead of discarding it.
            best_config = tuned_config

    # NOTE(review): without TRAIN this stays an empty, uncompiled model and
    # the evaluation below cannot succeed; presumably a previously saved
    # 'emotion_classifier' model was meant to be loaded here -- confirm.
    mdl = Sequential()
    if TRAIN:
        # use final model
        mdl = _build_classifier(best_config[2], best_config[0],
                                embedding_dim, num_classes)
        mdl.fit(train_data, train_labels, epochs=best_config[1], verbose=1)
        print("evaluating on test set...")
        (loss, accuracy) = mdl.evaluate(test_data, test_labels, verbose=1)
        print("loss={:.4f}, accuracy: {:.4f}%".format(loss, accuracy * 100))

    mdl.evaluate(test_data, test_labels, verbose=1)
    y_pred = mdl.predict(test_data, batch_size=test_data.shape[0])

    # Original stanza indices of the misclassified test samples.
    wrong_classified_idx = [
        orig_idx for j, orig_idx in enumerate(test_indices)
        if np.argmax(y_pred[j]) != np.argmax(test_labels[j])
    ]
    print("These stanzas were wronlgy classified:")
    print(wrong_classified_idx)

    # Group by the language recorded for each stanza rather than by
    # hard-coded index boundaries between the corpora.
    for lang_id, lang_name in enumerate(language_names):
        n_wrong = sum(1 for i in wrong_classified_idx if lang[i] == lang_id)
        print(
            "Number of wrongly classified stanzas - {}: ".format(lang_name),
            n_wrong)
    for lang_id, lang_name in enumerate(language_names):
        n_total = sum(1 for i in test_indices if lang[i] == lang_id)
        print("Total - {}: ".format(lang_name), n_total)

    # Original stanza indices that the explainers below are run on.
    examples = [592, 9, 5]

    def pipeline(texts):
        """Return class probabilities for a batch of raw texts."""
        embedded = model.encode(texts)
        return mdl.predict(embedded, batch_size=embedded.shape[0])

    def predict_proba_single(text):
        """Return the (1, num_classes) probabilities for one raw text."""
        emb = np.asarray(model.encode(text)).reshape(1, -1)
        return mdl.predict(emb, batch_size=1)

    # --------------------------------LIME--------------------------------
    if LIME:
        explainer = LimeTextExplainer(class_names=class_names)

        # apply LIME to the example stanzas
        for idx in examples:
            print("True Label: ", one_hot_labels[idx])
            print("Predicted Probabilities: ",
                  predict_proba_single(stanzas[idx]))
            exp = explainer.explain_instance(stanzas[idx],
                                             pipeline,
                                             num_features=6,
                                             top_labels=2)
            top_labs = _print_lime_explanation(exp)
            exp.as_pyplot_figure(top_labs[0])
            plt.show()
            exp.as_pyplot_figure(top_labs[1])
            plt.show()

        # apply LIME to one more stanza and store its explanation
        idx = 5
        print("True Label: ", one_hot_labels[idx])
        probs = predict_proba_single(stanzas[idx])
        print("Predicted Probabilities: ", probs)
        print(probs.sum())
        exp = explainer.explain_instance(stanzas[idx],
                                         pipeline,
                                         num_features=6,
                                         top_labels=2)
        with open("explanation.pkl", "wb") as handle:
            pickle.dump(exp, handle)
        top_labs = _print_lime_explanation(exp)
        # NOTE(review): the font size of 600 for the first figure is kept
        # from the previous version; it looks like an experiment -- confirm.
        # The former plt.set_yticklabels(x, ...) call was dropped: pyplot has
        # no such function and x was undefined; tick_params already sets the
        # tick label size.
        for lab, font_size in zip(top_labs[:2], (600, 20)):
            exp.as_pyplot_figure(lab)
            plt.legend(prop={'size': font_size})
            plt.tick_params(axis='both', which='major', labelsize=font_size)
            plt.show()

    # -------------------------------ANCHOR-------------------------------
    def predict_label(texts):
        """Return the predicted class index of every text in the batch.

        Anchor calls the classifier with many perturbed texts at once and
        needs one prediction per text.
        """
        embedded = model.encode(texts)
        probs = mdl.predict(embedded, batch_size=embedded.shape[0])
        return np.argmax(probs, axis=1)

    def predict_second_label(texts):
        """Return the runner-up class of the first text as a 1-item list."""
        embedded = model.encode(texts)
        probs = np.array(mdl.predict(embedded,
                                     batch_size=embedded.shape[0]))
        probs[0][np.argmax(probs[0])] = 0
        return [np.argmax(probs[0])]

    if ANCHOR:
        print()
        nlp = spacy.load('en_core_web_lg')
        explainer = anchor_text.AnchorText(nlp,
                                           class_names,
                                           use_unk_distribution=True)
        print("GPU's: ", get_available_gpus())
        for idx in examples:
            print()
            print("------------STANZA-", idx, "------------")
            print()
            text = stanzas[idx]
            predicted = predict_label([text])
            print(predicted)
            pred = explainer.class_names[predicted[0]]
            alternative = explainer.class_names[predict_second_label(
                [text])[0]]
            print('Prediction: %s' % pred)
            # Index the unshuffled labels: idx is an original stanza index.
            print("Stanza: ", stanzas[idx], " True Label: ",
                  one_hot_labels[idx])
            exp = explainer.explain_instance(text,
                                             predict_label,
                                             threshold=0.95)
            print('Anchor: %s' % (' AND '.join(exp.names())))
            print('Precision: %.2f' % exp.precision())
            print()
            print('Examples where anchor applies and model predicts %s:' %
                  pred)
            print()
            print('\n'.join(
                [x[0] for x in exp.examples(only_same_prediction=True)]))
            print()
            print('Examples where anchor applies and model predicts %s:' %
                  alternative)
            print()
            print('\n'.join([
                x[0] for x in exp.examples(partial_index=0,
                                           only_different_prediction=True)
            ]))

    # ------------------------------PROTODASH------------------------------
    if PROTODASH:
        from aix360.algorithms.protodash import ProtodashExplainer

        # Position of every original stanza index inside the shuffled data.
        position_of = {orig_idx: k for k, orig_idx in enumerate(indices)}
        explainer = ProtodashExplainer()
        num_prototypes = 5

        for idx in examples:
            print(train_data.shape)
            vector = np.asarray(embeddings[position_of[idx]]).reshape(1, -1)
            (weights, proto_ind, _) = explainer.explain(vector,
                                                        train_data,
                                                        m=num_prototypes)
            weights = np.around(weights / np.sum(weights), 2)
            # proto_ind holds rows of train_data; map them back to the
            # original stanza indices.
            prototype_ids = [indices[j] for j in proto_ind]

            print()
            print("example: ", stanzas[idx])
            print("prototypes with weights:")
            print()
            print()
            for weight, stanza_id in zip(weights, prototype_ids):
                print(weight, stanzas[stanza_id])

            for stanza_id in [idx] + prototype_ids:
                print()
                print(stanzas[stanza_id])
                print("Predicted Label: ",
                      [np.argmax(predict_proba_single(stanzas[stanza_id]))])
                print("True Label: ", np.argmax(one_hot_labels[stanza_id]))