def do_iteration(uuid, testtype, data):
    """Time LIME explanations over ``data`` and print the measurements.

    Test type ``'A'`` reuses a single ``LimeTextExplainer`` for every
    instance; test type ``'B'`` builds a fresh explainer per instance.
    Results are only printed, nothing is returned.

    Args:
        uuid: Identifier used to tag the printed messages.
        testtype: Either ``'A'`` or ``'B'`` (see above).
        data: Sequence of text instances to explain.

    Raises:
        TypeError: If ``testtype`` is neither ``'A'`` nor ``'B'``.
    """
    # Fail fast: reject an unknown test type before paying for the model load.
    if testtype not in ('A', 'B'):
        raise TypeError("No such test type")

    start = time.time()
    print(f"Iteration@{uuid} started")
    model = eh.load_lstm_model(
        "/home/tomasmizera/school/diploma/src/raw-data/lstm-model-sigmoid")
    print(f"Iteration@{uuid} t1 {time.time() - start}")

    def _predict_proba_fn(_input):
        """
        Function accepting array of instances and returns a probability for each class
        _input - 1d array of instances
        Returns 2d array of [num of instances] x [num of classes] with probabilities
        """
        prediction = model.predict(_input)
        # The model output is used as the first column, its complement as
        # the second one.
        return np.append(prediction, 1 - prediction, axis=1)

    explainer = lime_text.LimeTextExplainer(
        class_names=['Positive', 'Negative'])
    print(f"Iteration@{uuid} t2 {time.time() - start}")

    compstart = time.time()
    maxn = len(data)
    if testtype == 'A':
        # Variant A: one shared explainer for all instances.
        for instance in data:
            explainer.explain_instance(instance,
                                       _predict_proba_fn,
                                       num_features=100)
    else:
        # Variant B: a new explainer is created for every instance.
        for instance in data:
            explainer = lime_text.LimeTextExplainer(
                class_names=['Positive', 'Negative'])
            explainer.explain_instance(instance,
                                       _predict_proba_fn,
                                       num_features=100)
    compend = time.time() - compstart

    # Empty input would otherwise raise ZeroDivisionError in the summary.
    per_instance = compend / maxn if maxn else 0.0
    print(
        f'Iteration@{uuid} computation took {compend} secs, per iteration approx {per_instance}'
    )
def __init__(self,
             modelfn=None,
             classnames=None,
             language="english",
             explainer=None,
             summarizer=None,
             fm=962,
             topfeaturescount=100,
             sentencescount=6,
             logger=None):
    """Store the configuration and set up the collaborating objects.

    Every collaborator passed as ``None`` is replaced by a default:
    a ``lime_text.LimeTextExplainer`` over ``classnames``, a
    ``TextRankSummarizer`` with a stemmer and stop words for ``language``,
    and the root logger from ``logging.getLogger()``.
    """
    # Plain configuration values, kept as given.
    self.modelfn = modelfn
    self.classnames = classnames
    self.language = language
    self.fm = fm
    self.topfeaturescount = topfeaturescount
    self.sentencescount = sentencescount

    # Explainer: build the default only when the caller supplied none.
    if explainer is None:
        self.explainer = lime_text.LimeTextExplainer(
            class_names=self.classnames)
    else:
        self.explainer = explainer

    # Summarizer: stop words are configured only on the default instance,
    # a caller-supplied summarizer is used untouched.
    if summarizer is None:
        self.summarizer = TextRankSummarizer(Stemmer(self.language))
        self.summarizer.stop_words = get_stop_words(self.language)
    else:
        self.summarizer = summarizer

    self.log = logging.getLogger() if logger is None else logger
def lime_lyrics(lyrics, verbose=True):
    '''
    Function that computes output on billboard.html

    Loads the pickled model and count vectorizer from the working
    directory. With ``verbose`` true it returns a LIME explanation of
    ``lyrics``; otherwise it returns the raw ``predict_proba`` output.
    '''
    print(repr(lyrics))
    print('____________________' + lyrics + '____________________')

    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    with open('counter.pkl', 'rb') as counter_file:
        counter = pickle.load(counter_file)

    if not verbose:
        # Probabilities only: vectorize the single document and predict.
        features = counter.transform([lyrics])
        return model.predict_proba(features)

    # Explicit tokenizer handed to LIME; the original author notes this is
    # needed because of a LIME bug that is not fixed yet (sklearn 0.20.0).
    def tokenizer(doc):
        return re.compile(r"(?u)\b\w\w+\b").findall(doc)

    pipeline = make_pipeline(counter, model)
    labels = ['Not on billboard', 'On billboard']
    lime_explainer = lime_text.LimeTextExplainer(class_names=labels,
                                                 split_expression=tokenizer)
    return lime_explainer.explain_instance(lyrics,
                                           pipeline.predict_proba,
                                           num_features=12)
def __init__(self, *argv, **kwargs):
    """
    Initialize lime text explainer object.

    All positional and keyword arguments are forwarded unchanged, first to
    the parent class initializer and then to ``lime_text.LimeTextExplainer``.
    """
    # Run the parent initializer with the caller's arguments.
    super(LimeTextExplainer, self).__init__(*argv, **kwargs)
    # Wrapped LIME explainer, constructed from the very same arguments.
    self.explainer = lime_text.LimeTextExplainer(*argv, **kwargs)
def precompute_explanations(data, cnames, modelpath, vectorizerpath, outpath,
                            label, workerid):
    """Explain every instance in ``data`` and pickle the results.

    Each item of ``data`` is indexed as ``item[0]`` (the text that is
    explained) and ``item[1]`` (carried through to the output unchanged).
    The list of ``(label, item[1], explanation, item[0])`` tuples is written
    to ``<outpath>/<workerid>.pickle``.

    Returns:
        ``workerid``, unchanged.
    """
    explanator = lime_text.LimeTextExplainer(class_names=cnames)
    print(f'Worker @{workerid} started operating')

    model = eh.load_pickle_object(modelpath)
    assert(model is not None)
    vectorizer = eh.load_pickle_object(vectorizerpath)
    assert(vectorizer is not None)

    def _predict_proba_fn(_input):
        """
        Function accepting array of instances and returns a probability for each class
        _input - 1d array of instances
        Returns 2d array of [num of instances] x [num of classes] with probabilities
        """
        features = vectorizer.transform(_input)
        return model.predict_proba(features)

    def _explain(text):
        # One LIME explanation limited to the 100 strongest features.
        return explanator.explain_instance(text,
                                           _predict_proba_fn,
                                           num_features=100)

    out = [(label, item[1], _explain(item[0]), item[0]) for item in data]

    target = os.path.join(outpath, f'{workerid}.pickle')
    with open(target, 'wb') as fout:
        pickle.dump(out, fout)

    print(f"Worker @{workerid} precomputed {len(data)} instances with label {label}")
    return workerid
def explainInput():
    """Explain the submitted text with LIME and return the result as HTML.

    Reads form field ``text2``, lower-cases it, runs a LIME text explanation
    with ``pl.predictFromText`` as classifier and returns LIME's HTML page
    with a "Home" button inserted before the closing body tag.
    """
    # Text from the first form, normalised to lower case.
    submitted_text = request.form['text2'].lower()

    lime_explainer = lt.LimeTextExplainer(
        class_names=["positive", "negative"],
        kernel_width=25,
        feature_selection="lasso_path",
        split_expression=" ",
        bow=False,
        verbose=True)

    # still super hacky implementation in projectlib, yet running
    explanation = lime_explainer.explain_instance(
        text_instance=submitted_text,
        classifier_fn=pl.predictFromText,
        labels=[0, 1],
        num_features=5,
        num_samples=1000)

    page = explanation.as_html(labels=[1],
                               predict_proba=True,
                               show_predicted_value=True)

    # Add a home button at the end of the generated document.
    closing_with_button = "<button type=\"button\" onclick=\"window.location.href=\'/home\';\">Home</button> \n </body></html>"
    return page.replace("</body></html>", closing_with_button)
def lime_lyrics2(lyrics):
    """Return a LIME explanation for ``lyrics`` with newlines removed.

    The pickled classifier (``model.pkl``) and vectorizer (``counter.pkl``)
    are loaded from the working directory and chained into one pipeline
    whose ``predict_proba`` is explained with 12 features.
    """
    flattened = lyrics.replace('\n', '')

    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    with open('counter.pkl', 'rb') as counter_file:
        counter = pickle.load(counter_file)

    pipeline = make_pipeline(counter, model)
    lime_explainer = lime_text.LimeTextExplainer(
        class_names=['Not on billboard', 'On billboard'])
    return lime_explainer.explain_instance(flattened,
                                           pipeline.predict_proba,
                                           num_features=12)
def explain_by_lime_idf(data: List[str], get_idf) -> List[Tuple[str, float]]:
    """Explain a tf-idf style matching score with LIME, one entry at a time.

    Each entry is a whitespace-tokenised string that is expected to contain
    a ``[SEP]`` token. For every entry the function returns LIME's
    ``as_list()`` output sorted by ``get_second`` in descending order.

    Args:
        data: Strings to explain.
        get_idf: Callable returning the idf weight of a stemmed term.
    """
    stemmer = CacheStemmer()

    def split(t):
        return t.split()

    explainer = lime_text.LimeTextExplainer(split_expression=split, bow=True)

    def solve(problem: str):
        """Score one (possibly perturbed) string; 0 when ``[SEP]`` is gone."""
        tokens = split(problem)
        if "[SEP]" not in tokens:
            return 0
        parsed: QueryDoc = parse_problem(tokens)
        query_stems = lmap(stemmer.stem, parsed.query)
        doc_stems = lmap(stemmer.stem, parsed.doc)
        term_freq = Counter(doc_stems)
        wanted = set(query_stems)

        # Sum log-scaled term frequency times idf over the document terms
        # that also occur in the query.
        total = 0
        for term, cnt in term_freq.items():
            if term not in wanted:
                continue
            total += log(1 + cnt) * get_idf(term)
        return total

    def evaluate_score(problems: List[str]):
        # LIME expects one row per sample; the score goes into column 1.
        return np.array([[0, solve(problem)] for problem in problems])

    explains = []
    tick = TimeEstimator(len(data))
    for entry in data:
        assert type(entry) == str
        exp = explainer.explain_instance(entry, evaluate_score, num_features=512)
        ranked = sorted(exp.as_list(), key=get_second, reverse=True)
        explains.append(ranked)
        tick.tick()
    return explains
def precompute_explanations(*, data, cnames, modeltype, modelpath, exp_filter, outpath, workerid, partid):
    """
    Precompute LIME explanations for ``data`` and pickle them.

    data must be map of id:(string, int)
    data ~ list of tuples (instance, label)

    Args:
        data: Iterable of ``(text, label)`` pairs.
        cnames: Class names passed to ``LimeTextExplainer``.
        modeltype: ``"lstm"`` or ``"svm-lime"``.
        modelpath: Location of the model to load.
        exp_filter: Number of features per explanation (must be ``int``).
        outpath: Base output directory.
        workerid: Identifier used in the printed messages.
        partid: Part identifier used in the output file name.

    Raises:
        ValueError: If ``modeltype`` is not one of the supported values.

    The list of ``(text, explanation, label)`` tuples is written to
    ``<outpath>/expf:<exp_filter>/filter:<exp_filter>-part:<partid>.pickle``.
    """
    # Validate cheap inputs first so a bad call fails before any model load.
    if modeltype not in ("lstm", "svm-lime"):
        raise ValueError("Unknown model! " + modeltype)
    assert (type(exp_filter) is int)

    explanator = lime_text.LimeTextExplainer(class_names=cnames)
    print(f"Worker @{workerid} started precomputing {len(data)} instances, ef {exp_filter} part {partid}")

    if modeltype == "lstm":
        # Model loading is serialised across workers through a file lock.
        with FileLock(os.path.join(modelpath, "iolock.lock")):
            model = eh.load_lstm_model(os.path.expanduser(modelpath))
    else:
        model, vectorizer = eh.load_religion_model(modelpath)

    def _predict_proba_fn(_input):
        """
        Function accepting array of instances and returns a probability for each class
        _input - 1d array of instances
        Returns 2d array of [num of instances] x [num of classes] with probabilities
        """
        if modeltype == "svm-lime":
            return model.predict_proba(vectorizer.transform(_input))
        # LSTM path. The original compared against "lstm-imdb", a value the
        # loader above never accepts, so this branch was unreachable and the
        # function returned None for the "lstm" model type.
        prediction = model.predict(_input)
        return np.append(prediction, 1 - prediction, axis=1)

    out = []
    for text, label in data:
        explanation = explanator.explain_instance(text, _predict_proba_fn, num_features=exp_filter)
        out.append((text, explanation, label))

    outdir = os.path.join(outpath, f"expf:{exp_filter}")
    # exist_ok=True already tolerates an existing directory.
    os.makedirs(outdir, exist_ok=True)

    with open(os.path.join(outdir, f'filter:{exp_filter}-part:{partid}.pickle'), 'wb') as fout:
        pickle.dump(out, fout)

    print(f"Worker @{workerid} precomputed {len(data)} instances")
def predict(text):
    """
    Classify ``text`` and build a LIME visualisation for the prediction.

    Input: text
    -------------------------------
    Output:
    - category_str: predicted category
    - scores: highest probability found in score_pred
    - score_pred: probability scores for all classes, as returned by
      model.predict_proba([text])
    - viz: lime generated html for visualization
    """
    category_names = [
        'sport', 'business', 'tech', 'entertainment', 'politics', 'food'
    ]
    # Class index -> class name, in the order listed above.
    category_dict = dict(enumerate(category_names))

    ## Recovering vectorizer and model
    model = joblib.load('./classifier/serving/model/model_v2.joblib')

    ## Predict Category and Probability Score
    batch = [text]
    category_pred = model.predict(batch)
    score_pred = model.predict_proba(batch)
    predicted_names = [category_dict.get(pred) for pred in category_pred]
    category_str = predicted_names[0]
    scores = np.max(score_pred)

    ## Create Lime Explanation HTML
    lime_explainer = lime_text.LimeTextExplainer(class_names=category_names)
    explained = lime_explainer.explain_instance(text,
                                                model.predict_proba,
                                                num_features=10,
                                                top_labels=3)
    viz = explained.as_html(text=False, predict_proba=True)

    return category_str, scores, score_pred, viz
def run(
    self,
    inputs: List[JsonDict],
    model: lit_model.Model,
    dataset: lit_dataset.Dataset,
    model_outputs: Optional[List[JsonDict]] = None,
    config: Optional[JsonDict] = None,
    kernel_width: int = 25,  # TODO(lit-dev): make configurable in UI.
    mask_string: str = '[MASK]',  # TODO(lit-dev): make configurable in UI.
    num_samples: int = 256,  # TODO(lit-dev): make configurable in UI.
) -> Optional[List[JsonDict]]:
    """Run this component, given a model and input(s).

    Returns one dict per input mapping each text field to a salience map,
    or None when the model has no text input or no multi-class output.
    """
    # Only text segments declared in the model's input spec are explained.
    text_keys = utils.find_spec_keys(model.input_spec(), types.TextSegment)
    if not text_keys:
        logging.warning('LIME requires text inputs.')
        return None
    logging.info('Found text fields for LIME attribution: %s', str(text_keys))

    # Locate the output field(s) holding class probabilities.
    pred_keys = utils.find_spec_keys(model.output_spec(),
                                     types.MulticlassPreds)
    if not pred_keys:
        logging.warning(
            'LIME did not find a multi-class predictions field.')
        return None
    pred_key = pred_keys[0]  # TODO(lit-dev): configure which prob field to use.
    pred_spec = cast(types.MulticlassPreds, model.output_spec()[pred_key])

    # bow=False masks words with `mask_string` instead of deleting them.
    explainer = lime_text.LimeTextExplainer(
        class_names=pred_spec.vocab,
        split_expression=str.split,
        kernel_width=kernel_width,
        mask_string=mask_string,
        bow=False)

    def _explain_segment(example, text_key):
        """Explain one text field of `example`, keeping the others constant."""
        input_string = example[text_key]
        logging.info('Explaining: %s', input_string)

        def _predict_proba(strings: List[Text]):
            """Given raw strings, return probabilities. Used by `explainer`."""
            perturbed = [new_example(example, text_key, s) for s in strings]
            outputs = model.predict(perturbed)
            # <float32>[len(strings), num_labels]
            return np.array([output[pred_key] for output in outputs])

        # Perturb the string, query the model, fit the local linear model;
        # the number of words is used as the number of features.
        explanation = explainer.explain_instance(
            input_string,
            _predict_proba,
            num_features=len(input_string.split()),
            num_samples=num_samples)

        # Scores are reported following the original word order.
        scores = explanation_to_array(explanation)
        return dtypes.SalienceMap(input_string.split(), scores)

    # Dict[field name -> interpretations] for every input.
    return [{
        text_key: _explain_segment(input_, text_key)
        for text_key in text_keys
    } for input_ in inputs]
# Reading from index number up to index-1. i.e. value at last index is not copied (e.g. for instance =1, reading happens from 0 to n_samples-1) seg_buffer = audio_buffer[(instance_idx - 1) * n_samples:(instance_idx - 1) * n_samples + n_samples] # save the instance #librosa.output.write_wav('input_audio_instance.wav', seg_buffer, SR) ss_buffer = [] # creating a list of all temporal segmentations (ndarrays) for i in range(0, n_ss): ss_buffer.append(seg_buffer[i * n_samples_ss:n_samples_ss * (i + 1)]) # Using LIME/ Sound-LIME to generate temporal explanations class_names = ['music', 'singing'] explainer = lime_text.LimeTextExplainer(class_names=class_names, verbose=True) exp = explainer.explain_instance(ss_buffer, clf.predict_proba, num_features=3, num_samples=1000, mean=mean, stddev=std) # generating explanations for 'singing voice' class exp_temporal_label_1 = exp.as_list(label=1) print() print('True class: %s' % class_names[int(y_testing[0][instance_idx - 1])]) print('Predicted class:%s' % class_names[class_pred]) print('Prediction confidence: %f' % prob)
def test_explain_matches_original_lime(self, sentence, num_samples,
                                       num_classes, class_to_explain):
    """Tests if Citrus LIME matches the original implementation."""
    # Give every token of the sentence a random weight; the mask token
    # contributes nothing to the prediction.
    weight_rng = np.random.RandomState(seed=0)
    token_weights = {}
    for token in sentence.split():
        token_weights[token] = weight_rng.normal()
    token_weights[lime.DEFAULT_MASK_TOKEN] = 0.

    def _predict_fn(sentences):
        """Mock prediction function."""
        noise_rng = np.random.RandomState(seed=0)
        rows = []
        for text in sentences:
            row = noise_rng.normal(0., 0.1, size=num_classes)
            # Token weights are added to the explained class only, so LIME
            # has positive/negative correlations to recover.
            for token in text.split():
                row[class_to_explain] += token_weights[token]
            rows.append(row)
        return np.stack(rows, axis=0)

    # Explanation from Citrus LIME.
    kernel = functools.partial(lime.exponential_kernel,
                               kernel_width=lime.DEFAULT_KERNEL_WIDTH)
    explanation = lime.explain(sentence,
                               _predict_fn,
                               class_to_explain=class_to_explain,
                               num_samples=num_samples,
                               tokenizer=str.split,
                               mask_token=lime.DEFAULT_MASK_TOKEN,
                               kernel=kernel)
    scores = explanation.feature_importance  # <float32>[seq_len]
    scores = utils.normalize_scores(scores, make_positive=False)

    # Explanation from the original LIME package.
    original_lime_explainer = lime_text.LimeTextExplainer(
        class_names=map(str, np.arange(num_classes)),
        mask_string=lime.DEFAULT_MASK_TOKEN,
        kernel_width=lime.DEFAULT_KERNEL_WIDTH,
        split_expression=str.split,
        bow=False)
    num_features = len(sentence.split())
    original_explanation = original_lime_explainer.explain_instance(
        sentence,
        _predict_fn,
        labels=(class_to_explain,),
        num_features=num_features,
        num_samples=num_samples)

    # local_exp[class_to_explain] is a sequence of (index, score) pairs;
    # scatter it into a <float32>[seq_len] array with one score per position.
    original_scores = np.zeros(num_features)
    for position, weight in original_explanation.local_exp[class_to_explain]:
        original_scores[position] = weight
    original_scores = utils.normalize_scores(original_scores,
                                             make_positive=False)

    # Both implementations must agree within the tolerance.
    np.testing.assert_allclose(scores, original_scores, atol=0.01)
predStorage.append(pred) # convert to dxk ndarray return (np.hstack(predStorage).reshape(-1, 2)) # this works, yields an array with probabilities for both classes #print(predictFromText(textInputList = listTexts)) #print(predictFromText(textInputList=inputText)) # Lime Explainer # bow controls if words are perturbed or overwritten with UNKWORDZ # False makes sense, if location of words is important as in this classifier explainer = lt.LimeTextExplainer(kernel_width=25, verbose=True, class_names=["positive", "negative"], feature_selection="highest_weights", split_expression=" ", bow=False) print("yo") exp = explainer.explain_instance(text_instance=inputText, labels=[0, 1], classifier_fn=predictFromText, num_features=8, num_samples=5000) print(exp) html = exp.as_html(labels=[0, 1], predict_proba=True, show_predicted_value=True)
def explain_by_lime_notag(data, forward_run):
    """Explain ``forward_run`` with LIME over "virtual" tokens.

    Every entry of ``data`` is a triple ``(x0, x1, x2)`` of parallel
    sequences. Each position ``(x0[i], x1[i], x2[i])`` is mapped to an
    integer id so the entry can be handed to LIME as a whitespace-separated
    string; perturbed strings are mapped back to triples before calling
    ``forward_run``.

    Args:
        data: Sequence of ``(x0, x1, x2)`` triples; the length of the first
            entry's ``x0`` is used as the sequence length for all entries.
        forward_run: Callable taking a list of ``(x0, x1, x2)`` triples and
            returning what LIME expects from a classifier function.

    Returns:
        One tuple of explanation scores per entry.
    """
    first_x0, _, _ = data[0]
    len_seq = len(first_x0)

    def split(s):
        return s.split()

    explainer = lime_text.LimeTextExplainer(split_expression=split, bow=False)

    token_map = {}
    # Ids start at 3 so that id 2 stays free for masked tokens (see `reform`).
    token_idx = 3

    def forward_wrap(entry):
        """Encode one entry as a string of virtual token ids."""
        nonlocal token_idx
        x0, x1, x2 = entry
        virtual_tokens = []
        for loc in range(len_seq):
            rt = x0[loc], x1[loc], x2[loc]
            if rt not in token_map:
                token_map[rt] = token_idx
                token_idx = token_idx + 1
            virtual_tokens.append(str(token_map[rt]))
        return " ".join(virtual_tokens)

    print("Virtualizing data")
    v_data = [forward_wrap(e) for e in data]
    rev_token_map = dict_reverse(token_map)

    def virtual_forward_run(vtokens_vector):
        """Decode perturbed virtual-token strings and run the real model."""

        def reform(t):
            # 'UNKWORDZ' is the token LIME substitutes for removed words.
            return 2 if t == 'UNKWORDZ' else int(t)

        new_inputs = []
        for vstr in vtokens_vector:
            x0 = []
            x1 = []
            x2 = []
            # `vtoken` instead of the original `token_idx`, which shadowed
            # the id counter of the enclosing function.
            for vtoken in (reform(t) for t in vstr.split()):
                if vtoken == 2:
                    # Masked position: out-of-vocabulary id, other two
                    # channels repeat the previous position (or defaults).
                    a = OOV_ID
                    b = x1[-1] if x1 else 0
                    # The original tested `x1` here; x1 and x2 always have
                    # equal length, so checking x2 is equivalent and correct.
                    c = x2[-1] if x2 else 1
                else:
                    a, b, c = rev_token_map[vtoken]
                x0.append(a)
                x1.append(b)
                x2.append(c)
            new_inputs.append((x0, x1, x2))
        return forward_run(new_inputs)

    explains = []
    print("running lime")
    tick = TimeEstimator(len(v_data))
    for entry in v_data:
        exp = explainer.explain_instance(entry,
                                         virtual_forward_run,
                                         num_features=len_seq)
        # NOTE(review): upstream lime's explain_instance defaults to
        # labels=(1,), in which case local_exp has no key 0 - confirm the
        # lime version in use. The feature indices are dropped, so scores
        # come in the order LIME lists them, presumably not token order.
        _, scores = zip(*exp.local_exp[0])
        explains.append(scores)
        tick.tick()
    return explains