def predict(self, string: str, get_proba: bool = False, add_neutral: bool = True):
    """
    Classify a string.

    Parameters
    ----------
    string : str
    get_proba : bool, optional (default=False)
        If True, it will return probability of classes.
    add_neutral : bool, optional (default=True)
        If True, it will add neutral probability.

    Returns
    -------
    result : str, or dictionary of class probabilities if `get_proba` is True
    """
    if add_neutral:
        label = self._label + ['neutral']
    else:
        label = self._label

    vectors = self._vectorize.transform([self._cleaning(string)])
    result = self._multinomial.predict_proba(vectors)
    if add_neutral:
        result = neutral(result)
    result = result[0]

    if get_proba:
        return {label[i]: result[i] for i in range(len(result))}
    else:
        return label[np.argmax(result)]
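# A minimal usage sketch for `predict`, assuming `model` is a loaded instance
# of this class; the labels ('negative', 'positive') and the probabilities
# shown are hypothetical:
#
#   model.predict('not a good movie')
#   # -> 'negative'
#   model.predict('not a good movie', get_proba=True)
#   # -> {'negative': 0.7, 'positive': 0.1, 'neutral': 0.2}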
def _predict(self, strings, add_neutral=False):
    results = self._classify(strings)

    if add_neutral:
        # Reassign so the neutral-augmented probabilities are actually used.
        results = neutral(results)
        label = self._label + ['neutral']
    else:
        label = self._label

    return [label[result] for result in np.argmax(results, axis=1)]
def _predict_proba(self, strings, add_neutral=False):
    results = self._classify(strings)

    if add_neutral:
        results = neutral(results)
        label = self._label + ['neutral']
    else:
        label = self._label

    outputs = []
    for result in results:
        outputs.append({label[i]: result[i] for i in range(len(result))})
    return outputs
def _predict(self, strings, add_neutral):
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings
    )
    result = self._sess.run(
        self._softmax,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
    if add_neutral:
        result = neutral(result)
    return result
def predict_batch(
    self,
    strings: List[str],
    get_proba: bool = False,
    add_neutral: bool = True,
):
    """
    Classify a list of strings.

    Parameters
    ----------
    strings : List[str]
    get_proba : bool, optional (default=False)
        If True, it will return probability of classes.
    add_neutral : bool, optional (default=True)
        If True, it will add neutral probability.

    Returns
    -------
    result : List[str], or list of dictionaries of class probabilities if `get_proba` is True
    """
    if add_neutral:
        label = self._label + ['neutral']
    else:
        label = self._label

    strings = [self._cleaning(string) for string in strings]
    vectors = self._vectorize.transform(strings)
    results = self._multinomial.predict_proba(vectors)

    if add_neutral:
        results = neutral(results)

    if get_proba:
        outputs = []
        for result in results:
            outputs.append({label[i]: result[i] for i in range(len(result))})
        return outputs
    else:
        return [label[result] for result in np.argmax(results, axis=1)]
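# A minimal usage sketch for `predict_batch`, under the same assumptions as
# the `predict` example above; outputs are illustrative:
#
#   model.predict_batch(['great service', 'terrible food'])
#   # -> ['positive', 'negative']
#   model.predict_batch(['great service'], get_proba=True, add_neutral=False)
#   # -> [{'negative': 0.1, 'positive': 0.9}]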
def _predict_words(self, string, method, visualization, add_neutral=False):
    method = method.lower()
    if method not in ['last', 'first', 'mean']:
        raise ValueError(
            "method not supported, only support 'last', 'first' and 'mean'"
        )
    if add_neutral:
        label = self._label + ['neutral']
    else:
        label = self._label

    batch_x, batch_mask, _, s_tokens = bert_tokenization(
        self._tokenizer, [string]
    )
    result, attentions, words = self._sess.run(
        [self._softmax, self._attns, self._softmax_seq],
        feed_dict={self._X: batch_x, self._input_masks: batch_mask},
    )

    # Pool the attention weights at the first query position ([CLS]).
    if method == 'first':
        cls_attn = list(attentions[0].values())[0][:, :, 0, :]
    if method == 'last':
        cls_attn = list(attentions[-1].values())[0][:, :, 0, :]
    if method == 'mean':
        combined_attentions = []
        for a in attentions:
            combined_attentions.append(list(a.values())[0])
        cls_attn = np.mean(combined_attentions, axis=0).mean(axis=2)

    # Average over attention heads, then normalize so the weights sum to 1.
    cls_attn = np.mean(cls_attn, axis=1)
    total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
    attn = cls_attn / total_weights
    words = words[0]

    if add_neutral:
        result = neutral(result)
        words = neutral(words)

    result = result[0]
    weights = []
    merged = merge_sentencepiece_tokens(list(zip(s_tokens[0], attn[0])))
    for i in range(words.shape[1]):
        m = merge_sentencepiece_tokens(
            list(zip(s_tokens[0], words[:, i])), weighted=False
        )
        _, weight = zip(*m)
        weights.append(weight)
    w, a = zip(*merged)
    words = np.array(weights).T
    distribution_words = words[:, np.argmax(words.sum(axis=0))]
    y_histogram, x_histogram = np.histogram(
        distribution_words, bins=np.arange(0, 1, 0.05)
    )
    y_histogram = y_histogram / y_histogram.sum()
    x_attention = np.arange(len(w))
    left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
    left = left.tolist()
    y_barplot = []
    for i in range(len(label)):
        if i not in left:
            # Class i never won the word-level argmax, so its count is zero.
            y_barplot.append(0)
        else:
            y_barplot.append(right[left.index(i)])

    dict_result = {label[i]: result[i] for i in range(len(result))}
    dict_result['alphas'] = {word: a[no] for no, word in enumerate(w)}
    dict_result['word'] = {word: words[no] for no, word in enumerate(w)}
    dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
    dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
    dict_result['barplot'] = {'x': label, 'y': y_barplot}
    dict_result['class_name'] = self._class_name

    if visualization:
        render_dict[self._class_name](dict_result)
    else:
        return dict_result
def _predict_words(self, string, method, visualization, add_neutral=False):
    method = method.lower()
    if method not in ['last', 'first', 'mean']:
        raise ValueError(
            "method not supported, only support 'last', 'first' and 'mean'"
        )
    if add_neutral:
        label = self._label + ['neutral']
    else:
        label = self._label

    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string]
    )
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits', 'attention', 'logits_seq'],
    )
    result = softmax(r['logits'], axis=-1)
    words = softmax(r['logits_seq'], axis=-1)
    attentions = r['attention']

    # Pool the attention weights at the first query position.
    if method == 'first':
        cls_attn = attentions[0][:, :, 0, :]
    if method == 'last':
        cls_attn = attentions[-1][:, :, 0, :]
    if method == 'mean':
        cls_attn = np.mean(attentions, axis=0).mean(axis=2)

    # Average over attention heads, then normalize so the weights sum to 1.
    cls_attn = np.mean(cls_attn, axis=1)
    total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
    attn = cls_attn / total_weights
    words = words[0]

    if add_neutral:
        result = neutral(result)
        words = neutral(words)

    result = result[0]
    weights = []
    merged = merge_sentencepiece_tokens(
        list(zip(s_tokens[0], attn[0])), model='xlnet'
    )
    for i in range(words.shape[1]):
        m = merge_sentencepiece_tokens(
            list(zip(s_tokens[0], words[:, i])),
            weighted=False,
            model='xlnet',
        )
        _, weight = zip(*m)
        weights.append(weight)
    w, a = zip(*merged)
    words = np.array(weights).T
    distribution_words = words[:, np.argmax(words.sum(axis=0))]
    y_histogram, x_histogram = np.histogram(
        distribution_words, bins=np.arange(0, 1, 0.05)
    )
    y_histogram = y_histogram / y_histogram.sum()
    x_attention = np.arange(len(w))
    left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
    left = left.tolist()
    y_barplot = []
    for i in range(len(label)):
        if i not in left:
            # Class i never won the word-level argmax, so its count is zero.
            y_barplot.append(0)
        else:
            y_barplot.append(right[left.index(i)])

    dict_result = {label[i]: result[i] for i in range(len(result))}
    dict_result['alphas'] = {word: a[no] for no, word in enumerate(w)}
    dict_result['word'] = {word: words[no] for no, word in enumerate(w)}
    dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
    dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
    dict_result['barplot'] = {'x': label, 'y': y_barplot}
    dict_result['class_name'] = self._class_name

    if visualization:
        render_dict[self._class_name](dict_result)
    else:
        return dict_result
def predict_words(self, string: str, method: str = 'last', visualization: bool = True):
    """
    Classify words.

    Parameters
    ----------
    string : str
    method : str, optional (default='last')
        Attention layer supported. Allowed values:

        * ``'last'`` - attention from last layer.
        * ``'first'`` - attention from first layer.
        * ``'mean'`` - average attentions from all layers.
    visualization : bool, optional (default=True)
        If True, it will open the visualization dashboard.

    Returns
    -------
    dictionary: results, only returned if `visualization` is False
    """
    method = method.lower()
    if method not in ['last', 'first', 'mean']:
        raise ValueError(
            "method not supported, only support 'last', 'first' and 'mean'"
        )
    label = self._label + ['neutral']

    batch_x, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string]
    )
    result, attentions, words = self._sess.run(
        [self._softmax, self._attns, self._softmax_seq],
        feed_dict={
            self._X: batch_x,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )

    # Pool the attention weights at the first query position.
    if method == 'first':
        cls_attn = attentions[0][:, :, 0, :]
    if method == 'last':
        cls_attn = attentions[-1][:, :, 0, :]
    if method == 'mean':
        cls_attn = np.mean(attentions, axis=0).mean(axis=2)

    # Average over attention heads, then normalize so the weights sum to 1.
    cls_attn = np.mean(cls_attn, axis=1)
    total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
    attn = cls_attn / total_weights

    result = neutral(result)
    result = result[0]
    words = neutral(words[0])
    weights = []
    merged = merge_sentencepiece_tokens(
        list(zip(s_tokens[0], attn[0])), model='xlnet'
    )
    for i in range(words.shape[1]):
        m = merge_sentencepiece_tokens(
            list(zip(s_tokens[0], words[:, i])),
            weighted=False,
            model='xlnet',
        )
        _, weight = zip(*m)
        weights.append(weight)
    w, a = zip(*merged)
    words = np.array(weights).T
    distribution_words = words[:, np.argmax(words.sum(axis=0))]
    y_histogram, x_histogram = np.histogram(
        distribution_words, bins=np.arange(0, 1, 0.05)
    )
    y_histogram = y_histogram / y_histogram.sum()
    x_attention = np.arange(len(w))
    left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
    left = left.tolist()
    y_barplot = []
    for i in range(len(label)):
        if i not in left:
            # Class i never won the word-level argmax, so its count is zero.
            y_barplot.append(0)
        else:
            y_barplot.append(right[left.index(i)])

    dict_result = {label[i]: result[i] for i in range(len(result))}
    dict_result['alphas'] = {word: a[no] for no, word in enumerate(w)}
    dict_result['word'] = {word: words[no] for no, word in enumerate(w)}
    dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
    dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
    dict_result['barplot'] = {'x': label, 'y': y_barplot}
    dict_result['class_name'] = self._class_name

    if visualization:
        _render_binary(dict_result)
    else:
        return dict_result
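# A minimal usage sketch for `predict_words`, assuming `model` is a loaded
# XLNet classifier exposing this method; with `visualization=False` the raw
# dictionary is returned instead of rendering the dashboard. The keys follow
# the code above; the label names are hypothetical:
#
#   r = model.predict_words('not a good movie', method='mean',
#                           visualization=False)
#   sorted(r.keys())
#   # -> ['alphas', 'attention', 'barplot', 'class_name', 'histogram',
#   #     'negative', 'neutral', 'positive', 'word']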