def _base(self, strings_left, strings_right):
    # tokenize each side of the pair separately
    input_ids_left, input_masks_left, segment_ids_left, _ = xlnet_tokenization(
        self._tokenizer, strings_left)
    input_ids_right, input_masks_right, segment_ids_right, _ = xlnet_tokenization(
        self._tokenizer, strings_right)
    r = self._execute(
        inputs=[
            input_ids_left,
            segment_ids_left,
            input_masks_left,
            input_ids_right,
            input_masks_right,
            segment_ids_right,
        ],
        input_labels=[
            'Placeholder',
            'Placeholder_1',
            'Placeholder_2',
            'Placeholder_3',
            'Placeholder_4',
            'Placeholder_5',
        ],
        output_labels=['logits'],
    )
    return softmax(r['logits'], axis=-1)
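# A minimal numpy sketch of the `softmax` applied to the logits above; the
# module's own `softmax` import is assumed to behave the same way. The helper
# name is illustrative only.
import numpy as np

def _softmax_sketch(logits, axis=-1):
    # shift by the max for numerical stability before exponentiating
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)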
def _vectorize(self, strings, method='first'):
    method = method.lower()
    if method not in ['first', 'last', 'mean', 'word']:
        raise ValueError(
            "method not supported, only support 'first', 'last', 'mean' and 'word'"
        )
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, strings)
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['vectorizer'],
    )
    v = r['vectorizer']
    if method == 'first':
        v = v[:, 0]
    elif method == 'last':
        v = v[:, -1]
    elif method == 'mean':
        v = np.mean(v, axis=1)
    else:
        v = [
            merge_sentencepiece_tokens(
                list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                weighted=False,
                vectorize=True,
                model='xlnet',
            )
            for i in range(len(v))
        ]
    return v
def predict(self, string: str):
    """
    Tag a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : List[Tuple[str, str]]
    """
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string])
    s_tokens = s_tokens[0]
    predicted = self._sess.run(
        self._logits,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )[0]
    t = [self._settings['idx2tag'][d] for d in predicted]
    merged = merge_sentencepiece_tokens_tagging(s_tokens, t, model='xlnet')
    return list(zip(*merged))
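# A simplified sketch of what `merge_sentencepiece_tokens_tagging` is assumed
# to do for XLNet-style sentencepiece output: pieces prefixed with '▁' start a
# new word, and each merged word keeps the tag of its first piece. The real
# helper handles more edge cases; this function and its example are
# illustrative only.
def _merge_tags_sketch(pieces, tags):
    words, word_tags = [], []
    for piece, tag in zip(pieces, tags):
        if piece.startswith('▁') or not words:
            words.append(piece.lstrip('▁'))
            word_tags.append(tag)
        else:
            words[-1] += piece
    return words, word_tags

# e.g. _merge_tags_sketch(['▁ke', 'pada', '▁nya'], ['PRON', 'PRON', 'PRON'])
# -> (['kepada', 'nya'], ['PRON', 'PRON'])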
def vectorize(self, strings):
    """
    Vectorize string inputs.

    Parameters
    ----------
    strings : str / List[str]

    Returns
    -------
    result : np.array
    """
    if isinstance(strings, list):
        if not isinstance(strings[0], str):
            raise ValueError('input must be a list of strings or a string')
    else:
        if not isinstance(strings, str):
            raise ValueError('input must be a list of strings or a string')
    if isinstance(strings, str):
        strings = [strings]
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings)
    return self._sess.run(
        self.logits,
        feed_dict={
            self.X: input_ids,
            self.segment_ids: segment_ids,
            self.input_masks: input_masks,
        },
    )
def _vectorize(self, strings, method='first'):
    method = method.lower()
    if method not in ['first', 'last', 'mean', 'word']:
        raise ValueError(
            "method not supported, only support 'first', 'last', 'mean' and 'word'"
        )
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, strings)
    v = self._sess.run(
        self._vectorizer,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
    if method == 'first':
        v = v[:, 0]
    elif method == 'last':
        v = v[:, -1]
    elif method == 'mean':
        v = np.mean(v, axis=1)
    else:
        v = [
            merge_sentencepiece_tokens(
                list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                weighted=False,
                vectorize=True,
                model='xlnet',
            )
            for i in range(len(v))
        ]
    return v
def vectorize(self, string: str):
    """
    Vectorize a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : np.array
    """
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string])
    s_tokens = s_tokens[0]
    v = self._sess.run(
        self._vectorizer,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )[0]
    return merge_sentencepiece_tokens(
        list(zip(s_tokens, v[:len(s_tokens)])),
        weighted=False,
        vectorize=True,
        model='xlnet',
    )
def vectorize(self, string: str):
    """
    Vectorize a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : np.array
    """
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string], space_after_punct=True)
    s_tokens = s_tokens[0]
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['vectorizer'],
    )
    v = r['vectorizer'][0]
    return merge_sentencepiece_tokens(
        list(zip(s_tokens, v[:len(s_tokens)])),
        weighted=False,
        vectorize=True,
        model='xlnet',
    )
def vectorize(self, strings: List[str]):
    """
    Vectorize string inputs.

    Parameters
    ----------
    strings : List[str]

    Returns
    -------
    result : np.array
    """
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings)
    return self._sess.run(
        self.logits,
        feed_dict={
            self.X: input_ids,
            self.segment_ids: segment_ids,
            self.input_masks: input_masks,
        },
    )
def _tokenize(self, string):
    if self._tok:
        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization_token(
            self._tokenizer, self._tok, [string])
    else:
        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
            self._tokenizer, [string], space_after_punct=True)
    s_tokens = s_tokens[0]
    return input_ids, input_masks, segment_ids, s_tokens
def _classify(self, strings):
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings)
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits'],
    )
    return softmax(r['logits'], axis=-1)
def predict(self, string: str):
    """
    Tag a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : Tuple
    """
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string], space_after_punct=True)
    s_tokens = s_tokens[0]
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits', 'heads_seq'],
    )
    tagging, depend = r['logits'], r['heads_seq']
    tagging = [self._idx2tag[i] for i in tagging[0]]
    depend = depend[0] - self._minus
    # keep the tag and head predictions consistent: head 0 must be 'root',
    # and a 'root' tag must point at head 0
    for i in range(len(depend)):
        if depend[i] == 0 and tagging[i] != 'root':
            tagging[i] = 'root'
        elif depend[i] != 0 and tagging[i] == 'root':
            depend[i] = 0
    tagging = merge_sentencepiece_tokens_tagging(s_tokens, tagging, model='xlnet')
    tagging = list(zip(*tagging))
    indexing = merge_sentencepiece_tokens_tagging(s_tokens, depend, model='xlnet')
    indexing = list(zip(*indexing))
    result, indexing_ = [], []
    for i in range(len(tagging)):
        index = int(indexing[i][1])
        if index > len(tagging):
            index = len(tagging)
        elif (i + 1) == index:
            index = index + 1
        elif index == -1:
            index = i
        indexing_.append((indexing[i][0], index))
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], index, tagging[i][1])
        )
    d = DependencyGraph('\n'.join(result), top_relation_label='root')
    return d, tagging, indexing_
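# The rows assembled above follow a 10-column CoNLL layout: index, word,
# underscores for the unused columns, head index, and relation. A hedged
# sketch with a hypothetical two-word sentence, reusing this module's
# `DependencyGraph`, which is assumed to accept nltk-style CoNLL input:
def _conll_sketch():
    rows = [
        '1\tsaya\t_\t_\t_\t_\t2\tnsubj\t_\t_',
        '2\tmakan\t_\t_\t_\t_\t0\troot\t_\t_',
    ]
    return DependencyGraph('\n'.join(rows), top_relation_label='root')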
def _classify(self, strings):
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings)
    return self._sess.run(
        self._softmax,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
def predict(self, string: str):
    """
    Tag a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : Tuple
    """
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string])
    s_tokens = s_tokens[0]
    tagging, depend = self._sess.run(
        [self._logits, self._heads_seq],
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
    tagging = [self._idx2tag[i] for i in tagging[0]]
    depend = depend[0] - 1
    # keep the tag and head predictions consistent: head 0 must be 'root',
    # and a 'root' tag must point at head 0
    for i in range(len(depend)):
        if depend[i] == 0 and tagging[i] != 'root':
            tagging[i] = 'root'
        elif depend[i] != 0 and tagging[i] == 'root':
            depend[i] = 0
    tagging = merge_sentencepiece_tokens_tagging(s_tokens, tagging, model='xlnet')
    tagging = list(zip(*tagging))
    indexing = merge_sentencepiece_tokens_tagging(s_tokens, depend, model='xlnet')
    indexing = list(zip(*indexing))
    result, indexing_ = [], []
    for i in range(len(tagging)):
        index = int(indexing[i][1])
        if index > len(tagging):
            index = len(tagging)
        indexing_.append((indexing[i][0], index))
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], index, tagging[i][1])
        )
    d = DependencyGraph('\n'.join(result), top_relation_label='root')
    return d, tagging, indexing_
def vectorize(self, strings: List[str], method: str = 'first'):
    """
    Vectorize list of strings.

    Parameters
    ----------
    strings : List[str]
    method : str, optional (default='first')
        Vectorization layer supported. Allowed values:

        * ``'last'`` - vector from the last token.
        * ``'first'`` - vector from the first token.
        * ``'mean'`` - average of vectors from all tokens.
        * ``'word'`` - average vectors based on tokens.

    Returns
    -------
    result : np.array
    """
    method = method.lower()
    if method not in ['first', 'last', 'mean', 'word']:
        raise ValueError(
            "method not supported, only support 'first', 'last', 'mean' and 'word'"
        )
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, strings)
    v = self._sess.run(
        self._vectorizer,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
    if method == 'first':
        v = v[:, 0]
    elif method == 'last':
        v = v[:, -1]
    elif method == 'mean':
        v = np.mean(v, axis=1)
    else:
        v = [
            merge_sentencepiece_tokens(
                list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                weighted=False,
                vectorize=True,
                model='xlnet',
            )
            for i in range(len(v))
        ]
    return v
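# A small sketch of the pooling choices above on a dummy hidden-state batch
# shaped (batch, sequence, hidden); the helper name and sizes are illustrative
# only.
import numpy as np

def _pooling_sketch():
    v = np.random.rand(2, 8, 4)      # 2 strings, 8 tokens, hidden size 4
    first = v[:, 0]                  # 'first': first-token vector
    last = v[:, -1]                  # 'last': last-token vector
    mean = np.mean(v, axis=1)        # 'mean': average over tokens
    return first.shape, last.shape, mean.shape   # all (2, 4)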
def _predict(self, strings):
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings)
    result = self._sess.run(
        self._sigmoid,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
    return result
def _attention(self, strings):
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, strings)
    maxlen = max(len(s) for s in s_tokens)
    s_tokens = padding_sequence(s_tokens, maxlen, pad_int='<cls>')
    attentions = self._sess.run(
        self.attention_nodes,
        feed_dict={
            self.X: input_ids,
            self.segment_ids: segment_ids,
            self.input_masks: input_masks,
        },
    )
    return attentions, s_tokens, input_masks
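# A sketch of the right-padding `padding_sequence` is assumed to perform
# above: every token list is padded to the batch maximum with a pad symbol,
# here '<cls>'. Illustrative only; the real helper may differ in detail.
def _padding_sketch(sequences, maxlen, pad='<cls>'):
    return [s + [pad] * (maxlen - len(s)) for s in sequences]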
def _predict(self, strings, add_neutral):
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings)
    result = self._sess.run(
        self._softmax,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
    if add_neutral:
        result = neutral(result)
    return result
def vectorize(self, strings: List[str]):
    """
    Vectorize list of strings.

    Parameters
    ----------
    strings : List[str]

    Returns
    -------
    result : np.array
    """
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings)
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['xlnet/summary'],
    )
    return r['xlnet/summary']
def vectorize(self, strings: List[str]):
    """
    Vectorize list of strings.

    Parameters
    ----------
    strings : List[str]

    Returns
    -------
    result : np.array
    """
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings)
    segment_ids = np.array(segment_ids)
    # replace zero segment ids with ones before feeding the graph
    segment_ids[segment_ids == 0] = 1
    return self._sess.run(
        self._vectorizer,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
def predict_words(self, string: str, method: str = 'last', visualization: bool = True):
    """
    Classify words.

    Parameters
    ----------
    string : str
    method : str, optional (default='last')
        Attention layer supported. Allowed values:

        * ``'last'`` - attention from last layer.
        * ``'first'`` - attention from first layer.
        * ``'mean'`` - average attentions from all layers.
    visualization : bool, optional (default=True)
        If True, it will open the visualization dashboard.

    Returns
    -------
    dictionary: results
    """
    method = method.lower()
    if method not in ['last', 'first', 'mean']:
        raise ValueError(
            "method not supported, only support 'last', 'first' and 'mean'"
        )
    batch_x, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string])
    result, attentions, words = self._sess.run(
        [self._softmax, self._attns, self._softmax_seq],
        feed_dict={
            self._X: batch_x,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
    if method == 'first':
        cls_attn = attentions[0][:, :, 0, :]
    elif method == 'last':
        cls_attn = attentions[-1][:, :, 0, :]
    elif method == 'mean':
        cls_attn = np.mean(attentions, axis=0).mean(axis=2)
    # average over heads, then renormalize so the attention weights sum to 1
    cls_attn = np.mean(cls_attn, axis=1)
    total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
    attn = cls_attn / total_weights
    result = result[0]
    words = words[0]
    weights = []
    merged = merge_sentencepiece_tokens(
        list(zip(s_tokens[0], attn[0])), model='xlnet'
    )
    for i in range(words.shape[1]):
        m = merge_sentencepiece_tokens(
            list(zip(s_tokens[0], words[:, i])),
            weighted=False,
            model='xlnet',
        )
        _, weight = zip(*m)
        weights.append(weight)
    w, a = zip(*merged)
    words = np.array(weights).T
    distribution_words = words[:, np.argmax(words.sum(axis=0))]
    y_histogram, x_histogram = np.histogram(
        distribution_words, bins=np.arange(0, 1, 0.05)
    )
    y_histogram = y_histogram / y_histogram.sum()
    x_attention = np.arange(len(w))
    left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
    left = left.tolist()
    y_barplot = []
    for i in range(len(self._label)):
        if i not in left:
            # labels that never win the argmax get a count of zero
            y_barplot.append(0)
        else:
            y_barplot.append(right[left.index(i)])
    dict_result = {self._label[i]: result[i] for i in range(len(result))}
    dict_result['alphas'] = {word: a[no] for no, word in enumerate(w)}
    dict_result['word'] = {word: words[no] for no, word in enumerate(w)}
    dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
    dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
    dict_result['barplot'] = {'x': self._label, 'y': y_barplot}
    dict_result['class_name'] = self._class_name
    if visualization:
        if self._class_name == 'relevancy':
            _render_relevancy(dict_result)
        else:
            _render_emotion(dict_result)
    else:
        return dict_result
def _predict_words(self, string, method, visualization, add_neutral=False):
    method = method.lower()
    if method not in ['last', 'first', 'mean']:
        raise ValueError(
            "method not supported, only support 'last', 'first' and 'mean'"
        )
    if add_neutral:
        label = self._label + ['neutral']
    else:
        label = self._label
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string])
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits', 'attention', 'logits_seq'],
    )
    result = softmax(r['logits'], axis=-1)
    words = softmax(r['logits_seq'], axis=-1)
    attentions = r['attention']
    if method == 'first':
        cls_attn = attentions[0][:, :, 0, :]
    elif method == 'last':
        cls_attn = attentions[-1][:, :, 0, :]
    elif method == 'mean':
        cls_attn = np.mean(attentions, axis=0).mean(axis=2)
    # average over heads, then renormalize so the attention weights sum to 1
    cls_attn = np.mean(cls_attn, axis=1)
    total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
    attn = cls_attn / total_weights
    words = words[0]
    if add_neutral:
        result = neutral(result)
        words = neutral(words)
    result = result[0]
    weights = []
    merged = merge_sentencepiece_tokens(
        list(zip(s_tokens[0], attn[0])), model='xlnet'
    )
    for i in range(words.shape[1]):
        m = merge_sentencepiece_tokens(
            list(zip(s_tokens[0], words[:, i])),
            weighted=False,
            model='xlnet',
        )
        _, weight = zip(*m)
        weights.append(weight)
    w, a = zip(*merged)
    words = np.array(weights).T
    distribution_words = words[:, np.argmax(words.sum(axis=0))]
    y_histogram, x_histogram = np.histogram(
        distribution_words, bins=np.arange(0, 1, 0.05)
    )
    y_histogram = y_histogram / y_histogram.sum()
    x_attention = np.arange(len(w))
    left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
    left = left.tolist()
    y_barplot = []
    for i in range(len(label)):
        if i not in left:
            # labels that never win the argmax get a count of zero
            y_barplot.append(0)
        else:
            y_barplot.append(right[left.index(i)])
    dict_result = {label[i]: result[i] for i in range(len(result))}
    dict_result['alphas'] = {word: a[no] for no, word in enumerate(w)}
    dict_result['word'] = {word: words[no] for no, word in enumerate(w)}
    dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
    dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
    dict_result['barplot'] = {'x': label, 'y': y_barplot}
    dict_result['module'] = self._module
    if visualization:
        render_dict[self._module](dict_result)
    else:
        return dict_result
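# A sketch of the attention pooling shared by the `predict_words` variants
# here: pick a layer (or average all layers), take each token's attention at
# query position 0, average over heads, then renormalize to sum to 1. The
# (layers, batch, heads, seq, seq) shape and the helper name are assumptions
# for illustration.
import numpy as np

def _cls_attention_sketch(attentions, method='last'):
    if method == 'first':
        cls_attn = attentions[0][:, :, 0, :]
    elif method == 'last':
        cls_attn = attentions[-1][:, :, 0, :]
    else:  # 'mean': average over layers, then over query positions
        cls_attn = np.mean(attentions, axis=0).mean(axis=2)
    cls_attn = np.mean(cls_attn, axis=1)   # average over heads
    return cls_attn / np.sum(cls_attn, axis=-1, keepdims=True)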
def predict_words(self, string: str, method: str = 'last', visualization: bool = True):
    """
    Classify words.

    Parameters
    ----------
    string : str
    method : str, optional (default='last')
        Attention layer supported. Allowed values:

        * ``'last'`` - attention from last layer.
        * ``'first'`` - attention from first layer.
        * ``'mean'`` - average attentions from all layers.
    visualization : bool, optional (default=True)
        If True, it will open the visualization dashboard.

    Returns
    -------
    dictionary: results
    """
    method = method.lower()
    if method not in ['last', 'first', 'mean']:
        raise ValueError(
            "method not supported, only support 'last', 'first' and 'mean'"
        )
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string])
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits', 'attention', 'logits_seq'],
    )
    result = sigmoid(r['logits'])
    words = sigmoid(r['logits_seq'])
    attentions = r['attention']
    if method == 'first':
        cls_attn = attentions[0][:, :, 0, :]
    elif method == 'last':
        cls_attn = attentions[-1][:, :, 0, :]
    elif method == 'mean':
        cls_attn = np.mean(attentions, axis=0).mean(axis=2)
    # average over heads, then renormalize so the attention weights sum to 1
    cls_attn = np.mean(cls_attn, axis=1)
    total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
    attn = cls_attn / total_weights
    result = result[0]
    words = words[0]
    weights = []
    merged = merge_sentencepiece_tokens(
        list(zip(s_tokens[0], attn[0])), model='xlnet'
    )
    for i in range(words.shape[1]):
        m = merge_sentencepiece_tokens(
            list(zip(s_tokens[0], words[:, i])),
            weighted=False,
            model='xlnet',
        )
        _, weight = zip(*m)
        weights.append(weight)
    w, a = zip(*merged)
    words = np.array(weights).T
    distribution_words = words[:, np.argmax(words.sum(axis=0))]
    y_histogram, x_histogram = np.histogram(
        distribution_words, bins=np.arange(0, 1, 0.05)
    )
    y_histogram = y_histogram / y_histogram.sum()
    x_attention = np.arange(len(w))
    left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
    left = left.tolist()
    y_barplot = []
    for i in range(len(self._label)):
        if i not in left:
            # labels that never win the argmax get a count of zero
            y_barplot.append(0)
        else:
            y_barplot.append(right[left.index(i)])
    dict_result = {self._label[i]: result[i] for i in range(len(result))}
    dict_result['alphas'] = {word: a[no] for no, word in enumerate(w)}
    dict_result['word'] = {word: words[no] for no, word in enumerate(w)}
    dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
    dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
    dict_result['barplot'] = {'x': self._label, 'y': y_barplot}
    dict_result['module'] = self._module
    if visualization:
        _render_toxic(dict_result)
    else:
        return dict_result
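# A sketch of the per-label bar-plot counts built in the `predict_words`
# variants: count how many merged words take each label as their argmax, with
# zero for labels that never win. Names are illustrative only.
import numpy as np

def _barplot_counts_sketch(word_probs, n_labels):
    winners, counts = np.unique(
        np.argmax(word_probs, axis=1), return_counts=True
    )
    by_label = dict(zip(winners.tolist(), counts.tolist()))
    return [by_label.get(i, 0) for i in range(n_labels)]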