import numpy as np
from bert_serving.client import ConcurrentBertClient


def bert_flat_embed_posts(posts, embed_dim, data_fold_path):
    """Encode every post into one fixed-size vector; data_fold_path is unused here."""
    posts_arr = np.zeros((len(posts), embed_dim))
    bc = ConcurrentBertClient()
    bert_batch_size = 64  # send posts to the server in chunks of 64
    for ind in range(0, len(posts), bert_batch_size):
        end_ind = min(ind + bert_batch_size, len(posts))
        posts_arr[ind:end_ind, :] = bc.encode(posts[ind:end_ind])
    return posts_arr
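A minimal usage sketch for bert_flat_embed_posts, assuming a bert-serving server is already running on the default ports and that its pooled output dimension matches embed_dim (768 for BERT-base); the sample posts are made up:

posts = ["first post text", "second post text", "third post text"]
embeddings = bert_flat_embed_posts(posts, embed_dim=768, data_fold_path=None)
print(embeddings.shape)  # (3, 768)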
from bert_serving.client import ConcurrentBertClient


class BertModel:
    def __init__(self):
        # one shared client, allowing up to 128 concurrent requests
        self.bc = ConcurrentBertClient(max_concurrency=128)

    def predict(self, batch):
        batch_outputs = self.bc.encode(batch)
        return batch_outputs
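BertModel just proxies a batch of strings to the client; a hedged usage sketch with an illustrative batch:

model = BertModel()
vectors = model.predict(["how are you", "what is your name"])
print(vectors.shape)  # (2, 768) for a BERT-base server with default pooling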
import numpy as np
from bert_serving.client import ConcurrentBertClient


def bert_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    """Encode each post sentence by sentence, truncating/zero-padding to max_sent_cnt."""
    posts_arr = np.zeros((len(posts), max_sent_cnt, embed_dim))
    bc = ConcurrentBertClient()
    for ind, sentences in enumerate(posts):
        embeddings = bc.encode(sentences)
        sent_cnt = min(max_sent_cnt, len(sentences))
        posts_arr[ind, :sent_cnt, :] = embeddings[:sent_cnt]
        if ind % 1000 == 0:
            print("post %s of %s done" % (ind, len(posts)))
    return posts_arr
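In this per-sentence variant each element of posts is itself a list of sentences; a sketch with made-up data, again assuming a 768-dimensional server output:

posts = [["First sentence.", "Second sentence."], ["Only sentence."]]
arr = bert_embed_posts(posts, max_sent_cnt=4, embed_dim=768, data_fold_path=None)
print(arr.shape)  # (2, 4, 768); unused sentence slots remain zero-padded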
import re
import time
import logging

import numpy as np
from tqdm import tqdm
from bert_serving.client import ConcurrentBertClient
from rasa_nlu.featurizers import Featurizer  # pre-1.0 rasa_nlu layout

logger = logging.getLogger(__name__)


class BertVectorsFeaturizer(Featurizer):
    name = "bert_vectors_featurizer"
    provides = ["text_features"]
    requires = ["tokens"]
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        return ["numpy", "bert_serving"]

    def __init__(self, component_config=None):
        super(BertVectorsFeaturizer, self).__init__(component_config)
        ip = self.component_config['ip']
        port = self.component_config['port']
        port_out = self.component_config['port_out']
        show_server_config = self.component_config['show_server_config']
        output_fmt = self.component_config['output_fmt']
        check_version = self.component_config['check_version']
        timeout = self.component_config['timeout']
        identity = self.component_config['identity']
        self.bc = ConcurrentBertClient(
            max_concurrency=20,
            ip=ip,
            port=port,
            port_out=port_out,
            show_server_config=show_server_config,
            output_fmt=output_fmt,
            check_version=check_version,
            timeout=timeout,
            identity=identity)

    @classmethod
    def create(cls, cfg):
        component_conf = cfg.for_component(cls.name, cls.defaults)
        return BertVectorsFeaturizer(component_conf)

    @staticmethod
    def _replace_number_blank(text):
        # normalize free-standing numbers to '0' and drop spaces (currently unused helper)
        return re.sub(r'\b[0-9]+\b', '0', text).replace(' ', '')

    def _get_message_text(self, message):
        all_tokens = []
        for msg in message:
            all_tokens.append(msg.text)
        bert_embedding = self.bc.encode(all_tokens, is_tokenized=False)
        return np.squeeze(bert_embedding)

    def train(self, training_data, cfg=None, **kwargs):
        batch_size = self.component_config['batch_size']
        # despite the name, 'epochs' is the number of batches in one pass over the data
        epochs = len(training_data.intent_examples) // batch_size + \
            int(len(training_data.intent_examples) % batch_size > 0)
        for ep in tqdm(range(epochs), desc="Epochs"):
            end_idx = (ep + 1) * batch_size
            start_idx = ep * batch_size
            examples = training_data.intent_examples[start_idx:end_idx]
            tokens_text = self._get_message_text(examples)
            X = np.array(tokens_text)
            for i, example in enumerate(examples):
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(example, X[i]))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        start = time.time()
        message_text = self._get_message_text([message])
        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, message_text))
        end = time.time()
        logger.info("bert vectors featurizer time cost %.3f s" % (end - start))

    @classmethod
    def load(
            cls,
            model_dir=None,  # type: Text
            model_metadata=None,  # type: Metadata
            cached_component=None,  # type: Optional[Component]
            **kwargs  # type: **Any
    ):
        meta = model_metadata.for_component(cls.name)
        return cls(meta)
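Outside a full Rasa pipeline the featurizer can also be built directly from a config dict; a sketch assuming the pre-1.0 rasa_nlu Component API, which merges the supplied dict with the class defaults (the overrides shown are illustrative):

featurizer = BertVectorsFeaturizer(
    component_config={"ip": "127.0.0.1", "batch_size": 64})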
import logging
from typing import Any, Dict, List, Optional, Text

import jieba
import numpy as np
from tqdm import tqdm
from bert_serving.client import ConcurrentBertClient
# rasa 1.x layout; older rasa_nlu releases used rasa_nlu.* paths instead
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.featurizers import Featurizer
from rasa.nlu.training_data import Message, TrainingData


class BertFeaturizer(Featurizer):
    provides = []
    requires = []
    defaults = {
        "ip": 'localhost',
        "port": '8125',
        "port_out": '5556',
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": False,
        "timeout": 5000,  # read in __init__, so it must have a default
        "identity": None,
        "batch_size": 128
    }
    language_list = None

    def __init__(self, component_config):
        super(BertFeaturizer, self).__init__(component_config)
        ip = self.component_config['ip']
        port = self.component_config['port']
        port_out = self.component_config['port_out']
        show_server_config = self.component_config['show_server_config']
        output_fmt = self.component_config['output_fmt']
        check_version = self.component_config['check_version']
        timeout = self.component_config['timeout']
        identity = self.component_config['identity']
        self.concurrent_bertClient = ConcurrentBertClient(
            ip=ip,
            port=int(port),
            port_out=int(port_out),
            show_server_config=show_server_config,
            output_fmt=output_fmt,
            check_version=check_version,
            timeout=timeout,
            identity=identity,
            check_length=False)

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["numpy", "bert_serving"]

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Optional[Text] = None,
             model_metadata: Optional["Metadata"] = None,
             cached_component: Optional["Component"] = None,
             **kwargs: Any) -> "Component":
        return cls(meta)

    def _get_message_text(self, messages):
        # all_tokens = [message.data['tokens'] for message in messages]
        all_tokens = [list(jieba.cut(message.text)) for message in messages]
        bert_embedding = self.concurrent_bertClient.encode(
            all_tokens, is_tokenized=True)
        return np.squeeze(bert_embedding)

    def train(self,
              training_data: TrainingData,
              cfg: RasaNLUModelConfig = None,
              **kwargs: Any) -> None:
        batch_size = self.component_config['batch_size']
        # despite the name, 'epochs' is the number of batches in one pass
        epochs = len(training_data.intent_examples) // batch_size + \
            int(len(training_data.intent_examples) % batch_size > 0)
        for ep in tqdm(range(epochs), desc="Epochs"):
            end_index = (ep + 1) * batch_size
            start_index = ep * batch_size
            examples = training_data.intent_examples[start_index:end_index]
            tokens = self._get_message_text(examples)
            X = np.array(tokens)
            for index, example in enumerate(examples):
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(
                        example, X[index]))

    def process(self, message: Message, **kwargs) -> None:
        features = self._get_message_text([message])
        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, features))
# ContribFeaturizer and BatchingIterator are provided by the surrounding contrib package.
class BertBase(ContribFeaturizer):
    # Notice: must be set in subclass
    provides = []
    # Notice: must be set in subclass
    name = ""
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        return ["bert_serving"]

    def __init__(self, component_config=None):
        super(BertBase, self).__init__(component_config)
        from bert_serving.client import ConcurrentBertClient
        self.bert_client = ConcurrentBertClient(
            ip=self.component_config['ip'],
            port=int(self.component_config['port']),
            port_out=int(self.component_config['port_out']),
            show_server_config=self.component_config['show_server_config'],
            output_fmt=self.component_config['output_fmt'],
            check_version=self.component_config['check_version'],
            timeout=int(self.component_config['timeout']),
            identity=self.component_config['identity'])

    def _query_embedding_vector(self, message_list):
        text_list = [i.text for i in message_list]
        embedding_vector_list = self.bert_client.encode(
            text_list, is_tokenized=False)
        return embedding_vector_list

    def train(self, training_data, cfg=None, **kwargs):
        batch_iterator = BatchingIterator(self.component_config['batch_size'])
        for batch_examples in batch_iterator(training_data.training_examples):
            self._do_process(batch_examples)

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        batch_example = [message]
        self._do_process(batch_example)

    def _do_process(self, batch_example):
        batch_feature = self._query_embedding_vector(batch_example)
        assert len(batch_example) == batch_feature.shape[0], \
            "batch_example and first dim of batch_feature must have same size"
        for i, example in enumerate(batch_example):
            feature = batch_feature[i]
            self._set_feature(example, feature)

    def _set_feature(self, example, feature):
        raise NotImplementedError
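BertBase leaves _set_feature abstract, so a concrete subclass only decides where the vector is stored; a minimal sketch (the class name and provides list are illustrative):

class BertTextFeature(BertBase):
    name = "bert_text_feature"
    provides = ["text_features"]

    def _set_feature(self, example, feature):
        # attach the BERT vector alongside any features already on the message
        example.set(
            "text_features",
            self._combine_with_existing_text_features(example, feature))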
import numpy as np
from tqdm import tqdm
from bert_serving.client import ConcurrentBertClient
from rasa_nlu.featurizers import Featurizer  # pre-1.0 rasa_nlu layout


class BertVectorsFeaturizer(Featurizer):
    provides = ["text_features"]
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        return ["numpy", "bert_serving"]

    def __init__(self, component_config=None):
        super(BertVectorsFeaturizer, self).__init__(component_config)
        ip = self.component_config['ip']
        port = self.component_config['port']
        port_out = self.component_config['port_out']
        show_server_config = self.component_config['show_server_config']
        output_fmt = self.component_config['output_fmt']
        check_version = self.component_config['check_version']
        timeout = self.component_config['timeout']
        identity = self.component_config['identity']
        self.bc = ConcurrentBertClient(
            ip=ip,
            port=int(port),
            port_out=int(port_out),
            show_server_config=show_server_config,
            output_fmt=output_fmt,
            check_version=check_version,
            timeout=int(timeout),
            identity=identity)

    def _get_message_text(self, message):
        all_tokens = []
        for msg in message:
            all_tokens.append(msg.text)
        bert_embedding = self.bc.encode(all_tokens, is_tokenized=False)
        return np.squeeze(bert_embedding)

    def train(self, training_data, cfg=None, **kwargs):
        batch_size = self.component_config['batch_size']
        epochs = len(training_data.intent_examples) // batch_size + \
            int(len(training_data.intent_examples) % batch_size > 0)
        for ep in tqdm(range(epochs), desc="Epochs"):
            end_idx = (ep + 1) * batch_size
            start_idx = ep * batch_size
            examples = training_data.intent_examples[start_idx:end_idx]
            tokens_text = self._get_message_text(examples)
            X = np.array(tokens_text)
            for i, example in enumerate(examples):
                if len(examples) > 1:
                    example.set(
                        "text_features",
                        self._combine_with_existing_text_features(
                            example, X[i]))
                else:
                    # np.squeeze in _get_message_text drops the batch axis for
                    # a single example, so X is already the lone vector
                    example.set(
                        "text_features",
                        self._combine_with_existing_text_features(example, X))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        message_text = self._get_message_text([message])
        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, message_text))

    @classmethod
    def load(
            cls,
            meta,
            model_dir=None,  # type: Text
            model_metadata=None,  # type: Metadata
            cached_component=None,  # type: Optional[Component]
            **kwargs  # type: **Any
    ):
        return cls(meta)
import numpy as np

# Featurizer and BatchingIterator are provided by the surrounding package.
class BertTextFeaturizer(Featurizer):
    provides = ["text_features"]
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        return ["bert_serving"]

    def __init__(self, component_config=None):
        super(BertTextFeaturizer, self).__init__(component_config)
        from bert_serving.client import ConcurrentBertClient
        self.bert_client = ConcurrentBertClient(
            ip=self.component_config['ip'],
            port=int(self.component_config['port']),
            port_out=int(self.component_config['port_out']),
            show_server_config=self.component_config['show_server_config'],
            output_fmt=self.component_config['output_fmt'],
            check_version=self.component_config['check_version'],
            timeout=int(self.component_config['timeout']),
            identity=self.component_config['identity'])

    def _query_embedding_vector(self, message_list):
        text_list = [i.text for i in message_list]
        embedding_vector_list = self.bert_client.encode(
            text_list, is_tokenized=False)
        return np.squeeze(embedding_vector_list)

    def train(self, training_data, cfg=None, **kwargs):
        batch_iterator = BatchingIterator(self.component_config['batch_size'])
        # iterate over the examples, not the TrainingData object itself
        for batch_examples in batch_iterator(training_data.training_examples):
            embedding_vector_list = self._query_embedding_vector(
                batch_examples)
            for i, example in enumerate(batch_examples):
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(
                        example, embedding_vector_list[i]))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        embedding_vector = self._query_embedding_vector([message])
        text_features = self._combine_with_existing_text_features(
            message, embedding_vector)
        message.set("text_features", text_features)
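BatchingIterator is not defined in these snippets; a minimal sketch consistent with how it is called above, i.e. constructed with a batch size and then invoked on a list of examples to yield fixed-size chunks:

class BatchingIterator:
    def __init__(self, batch_size):
        self.batch_size = batch_size

    def __call__(self, examples):
        # yield successive slices of at most batch_size examples
        for start in range(0, len(examples), self.batch_size):
            yield examples[start:start + self.batch_size]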
import csv

import numpy as np
import tensorflow as tf
import tqdm
from bert_serving.client import ConcurrentBertClient
from sklearn.preprocessing import LabelEncoder

# labels_fp, train_fp and writer_fp are file paths assumed to be defined elsewhere;
# the label encoder is assumed to be a sklearn LabelEncoder
encoder = LabelEncoder()
bc = ConcurrentBertClient()

with open(labels_fp) as f:
    lines = f.read().splitlines()
encoder.fit(lines)

# write to tfrecord
with tf.io.TFRecordWriter(writer_fp) as writer, tqdm.tqdm() as pbar:

    def create_float_feature(values):  # numpy.ndarray
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    def create_int_feature(values):  # list
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    with open(train_fp) as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=",")
        next(csv_reader, None)  # skip the headers
        for row in csv_reader:
            vector = bc.encode([row[1].strip()])
            label = encoder.transform([row[2]])
            features = {
                "features": create_float_feature(np.squeeze(vector)),
                # pass the 1-element label array as-is; squeezing it to a
                # 0-d array would make it non-iterable for Int64List
                "labels": create_int_feature(label),
            }
            tf_example = tf.train.Example(
                features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
            pbar.update(1)
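To sanity-check the written records they can be parsed back with a matching feature spec; a sketch assuming TF2 eager execution and a 768-dimensional vector, which must match the serving model's output size:

feature_spec = {
    "features": tf.io.FixedLenFeature([768], tf.float32),
    "labels": tf.io.FixedLenFeature([], tf.int64),
}
dataset = tf.data.TFRecordDataset(writer_fp).map(
    lambda record: tf.io.parse_single_example(record, feature_spec))
for example in dataset.take(1):
    print(example["features"].shape, example["labels"].numpy())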
from bert_serving.client import ConcurrentBertClient
import numpy as np
import time

bc = ConcurrentBertClient(ip='127.0.0.1', port=5555, port_out=5556)

num = 1
start = time.time()
lst = []
while num < 900:  # 899 sequential requests
    # encode a short Chinese phrase ('黄金手', "golden hand")
    bert_embedding = bc.encode(['黄金手'], is_tokenized=False)
    # str1 = np.squeeze(bert_embedding)
    lst.append(bert_embedding)
    num = num + 1
end = time.time()
strMsg = "Total time: %.3f s" % (end - start)
print(strMsg)
print(len(lst))
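The loop above issues its 899 requests one at a time; ConcurrentBertClient is built to be shared across threads, so a threaded variant is the more natural benchmark. A sketch using a thread pool (the worker count is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

def encode_once(_):
    return bc.encode(['黄金手'], is_tokenized=False)

start = time.time()
with ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(encode_once, range(899)))
print("Total time: %.3f s" % (time.time() - start))
print(len(results))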