예제 #1
0
def bert_flat_embed_posts(posts, embed_dim, data_fold_path):
    """Encode ``posts`` with a BERT client, 64 items per request.

    Args:
        posts: Sliceable sequence passed chunk by chunk to the client's
            ``encode``.
        embed_dim: Width of each output row.
        data_fold_path: Accepted for interface compatibility; not used here.

    Returns:
        A ``(len(posts), embed_dim)`` array; row ``i`` holds whatever the
        client returned for ``posts[i]``.
    """
    embedded = np.zeros((len(posts), embed_dim))
    client = ConcurrentBertClient()
    chunk_size = 64
    total = len(posts)
    start = 0
    while start < total:
        # The last chunk may be shorter than chunk_size.
        stop = min(start + chunk_size, total)
        embedded[start:stop, :] = client.encode(posts[start:stop])
        start += chunk_size
    return embedded
예제 #2
0
class BertModel:
    """Minimal model wrapper that delegates prediction to a BERT client."""

    def __init__(self):
        # One client per model instance, created with max_concurrency=128.
        self.bc = ConcurrentBertClient(max_concurrency=128)

    def predict(self, batch):
        """Return the client's ``encode`` result for ``batch`` unchanged."""
        return self.bc.encode(batch)
예제 #3
0
def bert_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    """Encode each post's sentences and pack them into one zero-padded array.

    Args:
        posts: Sequence of posts; each post is a sequence of sentences that
            is handed to the client's ``encode`` in a single call.
        max_sent_cnt: Number of sentence slots per post; extra sentences are
            dropped and unused slots stay zero.
        embed_dim: Width of each sentence vector.
        data_fold_path: Accepted for interface compatibility; not used here.

    Returns:
        A ``(len(posts), max_sent_cnt, embed_dim)`` array.
    """
    post_total = len(posts)
    embedded = np.zeros((post_total, max_sent_cnt, embed_dim))
    client = ConcurrentBertClient()
    for post_no, sentences in enumerate(posts):
        sentence_vectors = client.encode(sentences)
        # Keep at most max_sent_cnt sentences per post.
        kept = min(max_sent_cnt, len(sentences))
        embedded[post_no, :kept, :] = sentence_vectors[:kept]
        # Progress report every 1000 posts (including the very first one).
        if not post_no % 1000:
            print("batch %s of %s done" % (post_no, post_total))
    return embedded
예제 #4
0
class BertVectorsFeaturizer(Featurizer):
    """Attach sentence vectors from a bert-serving server to messages.

    The raw ``text`` of each message is encoded by a ``ConcurrentBertClient``
    and stored under ``"text_features"`` after being merged by
    ``_combine_with_existing_text_features`` (not defined in this class;
    presumably inherited from ``Featurizer``).
    """

    name = "bert_vectors_featurizer"

    provides = ["text_features"]

    requires = ["tokens"]

    # Connection settings forwarded to ConcurrentBertClient, plus the number
    # of training examples encoded per request ("batch_size").
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        """Return the names of the packages this component needs."""
        return ["numpy", "bert_serving"]

    def __init__(self, component_config=None):
        """Create the component and open the client from its configuration."""
        super(BertVectorsFeaturizer, self).__init__(component_config)
        config = self.component_config
        self.bc = ConcurrentBertClient(
            max_concurrency=20,
            ip=config['ip'],
            port=config['port'],
            port_out=config['port_out'],
            show_server_config=config['show_server_config'],
            output_fmt=config['output_fmt'],
            check_version=config['check_version'],
            timeout=config['timeout'],
            identity=config['identity'])

    @classmethod
    def create(cls, cfg):
        """Build the component from the pipeline configuration ``cfg``."""
        component_conf = cfg.for_component(cls.name, cls.defaults)
        # Instantiate through ``cls`` so subclasses get their own type back.
        return cls(component_conf)

    @staticmethod
    def _replace_number_blank(text):
        """Replace each standalone digit run with ``0`` and drop all spaces."""
        return re.sub(r'\b[0-9]+\b', '0', text).replace(' ', '')

    def _get_message_text(self, message):
        """Encode the ``text`` of every message in ``message``.

        Args:
            message: Iterable of objects exposing a ``text`` attribute.

        Returns:
            The client's result with all length-1 axes removed by
            ``np.squeeze``; a one-message input therefore loses its batch
            axis.
        """
        all_tokens = [msg.text for msg in message]

        bert_embedding = self.bc.encode(all_tokens, is_tokenized=False)

        return np.squeeze(bert_embedding)

    def train(self, training_data, cfg=None, **kwargs):
        """Set ``text_features`` on every intent example, batch by batch."""
        batch_size = self.component_config['batch_size']
        intent_examples = training_data.intent_examples

        # Ceiling division: a trailing partial batch still needs one pass.
        epochs = len(intent_examples) // batch_size + \
            int(len(intent_examples) % batch_size > 0)

        for ep in tqdm(range(epochs), desc="Epochs"):
            start_idx = ep * batch_size
            examples = intent_examples[start_idx:start_idx + batch_size]
            X = np.array(self._get_message_text(examples))

            # np.squeeze removed the batch axis of a one-example batch, so
            # X[0] would be a single scalar. Put the axis back so that X[i]
            # is always the whole vector of example i.
            if len(examples) == 1:
                X = np.expand_dims(X, axis=0)

            for i, example in enumerate(examples):
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(example, X[i]))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Set ``text_features`` on one message and log the time taken."""
        start = time.time()
        message_text = self._get_message_text([message])

        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, message_text))
        end = time.time()
        # Lazy %-arguments: the string is only built if INFO is enabled.
        logger.info("bert vectors featurizer time cost %.3f s", end - start)

    @classmethod
    def load(
            cls,
            model_dir=None,  # type: Text
            model_metadata=None,  # type: Metadata
            cached_component=None,  # type: Optional[Component]
            **kwargs  # type: **Any
    ):
        """Recreate the component from the metadata stored for ``cls.name``.

        NOTE(review): ``model_metadata`` defaults to ``None`` but is
        dereferenced unconditionally — callers must always supply it.
        """
        meta = model_metadata.for_component(cls.name)

        return cls(meta)
예제 #5
0
class BertFeaturizer(Featurizer):
    """Featurizer that encodes jieba-tokenized text via a bert-serving client.

    Message text is segmented with ``jieba.cut`` and sent to the server as
    pre-tokenized input; the result is stored under ``"text_features"``.
    """

    # NOTE(review): the component sets "text_features" but declares no
    # provided attributes — confirm this is intentional for the pipeline.
    provides = []

    requires = []

    # Connection settings forwarded to ConcurrentBertClient, plus the number
    # of training examples encoded per request ("batch_size"). There is no
    # "timeout" entry; see __init__.
    defaults = {
        "ip": 'localhost',
        "port": '8125',
        "port_out": '5556',
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": False,
        "identity": None,
        "batch_size": 128
    }

    language_list = None

    def __init__(self, component_config):
        """Create the component and open the client from its configuration."""
        super(BertFeaturizer, self).__init__(component_config)
        config = self.component_config
        client_kwargs = {
            "ip": config['ip'],
            "port": int(config['port']),
            "port_out": int(config['port_out']),
            "show_server_config": config['show_server_config'],
            "output_fmt": config['output_fmt'],
            "check_version": config['check_version'],
            "identity": config['identity'],
            "check_length": False,
        }
        # "timeout" is absent from `defaults`, so indexing it unconditionally
        # raised KeyError whenever the user did not configure it. Forward it
        # only when present and otherwise keep the client's own default.
        if 'timeout' in config:
            client_kwargs["timeout"] = config['timeout']
        self.concurrent_bertClient = ConcurrentBertClient(**client_kwargs)

    @classmethod
    def required_packages(cls) -> List[Text]:
        """Return the names of the packages this component needs."""
        return ["numpy", "bert_serving"]

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Optional[Text] = None,
             model_metadata: Optional["Metadata"] = None,
             cached_component: Optional["Component"] = None,
             **kwargs: Any) -> "Component":
        """Recreate the component from its stored configuration ``meta``."""
        return cls(meta)

    def _get_message_text(self, messages):
        """Tokenize each message's text with jieba and encode the batch.

        Returns:
            The client's result with all length-1 axes removed by
            ``np.squeeze``; a one-message input therefore loses its batch
            axis.
        """
        all_tokens = [list(jieba.cut(message.text)) for message in messages]
        bert_embedding = self.concurrent_bertClient.encode(all_tokens,
                                                           is_tokenized=True)
        return np.squeeze(bert_embedding)

    def train(self,
              training_data: TrainingData,
              cfg: RasaNLUModelConfig = None,
              **kwargs: Any) -> None:
        """Set ``text_features`` on every intent example, batch by batch."""
        batch_size = self.component_config['batch_size']
        intent_examples = training_data.intent_examples
        # Ceiling division: a trailing partial batch still needs one pass.
        epochs = len(intent_examples) // batch_size + \
            int(len(intent_examples) % batch_size > 0)

        for ep in tqdm(range(epochs), desc="Epochs"):
            start_index = ep * batch_size
            examples = intent_examples[start_index:start_index + batch_size]
            X = np.array(self._get_message_text(examples))

            # np.squeeze removed the batch axis of a one-example batch, so
            # X[0] would be a single scalar. Put the axis back so that
            # X[index] is always the whole vector of that example.
            if len(examples) == 1:
                X = np.expand_dims(X, axis=0)

            for index, example in enumerate(examples):
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(
                        example, X[index]))

    def process(self, message: Message, **kwargs) -> None:
        """Set ``text_features`` on a single message."""
        features = self._get_message_text([message])
        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, features))
예제 #6
0
class BertBase(ContribFeaturizer):
    """Base class for featurizers backed by a bert-serving server.

    Subclasses supply ``provides``, ``name`` and ``_set_feature``; this class
    handles the client connection, batching and the per-example dispatch.
    """

    # Must be overridden by subclasses.
    provides = []

    # Must be overridden by subclasses.
    name = ""

    # Connection settings forwarded to ConcurrentBertClient, plus the number
    # of training examples encoded per request ("batch_size").
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        """Return the names of the packages this component needs."""
        return ["bert_serving"]

    def __init__(self, component_config=None):
        """Create the component and open the client from its configuration."""
        super(BertBase, self).__init__(component_config)
        # Imported here so the module loads without bert_serving installed.
        from bert_serving.client import ConcurrentBertClient

        config = self.component_config
        self.bert_client = ConcurrentBertClient(
            ip=config['ip'],
            port=int(config['port']),
            port_out=int(config['port_out']),
            # Previously read from 'port_out', which made the flag always
            # truthy and ignored the configured "show_server_config" value.
            show_server_config=config['show_server_config'],
            output_fmt=config['output_fmt'],
            check_version=config['check_version'],
            timeout=int(config['timeout']),
            identity=config['identity'])

    def _query_embedding_vector(self, message_list):
        """Return the client's encoding of each message's ``text``."""
        text_list = [i.text for i in message_list]

        embedding_vector_list = self.bert_client.encode(text_list,
                                                        is_tokenized=False)

        return embedding_vector_list

    def train(self, training_data, cfg=None, **kwargs):
        """Featurize all training examples in configured-size batches."""
        batch_iterator = BatchingIterator(self.component_config['batch_size'])

        for batch_examples in batch_iterator(training_data.training_examples):
            self._do_process(batch_examples)

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Featurize a single message by treating it as a batch of one."""
        batch_example = [message]

        self._do_process(batch_example)

    def _do_process(self, batch_example):
        """Encode a batch and hand each example its own feature row.

        Raises:
            AssertionError: If the number of returned rows differs from the
                number of examples.
        """
        batch_feature = self._query_embedding_vector(batch_example)

        assert len(batch_example) == batch_feature.shape[
            0], "batch_example and first dim of batch_feature must have same size"

        for i, example in enumerate(batch_example):
            feature = batch_feature[i]

            self._set_feature(example, feature)

    def _set_feature(self, example, feature):
        """Store ``feature`` on ``example``; must be implemented by subclasses."""
        raise NotImplementedError
class BertVectorsFeaturizer(Featurizer):
    """Featurizer that writes bert-serving sentence vectors to ``text_features``.

    Message ``text`` is encoded by a ``ConcurrentBertClient`` and merged into
    each message through ``_combine_with_existing_text_features`` (not
    defined in this class; presumably inherited from ``Featurizer``).
    """

    provides = ["text_features"]

    # Client connection settings plus the training batch size.
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        """Return the names of the packages this component needs."""
        return ["numpy", "bert_serving"]

    def __init__(self, component_config=None):
        """Create the component and open the client from its configuration."""
        super(BertVectorsFeaturizer, self).__init__(component_config)
        # Look up every setting first, then convert the numeric ones.
        (ip, port, port_out, show_server_config, output_fmt, check_version,
         timeout, identity) = (
            self.component_config[key]
            for key in ('ip', 'port', 'port_out', 'show_server_config',
                        'output_fmt', 'check_version', 'timeout', 'identity'))
        connection = dict(ip=ip,
                          port=int(port),
                          port_out=int(port_out),
                          show_server_config=show_server_config,
                          output_fmt=output_fmt,
                          check_version=check_version,
                          timeout=int(timeout),
                          identity=identity)
        self.bc = ConcurrentBertClient(**connection)

    def _get_message_text(self, message):
        """Encode the ``text`` of each item and squeeze length-1 axes away."""
        texts = [item.text for item in message]
        return np.squeeze(self.bc.encode(texts, is_tokenized=False))

    def train(self, training_data, cfg=None, **kwargs):
        """Set ``text_features`` on every intent example, batch by batch."""
        batch_size = self.component_config['batch_size']

        # Number of passes: all full batches plus one for any remainder.
        full_batches, leftover = divmod(len(training_data.intent_examples),
                                        batch_size)
        epochs = full_batches + (1 if leftover > 0 else 0)

        for batch_no in tqdm(range(epochs), desc="Epochs"):
            first = batch_no * batch_size
            examples = training_data.intent_examples[first:first + batch_size]
            X = np.array(self._get_message_text(examples))

            # A one-example batch comes back without its batch axis, so the
            # whole array is that example's vector.
            has_batch_axis = len(examples) > 1
            for i, example in enumerate(examples):
                vector = X[i] if has_batch_axis else X
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(example, vector))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Set ``text_features`` on a single message."""
        vector = self._get_message_text([message])
        combined = self._combine_with_existing_text_features(message, vector)
        message.set("text_features", combined)

    @classmethod
    def load(
            cls,
            meta,
            model_dir=None,  # type: Text
            model_metadata=None,  # type: Metadata
            cached_component=None,  # type: Optional[Component]
            **kwargs  # type: **Any
    ):
        """Recreate the component from its stored configuration ``meta``."""
        return cls(meta)
예제 #8
0
class BertTextFeaturizer(Featurizer):
    """Featurizer that writes bert-serving sentence vectors to ``text_features``.

    Message ``text`` is encoded by a ``ConcurrentBertClient`` and merged into
    each message through ``_combine_with_existing_text_features`` (not
    defined in this class; presumably inherited from ``Featurizer``).
    """

    provides = ["text_features"]

    # Connection settings forwarded to ConcurrentBertClient, plus the number
    # of training examples encoded per request ("batch_size").
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        """Return the names of the packages this component needs."""
        return ["bert_serving"]

    def __init__(self, component_config=None):
        """Create the component and open the client from its configuration."""
        super(BertTextFeaturizer, self).__init__(component_config)
        # Imported here so the module loads without bert_serving installed.
        from bert_serving.client import ConcurrentBertClient

        config = self.component_config
        self.bert_client = ConcurrentBertClient(
            ip=config['ip'],
            port=int(config['port']),
            port_out=int(config['port_out']),
            # Previously read from 'port_out', which made the flag always
            # truthy and ignored the configured "show_server_config" value.
            show_server_config=config['show_server_config'],
            output_fmt=config['output_fmt'],
            check_version=config['check_version'],
            timeout=int(config['timeout']),
            identity=config['identity'])

    def _query_embedding_vector(self, message_list):
        """Encode each message's ``text``.

        Returns:
            The client's result with all length-1 axes removed by
            ``np.squeeze``; a one-message input therefore loses its batch
            axis.
        """
        text_list = [i.text for i in message_list]

        embedding_vector_list = self.bert_client.encode(text_list,
                                                        is_tokenized=False)

        return np.squeeze(embedding_vector_list)

    def train(self, training_data, cfg=None, **kwargs):
        """Set ``text_features`` on the training examples, batch by batch."""
        batch_iterator = BatchingIterator(self.component_config['batch_size'])

        # NOTE(review): this iterates ``training_data`` itself rather than a
        # list of examples on it — confirm the object is iterable as intended.
        for batch_examples in batch_iterator(training_data):
            embedding_vector_list = self._query_embedding_vector(
                batch_examples)

            # np.squeeze removed the batch axis of a one-example batch, so
            # indexing [0] would give a single scalar. Put the axis back so
            # that row i is always the whole vector of example i.
            if len(batch_examples) == 1:
                embedding_vector_list = np.expand_dims(embedding_vector_list,
                                                       axis=0)

            for i, example in enumerate(batch_examples):
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(
                        example, embedding_vector_list[i]))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Set ``text_features`` on a single message."""
        embedding_vector = self._query_embedding_vector([message])

        text_features = self._combine_with_existing_text_features(
            message, embedding_vector)

        message.set("text_features", text_features)
예제 #9
0
# Fit the label encoder on the label vocabulary: one label per line of
# labels_fp.
# NOTE(review): `encoder`, `bc` and the *_fp paths are created outside this
# section; `encoder` presumably follows the scikit-learn fit/transform
# interface — confirm.
with open(labels_fp) as f:
    lines = f.read().splitlines()
    encoder.fit(lines)

# Write one serialized tf.train.Example per CSV data row to a TFRecord file,
# advancing the progress bar after each record.
with tf.io.TFRecordWriter(writer_fp) as writer, tqdm.tqdm() as pbar:

    # Wrap float values in a tf.train.Feature.
    def create_float_feature(values):  # expected input: numpy.ndarray
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    # Wrap integer values in a tf.train.Feature.
    def create_int_feature(values):  # expected input: list
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    with open(train_fp) as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=",")
        next(csv_reader, None)  # skip the header row

        for row in csv_reader:
            # Column 1 is the text to encode, column 2 its label.
            vector = bc.encode([row[1].strip()])
            label = encoder.transform([row[2]])

            # np.squeeze drops the length-1 batch axis of each result.
            # NOTE(review): if `label` holds a single element, squeezing
            # yields a 0-d array, which may not be accepted as a list of
            # values — confirm against the encoder actually used.
            features = {
                "features": create_float_feature(np.squeeze(vector)),
                "labels": create_int_feature(np.squeeze(label)),
            }
            tf_example = tf.train.Example(features=tf.train.Features(
                feature=features))
            writer.write(tf_example.SerializeToString())
            pbar.update(1)
예제 #10
0
from bert_serving.client import ConcurrentBertClient
import numpy as np
import time

# Connect to a bert-serving server on the local machine.
bc = ConcurrentBertClient(ip='127.0.0.1', port=5555, port_out=5556)

# Collected results and the loop counter (runs while num is 1..899).
lst = []
num = 1

# Time how long it takes to encode the same one-item batch repeatedly.
start = time.time()
while num < 900:
    bert_embedding = bc.encode(['黄金手'], is_tokenized=False)
    lst.append(bert_embedding)
    num += 1
end = time.time()

# Report the elapsed wall-clock time and the number of results gathered.
strMsg = "总共花费 %.3f s" % (end - start)
print(strMsg)
print(len(lst))