예제 #1
0
def bert_flat_embed_posts(posts, embed_dim, data_fold_path):
    """Encode each post into a single `embed_dim`-dimensional vector.

    Posts are sent to the BERT service in chunks of 64 and the resulting
    embeddings are written into one (len(posts), embed_dim) array.

    NOTE(review): `data_fold_path` is accepted but never used here —
    confirm whether callers rely on it.
    """
    client = ConcurrentBertClient()
    chunk = 64
    embeddings = np.zeros((len(posts), embed_dim))
    for start in range(0, len(posts), chunk):
        stop = min(start + chunk, len(posts))
        embeddings[start:stop, :] = client.encode(posts[start:stop])
    return embeddings
예제 #2
0
def bert_embed_posts(posts, max_sent_cnt, embed_dim, data_fold_path):
    """Encode every post (a list of sentences) sentence-by-sentence.

    Returns a (len(posts), max_sent_cnt, embed_dim) array; posts with more
    than `max_sent_cnt` sentences are truncated, shorter ones zero-padded.

    NOTE(review): `data_fold_path` is unused — confirm intent.
    """
    result = np.zeros((len(posts), max_sent_cnt, embed_dim))
    client = ConcurrentBertClient()
    for idx, sentences in enumerate(posts):
        vectors = client.encode(sentences)
        keep = min(max_sent_cnt, len(sentences))
        result[idx, :keep, :] = vectors[:keep]
        # Progress log every 1000 posts.
        if idx % 1000 == 0:
            print("batch %s of %s done" % (idx, len(posts)))
    return result
예제 #3
0
    def __init__(self, component_config=None):
        """Create the component and open a ConcurrentBertClient connection
        using the ip/port settings found in `component_config`.
        """
        super(BertBase, self).__init__(component_config)
        from bert_serving.client import ConcurrentBertClient

        self.bert_client = ConcurrentBertClient(
            ip=self.component_config['ip'],
            port=int(self.component_config['port']),
            port_out=int(self.component_config['port_out']),
            # BUG FIX: was wired to component_config['port_out'] — a
            # copy-paste error that passed a port number as a boolean flag.
            show_server_config=self.component_config['show_server_config'],
            output_fmt=self.component_config['output_fmt'],
            check_version=self.component_config['check_version'],
            timeout=int(self.component_config['timeout']),
            identity=self.component_config['identity'])
예제 #4
0
class BertModel:
    """Thin inference wrapper around a shared ConcurrentBertClient."""

    def __init__(self):
        # One client shared by all callers; allows up to 128 in-flight
        # requests at once.
        self.bc = ConcurrentBertClient(max_concurrency=128)

    def predict(self, batch):
        """Return the BERT encodings for the given batch of texts."""
        return self.bc.encode(batch)
예제 #5
0
 def __init__(self, component_config=None):
     """Initialise the featurizer and connect a ConcurrentBertClient using
     the ip/port/format settings from the component configuration."""
     super(BertVectorsFeaturizer, self).__init__(component_config)
     cfg = self.component_config
     # Ports and timeout are coerced to int so string-valued configs work.
     self.bc = ConcurrentBertClient(ip=cfg['ip'],
                                    port=int(cfg['port']),
                                    port_out=int(cfg['port_out']),
                                    show_server_config=cfg['show_server_config'],
                                    output_fmt=cfg['output_fmt'],
                                    check_version=cfg['check_version'],
                                    timeout=int(cfg['timeout']),
                                    identity=cfg['identity'])
예제 #6
0
    def create_flask_app(self):
        """Build a Flask app that proxies HTTP requests to the BERT server.

        Routes: GET /status/server, GET /status/client, POST /encode.
        Raises ImportError when the optional HTTP dependencies are missing.
        """
        try:
            # Imported lazily so the HTTP extras remain optional.
            from flask import Flask, request
            from flask_compress import Compress
            from flask_cors import CORS
            from flask_json import FlaskJSON, as_json, JsonError
            from bert_serving.client import ConcurrentBertClient
        except ImportError:
            raise ImportError(
                'BertClient or Flask or its dependencies are not fully installed, '
                'they are required for serving HTTP requests.'
                'Please use "pip install -U bert-serving-server[http]" to install it.'
            )

        # support up to 10 concurrent HTTP requests
        bc = ConcurrentBertClient(max_concurrency=self.args.http_max_connect,
                                  port=self.args.port,
                                  port_out=self.args.port_out,
                                  output_fmt='list',
                                  ignore_all_checks=True)
        app = Flask(__name__)
        logger = set_logger(colored('PROXY', 'red'))

        @app.route('/status/server', methods=['GET'])
        @as_json
        def get_server_status():
            # Status reported by the BERT server itself.
            return bc.server_status

        @app.route('/status/client', methods=['GET'])
        @as_json
        def get_client_status():
            # Client-side connection statistics.
            return bc.status

        @app.route('/encode', methods=['POST'])
        @as_json
        def encode_query():
            # Accept either form-encoded or JSON request bodies.
            data = request.form if request.form else request.json
            try:
                logger.info('new request from %s' % request.remote_addr)
                return {
                    'id':
                    data['id'],
                    'result':
                    bc.encode(data['texts'],
                              is_tokenized=bool(data['is_tokenized'])
                              if 'is_tokenized' in data else False)
                }

            except Exception as e:
                # Surface the failure to the HTTP caller as structured JSON.
                logger.error('error when handling HTTP request', exc_info=True)
                raise JsonError(description=str(e), type=str(type(e).__name__))

        CORS(app, origins=self.args.cors)
        FlaskJSON(app)
        Compress().init_app(app)
        return app
예제 #7
0
    def initBertClient(cls, bert_clinet_config=None):
        """Build and return a ConcurrentBertClient.

        Uses `bert_clinet_config` when supplied, otherwise falls back to the
        `defaults` mapping.
        """
        if bert_clinet_config:
            # BUG FIX: was `component_config = component_config`, a
            # self-assignment that raised UnboundLocalError and discarded
            # the caller-supplied configuration.
            component_config = bert_clinet_config
        else:
            # NOTE(review): bare `defaults` — presumably a module- or
            # class-level constant defined outside this view; confirm.
            component_config = defaults

        ip = component_config['ip']
        port = component_config['port']
        port_out = component_config['port_out']
        show_server_config = component_config['show_server_config']
        output_fmt = component_config['output_fmt']
        check_version = component_config['check_version']
        timeout = component_config['timeout']
        identity = component_config['identity']

        return ConcurrentBertClient(ip=ip,
                                    port=int(port),
                                    port_out=int(port_out),
                                    show_server_config=show_server_config,
                                    output_fmt=output_fmt,
                                    check_version=check_version,
                                    timeout=int(timeout),
                                    identity=identity)
class BertVectorsFeaturizer(Featurizer):
    """Rasa featurizer that sets "text_features" on messages using BERT
    sentence embeddings fetched from a bert-serving server."""

    provides = ["text_features"]

    # Connection and batching defaults; overridable from the pipeline config.
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        # Packages that must be importable for this component to run.
        return ["numpy", "bert_serving"]

    def __init__(self, component_config=None):
        """Connect a ConcurrentBertClient using the component config."""
        super(BertVectorsFeaturizer, self).__init__(component_config)
        ip = self.component_config['ip']
        port = self.component_config['port']
        port_out = self.component_config['port_out']
        show_server_config = self.component_config['show_server_config']
        output_fmt = self.component_config['output_fmt']
        check_version = self.component_config['check_version']
        timeout = self.component_config['timeout']
        identity = self.component_config['identity']
        self.bc = ConcurrentBertClient(ip=ip,
                                       port=int(port),
                                       port_out=int(port_out),
                                       show_server_config=show_server_config,
                                       output_fmt=output_fmt,
                                       check_version=check_version,
                                       timeout=int(timeout),
                                       identity=identity)

    def _get_message_text(self, message):
        """Encode the text of each message in `message` (a list).

        NOTE: np.squeeze collapses the batch axis when only one message is
        passed, so a single message yields a 1-D vector — `train` below
        relies on this via its len(examples) > 1 branch.
        """
        all_tokens = []

        for msg in message:
            all_tokens.append(msg.text)

        bert_embedding = self.bc.encode(all_tokens, is_tokenized=False)

        return np.squeeze(bert_embedding)

    def train(self, training_data, cfg=None, **kwargs):
        """Attach BERT embeddings to every training example, in batches."""
        batch_size = self.component_config['batch_size']

        # Ceiling division: number of batches needed to cover all examples.
        epochs = len(training_data.intent_examples) // batch_size + \
            int(len(training_data.intent_examples) % batch_size > 0)

        for ep in tqdm(range(epochs), desc="Epochs"):
            end_idx = (ep + 1) * batch_size
            start_idx = ep * batch_size
            examples = training_data.intent_examples[start_idx:end_idx]
            tokens_text = self._get_message_text(examples)
            X = np.array(tokens_text)

            for i, example in enumerate(examples):
                if len(examples) > 1:
                    example.set(
                        "text_features",
                        self._combine_with_existing_text_features(
                            example, X[i]))
                else:
                    # Single-example batch: X is already the squeezed 1-D
                    # embedding, so it is used whole rather than indexed.
                    example.set(
                        "text_features",
                        self._combine_with_existing_text_features(example, X))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Attach the BERT embedding to a single incoming message."""
        message_text = self._get_message_text([message])

        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, message_text))

    @classmethod
    def load(
            cls,
            meta,
            model_dir=None,  # type: Text
            model_metadata=None,  # type: Metadata
            cached_component=None,  # type: Optional[Component]
            **kwargs  # type: **Any
    ):
        """Recreate the component from persisted metadata."""

        return cls(meta)
예제 #9
0
class BertVectorsFeaturizer(Featurizer):
    """Rasa featurizer that sets "text_features" on messages using BERT
    sentence embeddings fetched from a bert-serving server."""

    name = "bert_vectors_featurizer"

    provides = ["text_features"]

    requires = ["tokens"]

    # Connection and batching defaults; overridable from the pipeline config.
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        return ["numpy", "bert_serving"]

    def __init__(self, component_config=None):
        """Connect a ConcurrentBertClient using the component config."""
        super(BertVectorsFeaturizer, self).__init__(component_config)
        ip = self.component_config['ip']
        port = self.component_config['port']
        port_out = self.component_config['port_out']
        show_server_config = self.component_config['show_server_config']
        output_fmt = self.component_config['output_fmt']
        check_version = self.component_config['check_version']
        timeout = self.component_config['timeout']
        identity = self.component_config['identity']
        self.bc = ConcurrentBertClient(max_concurrency=20,
                                       ip=ip,
                                       port=port,
                                       port_out=port_out,
                                       show_server_config=show_server_config,
                                       output_fmt=output_fmt,
                                       check_version=check_version,
                                       timeout=timeout,
                                       identity=identity)

    @classmethod
    def create(cls, cfg):
        """Alternate constructor used by the Rasa pipeline loader."""
        component_conf = cfg.for_component(cls.name, cls.defaults)
        return BertVectorsFeaturizer(component_conf)

    @staticmethod
    def _replace_number_blank(text):
        # Normalise standalone digit runs to '0' and strip all spaces.
        return re.sub(r'\b[0-9]+\b', '0', text).replace(' ', '')

    def _get_message_text(self, message):
        """Encode the text of each message in `message` (a list).

        NOTE: np.squeeze collapses the batch axis when only one message is
        passed, so a single message yields a 1-D vector.
        """
        all_tokens = []

        for msg in message:
            all_tokens.append(msg.text)

        bert_embedding = self.bc.encode(all_tokens, is_tokenized=False)

        return np.squeeze(bert_embedding)

    def train(self, training_data, cfg=None, **kwargs):
        """Attach BERT embeddings to every training example, in batches."""
        batch_size = self.component_config['batch_size']

        # Ceiling division: number of batches needed to cover all examples.
        epochs = len(training_data.intent_examples) // batch_size + \
            int(len(training_data.intent_examples) % batch_size > 0)

        for ep in tqdm(range(epochs), desc="Epochs"):
            end_idx = (ep + 1) * batch_size
            start_idx = ep * batch_size
            examples = training_data.intent_examples[start_idx:end_idx]
            tokens_text = self._get_message_text(examples)
            X = np.array(tokens_text)

            for i, example in enumerate(examples):
                # BUG FIX: when the last batch holds exactly one example,
                # _get_message_text returns a squeezed 1-D vector and X[i]
                # would index a scalar component instead of the embedding.
                if len(examples) > 1:
                    example.set(
                        "text_features",
                        self._combine_with_existing_text_features(
                            example, X[i]))
                else:
                    example.set(
                        "text_features",
                        self._combine_with_existing_text_features(example, X))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Attach the BERT embedding to a single incoming message."""
        start = time.time()
        message_text = self._get_message_text([message])

        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, message_text))
        end = time.time()
        logger.info("bert vectors featurizer time cost %.3f s" % (end - start))

    @classmethod
    def load(
            cls,
            model_dir=None,  # type: Text
            model_metadata=None,  # type: Metadata
            cached_component=None,  # type: Optional[Component]
            **kwargs  # type: **Any
    ):
        """Recreate the component from persisted model metadata."""

        meta = model_metadata.for_component(cls.name)

        return cls(meta)
예제 #10
0
from bert_serving.client import ConcurrentBertClient
from tensorflow.python.estimator.canned.dnn import DNNClassifier
from tensorflow.python.estimator.run_config import RunConfig
from tensorflow.python.estimator.training import TrainSpec, EvalSpec, train_and_evaluate

# Pin TensorFlow to the first free GPU reported by GPUtil.
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUtil.getFirstAvailable()[0])
tf.logging.set_verbosity(tf.logging.INFO)

# CAIL exercise-contest dataset (JSON-lines), train and eval splits.
train_fp = ['/data/cips/data/lab/data/dataset/final_all_data/exercise_contest/data_train.json']
eval_fp = ['/data/cips/data/lab/data/dataset/final_all_data/exercise_contest/data_test.json']

batch_size = 128
num_parallel_calls = 4
num_concurrent_clients = num_parallel_calls * 2  # should be at least greater than `num_parallel_calls`

bc = ConcurrentBertClient(port=5557, port_out=5558)

# hardcoded law_ids
laws = [184, 336, 314, 351, 224, 132, 158, 128, 223, 308, 341, 349, 382, 238, 369, 248, 266, 313, 127, 340, 288, 172,
        209, 243, 302, 200, 227, 155, 147, 143, 261, 124, 359, 343, 291, 241, 235, 367, 393, 274, 240, 269, 199, 119,
        246, 282, 133, 177, 170, 310, 364, 201, 312, 244, 357, 233, 236, 264, 225, 234, 328, 417, 151, 135, 136, 348,
        217, 168, 134, 237, 262, 150, 114, 196, 303, 191, 392, 226, 267, 272, 212, 353, 315, 205, 372, 215, 350, 275,
        385, 164, 338, 292, 159, 162, 333, 388, 356, 375, 326, 402, 397, 125, 395, 290, 176, 354, 185, 141, 279, 399,
        192, 383, 307, 295, 361, 286, 404, 390, 294, 115, 344, 268, 171, 117, 273, 193, 418, 220, 198, 231, 386, 363,
        346, 210, 270, 144, 347, 280, 281, 118, 122, 116, 360, 239, 228, 305, 130, 152, 389, 276, 213, 186, 413, 285,
        316, 245, 232, 175, 149, 263, 387, 283, 391, 211, 396, 352, 345, 258, 253, 163, 140, 293, 194, 342, 161, 358,
        271, 156, 260, 384, 153, 277, 214]

# Estimator label vocabularies must be strings.
laws_str = [str(x) for x in laws]


예제 #11
0
class BertFeaturizer(Featurizer):
    """Rasa featurizer that jieba-tokenises message text and sets
    "text_features" from BERT embeddings served by bert-serving."""

    provides = []

    requires = []

    # Connection and batching defaults; overridable from the pipeline config.
    defaults = {
        "ip": 'localhost',
        "port": '8125',
        "port_out": '5556',
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": False,
        # BUG FIX: __init__ reads component_config['timeout'] but no
        # default existed, raising KeyError whenever the pipeline config
        # omitted it. 5000 ms matches the sibling featurizers.
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    language_list = None

    def __init__(self, component_config):
        """Connect a ConcurrentBertClient using the component config."""
        super(BertFeaturizer, self).__init__(component_config)
        ip = self.component_config['ip']
        port = self.component_config['port']
        port_out = self.component_config['port_out']
        show_server_config = self.component_config['show_server_config']
        output_fmt = self.component_config['output_fmt']
        check_version = self.component_config['check_version']
        timeout = self.component_config['timeout']
        identity = self.component_config['identity']
        self.concurrent_bertClient = ConcurrentBertClient(
            ip=ip,
            port=int(port),
            port_out=int(port_out),
            show_server_config=show_server_config,
            output_fmt=output_fmt,
            check_version=check_version,
            timeout=timeout,
            identity=identity,
            check_length=False)

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["numpy", "bert_serving"]

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Optional[Text] = None,
             model_metadata: Optional["Metadata"] = None,
             cached_component: Optional["Component"] = None,
             **kwargs: Any) -> "Component":
        """Recreate the component from persisted metadata."""
        return cls(meta)

    def _get_message_text(self, messages):
        """Tokenise each message with jieba and encode the token lists.

        NOTE: np.squeeze collapses the batch axis for a single message.
        """
        # all_tokens = [message.data['tokens'] for message in messages]
        all_tokens = [list(jieba.cut(message.text)) for message in messages]
        bert_embedding = self.concurrent_bertClient.encode(all_tokens,
                                                           is_tokenized=True)
        return np.squeeze(bert_embedding)

    def train(self,
              training_data: TrainingData,
              cfg: RasaNLUModelConfig = None,
              **kwargs: Any) -> None:
        """Attach BERT embeddings to every training example, in batches."""
        batch_size = self.component_config['batch_size']
        # Ceiling division: number of batches covering all examples.
        epochs = len(training_data.intent_examples) // batch_size + \
                  int(len(training_data.intent_examples) % batch_size > 0)

        for ep in tqdm(range(epochs), desc="Epochs"):
            end_index = (ep + 1) * batch_size
            start_index = ep * batch_size
            examples = training_data.intent_examples[start_index:end_index]
            tokens = self._get_message_text(examples)
            X = np.array(tokens)

            for index, example in enumerate(examples):
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(
                        example, X[index]))

    def process(self, message: Message, **kwargs) -> None:
        """Attach the BERT embedding to a single incoming message."""
        features = self._get_message_text([message])
        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, features))
예제 #12
0
import json
import os
import time

import GPUtil
import tensorflow as tf
from bert_serving.client import ConcurrentBertClient

# BUG FIX: GPUtil.getFirstAvailable() returns a list of device ids, so
# str(...) produced "[0]" — an invalid CUDA_VISIBLE_DEVICES value. Index
# the first id, as the sibling script in this file does.
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUtil.getFirstAvailable()[0])

train_fp = ['/data/cips/data/larry-autoencoder/cail_0518/data_train.json']
batch_size = 256
num_parallel_calls = 4
num_concurrent_clients = 10  # should be greater than `num_parallel_calls`

# to support `num_parallel_calls` in tf.data, you need ConcurrentBertClient
bc = ConcurrentBertClient()


def get_encodes(x):
    """Map a batch of JSON lines to (BERT features, dummy labels).

    Each line is a JSON object; only the last 50 characters of its
    'fact' field are encoded. Labels are all-zero placeholders.
    """
    records = [json.loads(line) for line in x]
    texts = [record['fact'][-50:] for record in records]
    features = bc.encode(texts)
    return features, [0] * len(texts)


data_node = (tf.compat.v1.data.make_one_shot_iterator(
    tf.data.TextLineDataset(train_fp).batch(batch_size).map(
        lambda x: tf.compat.v1.py_func(
            get_encodes, [x], [tf.float32, tf.int64], name='bert_client'),
예제 #13
0
 def __init__(self):
     # Single shared BERT client; allows up to 128 concurrent requests.
     self.bc = ConcurrentBertClient(max_concurrency=128)
예제 #14
0
class BertBase(ContribFeaturizer):
    """Abstract base featurizer backed by a bert-serving server.

    Subclasses must set `provides`/`name` and implement `_set_feature` to
    decide how the per-example embedding is stored on the message.
    """

    # Notice: need be implemented in subclass
    provides = []

    # Notice: need be implemented in subclass
    name = ""

    # Connection and batching defaults; overridable from the pipeline config.
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        return ["bert_serving"]

    def __init__(self, component_config=None):
        """Connect a ConcurrentBertClient using the component config."""
        super(BertBase, self).__init__(component_config)
        from bert_serving.client import ConcurrentBertClient

        self.bert_client = ConcurrentBertClient(
            ip=self.component_config['ip'],
            port=int(self.component_config['port']),
            port_out=int(self.component_config['port_out']),
            # BUG FIX: was wired to component_config['port_out'] — a
            # copy-paste error that passed a port number as a boolean flag;
            # the 'show_server_config' default above was never used.
            show_server_config=self.component_config['show_server_config'],
            output_fmt=self.component_config['output_fmt'],
            check_version=self.component_config['check_version'],
            timeout=int(self.component_config['timeout']),
            identity=self.component_config['identity'])

    def _query_embedding_vector(self, message_list):
        """Encode the `.text` of each message; returns one row per message."""
        text_list = [i.text for i in message_list]

        embedding_vector_list = self.bert_client.encode(text_list,
                                                        is_tokenized=False)

        return embedding_vector_list

    def train(self, training_data, cfg=None, **kwargs):
        """Featurize all training examples in config-sized batches."""
        batch_iterator = BatchingIterator(self.component_config['batch_size'])

        for batch_examples in batch_iterator(training_data.training_examples):
            self._do_process(batch_examples)

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Featurize a single incoming message."""
        batch_example = [message]

        self._do_process(batch_example)

    def _do_process(self, batch_example):
        """Encode a batch and hand each row to the subclass' _set_feature."""
        batch_feature = self._query_embedding_vector(batch_example)

        assert len(batch_example) == batch_feature.shape[
            0], "batch_example and first dim of batch_feature must have same size"

        for i, example in enumerate(batch_example):
            feature = batch_feature[i]

            self._set_feature(example, feature)

    def _set_feature(self, example, feature):
        # Subclass hook: store `feature` on `example`.
        raise NotImplementedError
예제 #15
0
from bert_serving.client import ConcurrentBertClient
import numpy as np
import time

# Simple throughput probe: encode the same short phrase 899 times and
# report the total wall-clock time.
bc = ConcurrentBertClient(ip='127.0.0.1', port=5555, port_out=5556)

num = 1
start = time.time()
lst = []

while num < 900:
    bert_embedding = bc.encode(['黄金手'], is_tokenized=False)

    # str1 = np.squeeze(bert_embedding)
    lst.append(bert_embedding)
    num = num + 1
end = time.time()
# "total elapsed %.3f s"
strMsg = "总共花费 %.3f s" % (end - start)
print(strMsg)
print(len(lst))
예제 #16
0
# CLI flags for the bert-serving connection (parser defined above this view).
parser.add_argument(
    "--bert_port",
    type=int,
    default=5555,
    help="Port for pushing data from bert client to server",
)
parser.add_argument(
    "--bert_port_out",
    type=int,
    default=5556,
    help="Port for publishing results from bert server to client",
)
args = parser.parse_args()
print("Args: ", args)

bc = ConcurrentBertClient(port=args.bert_port, port_out=args.bert_port_out)

logging.info("BertClient initialized")

# Input artifacts must already exist under --data_dir.
labels_fp = os.path.join(args.data_dir, "labels.txt")
assert os.path.isfile(labels_fp), f"No label file found at {labels_fp}"

train_fp = os.path.join(args.data_dir, "train.csv")
assert os.path.isfile(train_fp), f"No train file found at {train_fp}"

writer_fp = os.path.join(args.data_dir, "train.tfrecord")
# remove file if exists
try:
    os.remove(writer_fp)
except OSError:
    pass
예제 #17
0
# CLI flags for the bert-serving connection (parser defined above this view).
parser.add_argument(
    "--bert_port",
    type=int,
    default=5555,
    help="Port for pushing data from bert client to server",
)
parser.add_argument(
    "--bert_port_out",
    type=int,
    default=5556,
    help="Port for publishing results from bert server to client",
)
args = parser.parse_args()
print("Args: ", args)

bc = ConcurrentBertClient(port=args.bert_port, port_out=args.bert_port_out)

logging.info("BertClient initialized")

# Model hyper-parameters and pre-built TFRecord splits must already exist.
json_path = os.path.join(args.model_dir, "params.json")
assert os.path.isfile(json_path), f"No configuration file found at {json_path}"

train_fp = os.path.join(args.data_dir, "train.tfrecord")
assert os.path.isfile(train_fp), f"No train file found at {train_fp}"
eval_fp = os.path.join(args.data_dir, "eval.tfrecord")
assert os.path.isfile(eval_fp), f"No validation file found at {eval_fp}"

params = params.Params(json_path)

model = model.StylometerModel(params)
예제 #18
0
import tensorflow as tf
from bert_serving.client import ConcurrentBertClient
from matplotlib import pyplot as plt
from matplotlib.pyplot import xticks

from plugin.quantizer.base_quantizer import PiecewiseQuantizer

# Pin execution to GPU 0.
os.environ['CUDA_VISIBLE_DEVICES'] = str(0)

# CAIL-2018 train/validation JSON-lines files.
train_fp = ['/data/cips/data/larry-autoencoder/cail_0518/data_train.json']
dev_fp = ['/data/cips/data/larry-autoencoder/cail_0518/data_valid.json']
num_parallel_calls = 4
num_bits = 4

# Concurrent client so tf.data can issue parallel encode calls.
bc = ConcurrentBertClient(port=5500, port_out=5501)


def get_encodes(x, shuffle=False):
    """Encode a batch of JSON lines via the module-level BERT client.

    Each line's 'fact' field is reduced to a 40-character window; with
    shuffle=True the window start is drawn uniformly at random, otherwise
    the window begins at position 0.
    """
    records = [json.loads(line) for line in x]
    windows = []
    for record in records:
        fact = record['fact']
        start = random.randint(0, len(fact) - 1) if shuffle else 0
        windows.append(fact[start: (start + 40)])
    return bc.encode(windows)


def get_ds(fp, batch_size=1024, shuffle=False, only_head=False):
예제 #19
0
class BertTextFeaturizer(Featurizer):
    """Rasa featurizer that sets "text_features" from BERT embeddings
    obtained via a bert-serving ConcurrentBertClient."""

    provides = ["text_features"]

    # Connection and batching defaults; overridable from the pipeline config.
    defaults = {
        "ip": 'localhost',
        "port": 5555,
        "port_out": 5556,
        "show_server_config": False,
        "output_fmt": 'ndarray',
        "check_version": True,
        "timeout": 5000,
        "identity": None,
        "batch_size": 128
    }

    @classmethod
    def required_packages(cls):
        return ["bert_serving"]

    def __init__(self, component_config=None):
        """Connect a ConcurrentBertClient using the component config."""
        super(BertTextFeaturizer, self).__init__(component_config)
        from bert_serving.client import ConcurrentBertClient

        self.bert_client = ConcurrentBertClient(
            ip=self.component_config['ip'],
            port=int(self.component_config['port']),
            port_out=int(self.component_config['port_out']),
            # BUG FIX: was wired to component_config['port_out'] — a
            # copy-paste error that passed a port number as a boolean flag;
            # the 'show_server_config' default above was never used.
            show_server_config=self.component_config['show_server_config'],
            output_fmt=self.component_config['output_fmt'],
            check_version=self.component_config['check_version'],
            timeout=int(self.component_config['timeout']),
            identity=self.component_config['identity'])

    def _query_embedding_vector(self, message_list):
        """Encode the `.text` of each message.

        NOTE: np.squeeze collapses the batch axis for a single message,
        yielding a 1-D vector in that case.
        """
        text_list = [i.text for i in message_list]

        embedding_vector_list = self.bert_client.encode(text_list,
                                                        is_tokenized=False)

        return np.squeeze(embedding_vector_list)

    def train(self, training_data, cfg=None, **kwargs):
        """Attach BERT embeddings to all training examples, in batches."""
        batch_iterator = BatchingIterator(self.component_config['batch_size'])

        for batch_examples in batch_iterator(training_data):
            embedding_vector_list = self._query_embedding_vector(
                batch_examples)

            for i, example in enumerate(batch_examples):
                example.set(
                    "text_features",
                    self._combine_with_existing_text_features(
                        example, embedding_vector_list[i]))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Attach the BERT embedding to a single incoming message."""
        embedding_vector = self._query_embedding_vector([message])

        text_features = self._combine_with_existing_text_features(
            message, embedding_vector)

        message.set("text_features", text_features)
예제 #20
0
from tensorflow.python.estimator.canned.dnn import DNNClassifier
from tensorflow.python.estimator.run_config import RunConfig
# from tensorflow.python.estimator.training import TrainSpec, EvalSpec, train_and_evaluate

# os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUtil.getFirstAvailable()[0])
tf.logging.set_verbosity(tf.logging.INFO)
logger = logging.getLogger(__name__)

# BUG FIX: `tf.test.is_gpu_available` is a function; testing the bare
# attribute was always truthy, and the two log messages were attached to
# the wrong branches. Call it and log the matching message.
if tf.test.is_gpu_available():
    logger.info('GPU available.')
else:
    logger.info('No GPU available. Using CPU for execution')

logger.info('Initialize bert client for LINE TYPE')
bc = ConcurrentBertClient(ip=os.getenv('BERT_SVC_URL', 'bertservice'),
                          port=5555,
                          port_out=5556)
logger.info('Initialize bert client for LINE TYPE completed')


def get_encodes1(x):
    """Encode a batch of JSON lines and stash the raw texts in global `result`.

    NOTE(review): `features` and `labels` are computed but never returned —
    this function appears truncated; confirm against the original source.
    """
    global result
    # x is `batch_size` of lines, each of which is a json object
    samples = [json.loads(l) for l in x]
    text = [s['text'] for s in samples]
    features = bc.encode(text)
    labels = [[str(s['linelabel'])] for s in samples]
    result = text
    # logger.debug(type(result))
    # logger.debug("features--encoded {}".format(result))
    # logger.debug("corresponding--labels {}".format(labels))
예제 #21
0
from bert_serving.client import ConcurrentBertClient
from tensorflow.python.estimator.canned.dnn import DNNClassifier
from tensorflow.python.estimator.run_config import RunConfig
from tensorflow.python.estimator.training import TrainSpec, EvalSpec, train_and_evaluate

# Pin TensorFlow to the first free GPU reported by GPUtil.
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUtil.getFirstAvailable()[0])
tf.logging.set_verbosity(tf.logging.INFO)

# Resume-line classification data, JSON-lines format.
train_fp = ['data_train.json']
eval_fp = ['data_test.json']

batch_size = 128
num_parallel_calls = 4
num_concurrent_clients = num_parallel_calls * 2  # should be at least greater than `num_parallel_calls`

bc = ConcurrentBertClient(port=5555, port_out=5556)

# Label vocabulary for resume line types.
line_labels = ['experience', 'education', 'project', 'skills', 'awards', 'personal', 'discard']


def get_encodes1(x):
    """Turn a batch of JSON lines into (BERT features, label lists).

    Each line is a JSON object with a 'text' field to encode and a
    'linelabel' field used as a single-element string label list.
    """
    records = [json.loads(line) for line in x]
    sentences = [record['text'] for record in records]
    encoded = bc.encode(sentences)
    labels_est = [[str(record['linelabel'])] for record in records]
    return encoded, labels_est

예제 #22
0
    def create_flask_app(self):
        """Build a Flask app that proxies HTTP requests to the BERT server.

        Routes: GET /status/server, GET /status/client, GET /ping (health
        check for SageMaker), POST /encode and POST /invocations (SageMaker
        alias of /encode). Raises ImportError when the optional HTTP
        dependencies are missing.
        """
        try:
            # Imported lazily so the HTTP extras remain optional.
            from flask import Flask, request, Response

            from flask_compress import Compress
            from flask_cors import CORS
            from flask_json import FlaskJSON, as_json, JsonError
            from bert_serving.client import ConcurrentBertClient
        except ImportError:
            raise ImportError(
                'BertClient or Flask or its dependencies are not fully installed, '
                'they are required for serving HTTP requests.'
                'Please use "pip install -U bert-serving-server[http]" to install it.'
            )

        # support up to 10 concurrent HTTP requests
        bc = ConcurrentBertClient(max_concurrency=self.args.http_max_connect,
                                  port=self.args.port,
                                  port_out=self.args.port_out,
                                  output_fmt='list',
                                  ignore_all_checks=True)
        app = Flask(__name__)
        logger = set_logger(colored('PROXY', 'red'))

        @app.route('/status/server', methods=['GET'])
        @as_json
        def get_server_status():
            # Status reported by the BERT server itself.
            return bc.server_status

        @app.route('/status/client', methods=['GET'])
        @as_json
        def get_client_status():
            # Client-side connection statistics.
            return bc.status

        @app.route('/encode', methods=['POST'])
        @as_json
        def encode_query():
            # Accept either form-encoded or JSON request bodies.
            data = request.form if request.form else request.json
            try:
                logger.info('new request from %s' % request.remote_addr)
                return {
                    'id':
                    data['id'],
                    'result':
                    bc.encode(data['texts'],
                              is_tokenized=bool(data['is_tokenized'])
                              if 'is_tokenized' in data else False)
                }

            except Exception as e:
                # Surface the failure to the HTTP caller as structured JSON.
                logger.error('error when handling HTTP request', exc_info=True)
                raise JsonError(description=str(e), type=str(type(e).__name__))

        @app.route('/invocations', methods=['POST'])
        @as_json
        def invocations():
            """
                a copy from encode_query to serve sagemarker
            :return:
            """
            data = request.form if request.form else request.json
            try:
                logger.info('new request from %s' % request.remote_addr)
                return {
                    'id':
                    data['id'],
                    'result':
                    bc.encode(data['texts'],
                              is_tokenized=bool(data['is_tokenized'])
                              if 'is_tokenized' in data else False)
                }

            except Exception as e:
                # Surface the failure to the HTTP caller as structured JSON.
                logger.error('error when handling HTTP request', exc_info=True)
                raise JsonError(description=str(e), type=str(type(e).__name__))

        @app.route('/ping', methods=['GET'])
        def ping():
            """Determine if the container is working and healthy. In this sample container, we declare
            it healthy if we can load the model successfully."""

            health = bc is not None  # You can insert a health check here

            status = 200 if health else 404
            return Response(response='\n',
                            status=status,
                            mimetype='application/json')

        CORS(app, origins=self.args.cors)
        FlaskJSON(app)
        Compress().init_app(app)
        return app