Example 1
 def __init__(self, maxsize=32):
     self.input_queue = Queue(maxsize=maxsize)
     self.text2vec = BertEncode(graph_path=None)
     # Daemon worker thread that batches queued encode requests
     t = threading.Thread(target=self.run)
     t.daemon = True  # setDaemon() is deprecated in Python 3
     t.start()
     logger.info("\033[1;32mbert initialize ok\033[0m")
Example 2
 def __init__(self, batch=10000, max_boxes_num=10):
     self.max_boxes_num = max_boxes_num
     self.batch = batch
     self.text2vec = BertEncode(graph_path=None)
     self.labels_dict = pd.read_csv(LABELS_NAMES_PATH, sep="\t").values[:, 1]
     self.labels_vector_dict = self.text2vec.encode(self.labels_dict)
     self.train_datas = pd.read_csv(TRAIN_PATH,
                                    sep="\t",
                                    chunksize=self.batch)
Example 3
class Greeter(bert_server_pb2_grpc.BertServetServicer):
    def __init__(self):
        super().__init__()
        self.text2vec = BertEncode(graph_path=None)
        logger.info("\033[1;32mbert initialize ok\033[0m")

    def get_vectors(self, request, context):
        vectors = self.text2vec.encode(request.sentences)
        vectors = [bert_server_pb2.Vector(vector=vector) for vector in vectors]
        vectors = bert_server_pb2.Vectors(vectors=vectors)
        return vectors

    def get_vector(self, request, context):
        vector = self.text2vec.encode(request.sentence)[0]
        vector = bert_server_pb2.Vector(vector=vector)
        return vector
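
A minimal serving sketch for this servicer; the registration helper name follows the standard gRPC codegen convention (add_<Service>Servicer_to_server) and the port is an arbitrary choice, so treat both as assumptions:

import grpc
from concurrent import futures

def serve():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    # Generated helper name assumed from the service name above
    bert_server_pb2_grpc.add_BertServetServicer_to_server(Greeter(), server)
    server.add_insecure_port("[::]:50051")
    server.start()
    server.wait_for_termination()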
Example 4
class Greeter(bert_server_queue_pb2_grpc.BertServetServicer):
    def __init__(self):
        super().__init__()
        self.text2vec = BertEncode(graph_path=None)
        logger.info("\033[1;32mbert initialize ok\033[0m")

    def get_vectors(self, request, context):
        def reply(inputs, tokens):
            # Group input indices by client token so each caller gets back
            # only the vectors for its own sentences
            info = pd.DataFrame(np.array([tokens, inputs]).T,
                                columns=["tokens", "inputs"]).groupby("tokens").indices
            vectors = self.text2vec.encode(inputs)
            vectors = [
                bert_server_queue_pb2.Vector(vector=vector)
                for vector in vectors
            ]
            reply_vectors = [
                bert_server_queue_pb2.Vectors(token=k,
                                              vectors=[vectors[i] for i in v])
                for k, v in info.items()
            ]
            return bert_server_queue_pb2.ReplyVectors(reply=reply_vectors)

        inputs = []
        tokens = []
        start = time.time()
        for texts in request:
            inputs.extend(texts.sentences)
            tokens.extend([texts.token] * len(texts.sentences))
            # Encode once the batch is big enough or has waited too long
            if len(inputs) > 32 or time.time() - start > 3:
                yield reply(inputs, tokens)
                inputs = []
                tokens = []
                start = time.time()
        # Flush sentences left over when the request stream ends
        if inputs:
            yield reply(inputs, tokens)

    def get_vector(self, request, context):
        vector = self.text2vec.encode(request.sentence)[0]
        vector = bert_server_queue_pb2.Vector(vector=vector)
        return vector
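
A client-side sketch for the streaming get_vectors RPC above. The stub class name follows gRPC codegen conventions, and the request message type (called Sentences here, with the token and sentences fields the servicer reads) is an assumption:

import grpc

def stream_requests(token, batches):
    for sentences in batches:
        # Hypothetical message type; field names match what the servicer reads
        yield bert_server_queue_pb2.Sentences(token=token, sentences=sentences)

channel = grpc.insecure_channel("localhost:50051")
stub = bert_server_queue_pb2_grpc.BertServetStub(channel)
for reply in stub.get_vectors(stream_requests("client-1", [["hello world"]])):
    for vectors in reply.reply:
        print(vectors.token, len(vectors.vectors))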
Example 5
class Dataset(object):
    def __init__(self, batch=10000, max_boxes_num=10):
        self.max_boxes_num = max_boxes_num
        self.batch = batch
        self.text2vec = BertEncode(graph_path=None)
        self.labels_dict = pd.read_csv(LABELS_NAMES_PATH, sep="\t").values[:, 1]
        self.labels_vector_dict = self.text2vec.encode(self.labels_dict)
        self.train_datas = pd.read_csv(TRAIN_PATH,
                                       sep="\t",
                                       chunksize=self.batch)

    def deal_data(self, data, index):
        product_id = data.product_id.values
        image_h = data.image_h.values
        image_w = data.image_w.values
        num_boxes = data.num_boxes.values
        boxes = [
            np.frombuffer(base64.b64decode(boxe),
                          dtype=np.float32).reshape(num_boxe, 4)
            for num_boxe, boxe in zip(num_boxes, data.boxes)
        ]
        features = [
            np.frombuffer(base64.b64decode(feature),
                          dtype=np.float32).reshape(num_boxe, 2048)
            for num_boxe, feature in zip(num_boxes, data.features)
        ]
        class_labels = [
            np.frombuffer(base64.b64decode(class_label),
                          dtype=np.int64).reshape(num_boxe)
            for num_boxe, class_label in zip(num_boxes, data.class_labels)
        ]
        class_labels_names = [
            self.labels_dict[index] for index in class_labels
        ]
        class_labels_vector = [
            self.labels_vector_dict[index] for index in class_labels
        ]
        query = data["query"].values
        if len(query) < 512:
            query_vector = self.text2vec.encode(query)
        else:
            query_vector = [self.text2vec.encode(q)[0] for q in tqdm(query)]
        query_id = data.query_id.values
        return {
            "product_id": product_id,
            "image_h": image_h,
            "image_w": image_w,
            "num_boxes": num_boxes,
            "boxes": boxes,
            "features": features,
            "class_labels": class_labels,
            "class_labels_names": class_labels_names,
            "class_labels_vector": class_labels_vector,
            "query": query,
            "query_vector": query_vector,
            "query_id": query_id
        }

    def train_next(self, index=0):
        while True:
            try:
                data = next(self.train_datas)
                data = self.deal_data(data, index)
                query_vector = data["query_vector"]

                boxe_vector = self.build_boxe_vector(data["image_h"],
                                                     data["image_w"],
                                                     data["boxes"])
                boxes_mask = self.build_mask(data["num_boxes"])
                boxe_vector = self.expand_zeros(boxe_vector)
                feature_vector = self.expand_zeros(data["features"])
                label_vector = self.expand_zeros(data["class_labels_vector"])
                triplet_labels = self.build_triplet_labels(
                    data["query_id"], data["query"])
                return {
                    "query_vector": query_vector,
                    "boxes_mask": boxes_mask,
                    "boxe_vector": boxe_vector,
                    "feature_vector": feature_vector,
                    "label_vector": label_vector,
                    "triplet_labels": triplet_labels
                }
            except StopIteration as e:
                print(e)
                self.train_datas = pd.read_csv(TRAIN_PATH,
                                               sep="\t",
                                               chunksize=self.batch)
            except Exception as e:
                print(e)

    def build_boxe_vector(self, image_h, image_w, boxes):
        areas = image_h * image_w
        areas_ratio = [
            np.array([[(x2 - x1) / 2 / w, (y2 - y1) / 2 / h,
                       (x2 - x1) * (y2 - y1) / area]
                      for y1, x1, y2, x2 in boxe])
            for boxe, area, h, w in zip(boxes, areas, image_h, image_w)
        ]
        return areas_ratio

    def build_triplet_labels(self, query_id, query):
        unrepeat_query_id, unrepeat_query = set(query_id), set(query)
        unrepeat_query_id = list(unrepeat_query_id)
        triplet_labels = np.array(
            [unrepeat_query_id.index(id) for id in query_id])
        triplet_labels = np.concatenate((triplet_labels, triplet_labels))
        # assert len(unrepeat_query_id)==len(unrepeat_query), 'query_id != query'
        # triplet_labels = np.concatenate((np.arange(0, len(query_id)), np.arange(0, len(query_id))))
        return triplet_labels

    def expand_zeros(self, params):
        res = np.array([
            np.pad(param, ((0, self.max_boxes_num - len(param)), (0, 0)),
                   mode='constant')
            if len(param) < self.max_boxes_num else param[:self.max_boxes_num]
            for param in params
        ])
        return res

    def build_mask(self, param):
        masks = np.zeros((len(param), self.max_boxes_num))
        for i, p in enumerate(param):
            masks[i, :p] = 1
        return masks
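
A usage sketch for the class above, assuming LABELS_NAMES_PATH and TRAIN_PATH point at the expected tab-separated files:

dataset = Dataset(batch=256, max_boxes_num=10)
batch = dataset.train_next()
# Per-image tensors are padded or truncated to max_boxes_num rows
print(batch["feature_vector"].shape)  # (rows_in_chunk, 10, 2048)
print(batch["boxes_mask"].shape)      # (rows_in_chunk, 10)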
Example 6
 def __init__(self):
     super().__init__()
     self.text2vec = BertEncode(graph_path=None)
     logger.info("\033[1;32mbert initialize ok\033[0m")
Example 7
    for qid in reference.keys():
        ground_truth_ids = set([str(pid) for pid in reference[qid]])
        ref_vec = [1.0] * len(ground_truth_ids)
        pred_vec = [
            1.0 if pid in ground_truth_ids else 0.0 for pid in predictions[qid]
        ]
        score_sum += get_ndcg(pred_vec, ref_vec, k)
    # the higher the score, the better
    score = score_sum / len(reference)
    return score
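
The evaluation above relies on a get_ndcg helper that is not shown; a minimal sketch of binary-relevance NDCG@k consistent with how it is called (pred_vec in predicted rank order, ref_vec with one entry per relevant item):

import math

def get_ndcg(pred_vec, ref_vec, k):
    # Discounted cumulative gain over the top-k predictions
    dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(pred_vec[:k]))
    # Ideal DCG: all relevant items ranked first
    idcg = sum(1.0 / math.log2(i + 2) for i in range(min(len(ref_vec), k)))
    return dcg / idcg if idcg > 0 else 0.0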


if __name__ == '__main__':
    valid_answer = json.load(open(VALID_ANSWER_PATH, "r"))
    batch = 256
    text2vec = BertEncode(graph_path=None)
    dataset = Dataset(batch=batch)
    model = Model()
    saver = tf.train.Saver(max_to_keep=50)
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint(USER_MODEL_DATA_PATH))

        valid_datas = pd.read_csv(VALID_PATH, sep="\t")
        for name, group in valid_datas.groupby("query_id"):
            distances = {}
            for index, tup in enumerate(group.itertuples()):
                product_id = tup.product_id
                image_h = tup.image_h
                image_w = tup.image_w
                num_boxes = tup.num_boxes
                boxes = np.frombuffer(base64.b64decode(tup.boxes),
                                      dtype=np.float32).reshape(num_boxes, 4)
Example 8
import json
import os
import platform
import sys

from flask import Flask

sys.path.append("../../")
from common.config import logger

system = platform.system()
if system == "Linux":
    # Automatically pick the GPU with the most free memory
    os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp')
    memory_gpu = [int(x.split()[2]) for x in open('tmp', 'r').readlines()]
    gpu_id = memory_gpu.index(max(memory_gpu))
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
    logger.info(f"\033[1;32musing GPU {gpu_id}\033[0m")

app = Flask(__name__)
text2vec = BertEncode(graph_path=None)


def flask_content_type(requests):
    """Parse the request payload according to its content type."""
    if requests.method == 'POST':
        if requests.content_type == 'application/x-www-form-urlencoded':
            data = requests.form
        elif requests.content_type == 'application/json':
            data = requests.json
        else:  # fall back to parsing the raw body as JSON
            data = json.loads(requests.data)
        return data
    elif requests.method == 'GET':
        return requests.args
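
A route sketch showing how flask_content_type and the module-level text2vec might be wired together; the /encode endpoint and the sentences field are illustrative assumptions, not from the original:

from flask import jsonify, request

@app.route("/encode", methods=["GET", "POST"])
def encode():
    data = flask_content_type(request)
    sentences = data["sentences"]  # hypothetical field name
    vectors = text2vec.encode(sentences)
    # Convert numpy rows to plain lists so they serialize as JSON
    return jsonify({"vectors": [list(map(float, v)) for v in vectors]})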
Example 9
class BertQueue(object):
    # Finished batches shared by all instances, keyed by request token
    output_queue = {}

    def __init__(self, maxsize=32):
        self.input_queue = Queue(maxsize=maxsize)
        self.text2vec = BertEncode(graph_path=None)
        # Daemon worker thread that drains the queue and encodes in batches
        t = threading.Thread(target=self.run)
        t.daemon = True  # setDaemon() is deprecated in Python 3
        t.start()
        logger.info("\033[1;32mbert initialize ok\033[0m")

    def put(self, sentences):
        token = uuid.uuid1().hex
        self.input_queue.put(Respect(token=token, values=sentences))
        return token

    def get(self, token):
        # Poll until the worker thread publishes this token's result
        while True:
            if token in self.output_queue:
                return self.output_queue.pop(token)
            time.sleep(0.005)

    def run(self):
        inputs = []
        tokens = []
        start = time.time()
        while True:
            try:
                respect = self.input_queue.get(block=True, timeout=0.001)
                inputs.extend(respect.values)
                tokens.extend([respect.token] * len(respect.values))
            except Empty:
                continue  # nothing queued; the finally block still checks for a flush
            except Exception as e:
                logger.error(str(e))
                continue
            finally:
                interval = time.time() - start
                # Flush when the batch is full or has waited long enough
                if len(inputs) > 64 or (interval > 0.1 and len(inputs) > 0):
                    logger.info(f"batch size: {len(inputs)}, time: {interval}")
                    if len(inputs) > 512:
                        # Encode oversized batches in chunks of 64 sentences
                        vectors = []
                        n = len(inputs) // 64 + 1
                        for i in range(n):
                            sentences = inputs[i * 64:(i + 1) * 64]
                            if len(sentences) == 0:
                                continue
                            vectors.append(self.text2vec.encode(sentences))
                        vectors = np.concatenate(vectors, axis=0)
                    else:
                        vectors = self.text2vec.encode(inputs)
                    # Map each caller's token to the indices of its sentences
                    info = pd.DataFrame(np.array([tokens, inputs]).T,
                                        columns=["tokens", "inputs"]).groupby("tokens").indices
                    self.output_queue.update(
                        {k: [vectors[i] for i in v] for k, v in info.items()})
                    inputs = []
                    tokens = []
                    start = time.time()
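
A usage sketch: concurrent request handlers share one BertQueue, and the single worker thread batches their sentences into one encode call (the handler function is illustrative):

bert_queue = BertQueue(maxsize=32)

def handle_request(sentences):
    # Blocks only this caller; results are routed back by token
    token = bert_queue.put(sentences)
    return bert_queue.get(token)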