class Greeter(bert_server_pb2_grpc.BertServetServicer):
    def __init__(self):
        super().__init__()
        self.text2vec = BertEncode(graph_path=None)
        logger.info("\033[1;32mbert initialize ok\033[0m")

    def get_vectors(self, request, context):
        vectors = self.text2vec.encode(request.sentences)
        vectors = [bert_server_pb2.Vector(vector=vector) for vector in vectors]
        return bert_server_pb2.Vectors(vectors=vectors)

    def get_vector(self, request, context):
        vector = self.text2vec.encode(request.sentence)[0]
        return bert_server_pb2.Vector(vector=vector)
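# --- hypothetical client sketch (not part of the source) --------------------
# Minimal unary client for the Greeter service above. The stub name
# BertServetStub follows gRPC codegen from the BertServetServicer base class;
# the request message names (Sentences with a repeated `sentences` field) are
# assumptions, since only the reply messages (Vector, Vectors) appear here.
import grpc


def fetch_vectors(sentences, address="localhost:50051"):
    with grpc.insecure_channel(address) as channel:
        stub = bert_server_pb2_grpc.BertServetStub(channel)
        reply = stub.get_vectors(bert_server_pb2.Sentences(sentences=sentences))
        # Each reply.vectors[i].vector is a repeated float field holding one embedding.
        return [list(v.vector) for v in reply.vectors]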
class Greeter(bert_server_queue_pb2_grpc.BertServetServicer):
    def __init__(self):
        super().__init__()
        self.text2vec = BertEncode(graph_path=None)
        logger.info("\033[1;32mbert initialize ok\033[0m")

    def get_vectors(self, request, context):
        """Bidirectional streaming: batch sentences across incoming messages,
        encode them together, and reply with vectors grouped by client token."""

        def flush(tokens, inputs):
            # Group input indices by token so each caller gets only its own vectors.
            info = pd.DataFrame(np.array([tokens, inputs]).T,
                                columns=["tokens", "inputs"]).groupby("tokens").indices
            vectors = self.text2vec.encode(inputs)
            vectors = [
                bert_server_queue_pb2.Vector(vector=vector) for vector in vectors
            ]
            reply = [
                bert_server_queue_pb2.Vectors(token=k,
                                              vectors=[vectors[i] for i in v])
                for k, v in info.items()
            ]
            return bert_server_queue_pb2.ReplyVectors(reply=reply)

        inputs, tokens = [], []
        start = time.time()
        for texts in request:
            inputs.extend(texts.sentences)
            tokens.extend([texts.token] * len(texts.sentences))
            if len(inputs) > 32 or time.time() - start > 3:
                yield flush(tokens, inputs)
                inputs, tokens = [], []
                start = time.time()
        # Flush the tail so a final batch of 32 or fewer sentences is not
        # silently dropped when the request stream ends.
        if inputs:
            yield flush(tokens, inputs)

    def get_vector(self, request, context):
        vector = self.text2vec.encode(request.sentence)[0]
        return bert_server_queue_pb2.Vector(vector=vector)
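# --- hypothetical client sketch (not part of the source) --------------------
# Streaming client for the queueing Greeter above. The request message name
# (Texts) is assumed; its token/sentences fields are inferred from the
# servicer's use of texts.token and texts.sentences.
import uuid

import grpc


def request_stream(batches):
    # One message per caller batch, tagged with a unique token so the server
    # can route the encoded vectors back to the right caller.
    for sentences in batches:
        yield bert_server_queue_pb2.Texts(token=uuid.uuid1().hex,
                                          sentences=sentences)


def fetch_vectors_streaming(batches, address="localhost:50051"):
    with grpc.insecure_channel(address) as channel:
        stub = bert_server_queue_pb2_grpc.BertServetStub(channel)
        for reply in stub.get_vectors(request_stream(batches)):
            for vectors in reply.reply:
                yield vectors.token, [list(v.vector) for v in vectors.vectors]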
class Dataset(object):
    def __init__(self, batch=10000, max_boxes_num=10):
        self.max_boxes_num = max_boxes_num
        self.batch = batch
        self.text2vec = BertEncode(graph_path=None)
        # Column 1 of the label file holds the human-readable label names.
        self.labels_dict = pd.read_csv(LABELS_NAMES_PATH, sep="\t").values[:, 1]
        self.labels_vector_dict = self.text2vec.encode(self.labels_dict)
        self.train_datas = pd.read_csv(TRAIN_PATH, sep="\t", chunksize=self.batch)

    def deal_data(self, data, index):
        product_id = data.product_id.values
        image_h = data.image_h.values
        image_w = data.image_w.values
        num_boxes = data.num_boxes.values
        # Box coordinates, visual features and class labels are stored base64-encoded.
        boxes = [
            np.frombuffer(base64.b64decode(boxe),
                          dtype=np.float32).reshape(num_boxe, 4)
            for num_boxe, boxe in zip(num_boxes, data.boxes)
        ]
        features = [
            np.frombuffer(base64.b64decode(feature),
                          dtype=np.float32).reshape(num_boxe, 2048)
            for num_boxe, feature in zip(num_boxes, data.features)
        ]
        class_labels = [
            np.frombuffer(base64.b64decode(class_label),
                          dtype=np.int64).reshape(num_boxe)
            for num_boxe, class_label in zip(num_boxes, data.class_labels)
        ]
        class_labels_names = [self.labels_dict[labels] for labels in class_labels]
        class_labels_vector = [
            self.labels_vector_dict[labels] for labels in class_labels
        ]
        query = data["query"].values
        # Small chunks go through BERT in one call; larger ones are encoded
        # query by query to bound memory usage.
        if len(query) < 512:
            query_vector = self.text2vec.encode(query)
        else:
            query_vector = [self.text2vec.encode(q)[0] for q in tqdm(query)]
        query_id = data.query_id.values
        return {
            "product_id": product_id,
            "image_h": image_h,
            "image_w": image_w,
            "num_boxes": num_boxes,
            "boxes": boxes,
            "features": features,
            "class_labels": class_labels,
            "class_labels_names": class_labels_names,
            "class_labels_vector": class_labels_vector,
            "query": query,
            "query_vector": query_vector,
            "query_id": query_id,
        }

    def train_next(self, index=0):
        while True:
            try:
                data = next(self.train_datas)
                data = self.deal_data(data, index)
                query_vector = data["query_vector"]
                boxe_vector = self.build_boxe_vector(data["image_h"],
                                                     data["image_w"],
                                                     data["boxes"])
                boxes_mask = self.build_mask(data["num_boxes"])
                boxe_vector = self.expand_zeros(boxe_vector)
                feature_vector = self.expand_zeros(data["features"])
                label_vector = self.expand_zeros(data["class_labels_vector"])
                triplet_labels = self.build_triplet_labels(data["query_id"],
                                                           data["query"])
                return {
                    "query_vector": query_vector,
                    "boxes_mask": boxes_mask,
                    "boxe_vector": boxe_vector,
                    "feature_vector": feature_vector,
                    "label_vector": label_vector,
                    "triplet_labels": triplet_labels,
                }
            except StopIteration:
                # Chunk iterator exhausted: reopen the file and start a new epoch.
                self.train_datas = pd.read_csv(TRAIN_PATH, sep="\t",
                                               chunksize=self.batch)
            except Exception as e:
                print(e)

    def build_boxe_vector(self, image_h, image_w, boxes):
        # Per box: half-width and half-height relative to the image, plus the
        # box area as a fraction of the image area.
        areas = image_h * image_w
        areas_ratio = [
            np.array([[(x2 - x1) / 2 / w, (y2 - y1) / 2 / h,
                       (x2 - x1) * (y2 - y1) / area] for y1, x1, y2, x2 in boxe])
            for boxe, area, h, w in zip(boxes, areas, image_h, image_w)
        ]
        return areas_ratio

    def build_triplet_labels(self, query_id, query):
        # Map each query_id to a dense index; labels are duplicated because the
        # model sees two views (image and text) per pair.
        unrepeat_query_id = list(set(query_id))
        triplet_labels = np.array(
            [unrepeat_query_id.index(qid) for qid in query_id])
        triplet_labels = np.concatenate((triplet_labels, triplet_labels))
        # assert len(set(query_id)) == len(set(query)), 'query_id != query'
        return triplet_labels

    def expand_zeros(self, params):
        # Zero-pad (or truncate) every per-image array to max_boxes_num rows.
        res = np.array([
            np.pad(param, ((0, self.max_boxes_num - len(param)), (0, 0)),
                   mode="constant")
            if len(param) < self.max_boxes_num else param[:self.max_boxes_num]
            for param in params
        ])
        return res

    def build_mask(self, param):
        # 1 marks a real box, 0 marks padding.
        masks = np.zeros((len(param), self.max_boxes_num))
        for i, p in enumerate(param):
            masks[i, :p] = 1
        return masks
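# --- worked example (not part of the source) ---------------------------------
# A tiny standalone check of the padding contract: with max_boxes_num=4, a
# 2-box image is zero-padded to 4 rows while a 6-box image is truncated, and
# build_mask marks which rows are real. __init__ is bypassed so no data files
# or BERT graph are needed.
ds = Dataset.__new__(Dataset)
ds.max_boxes_num = 4
padded = ds.expand_zeros([np.ones((2, 3)), np.ones((6, 3))])
masks = ds.build_mask([2, 6])
print(padded.shape)  # (2, 4, 3)
print(masks)         # [[1. 1. 0. 0.], [1. 1. 1. 1.]]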
def evaluate(reference, predictions, k=5):
    # Header reconstructed: the source snippet begins mid-function, so the
    # function name, the k default, and the score_sum initialisation are assumed.
    score_sum = 0.0
    for qid in reference.keys():
        ground_truth_ids = set([str(pid) for pid in reference[qid]])
        ref_vec = [1.0] * len(ground_truth_ids)
        pred_vec = [
            1.0 if pid in ground_truth_ids else 0.0
            for pid in predictions[qid]
        ]
        score_sum += get_ndcg(pred_vec, ref_vec, k)  # the higher the score, the better
    return score_sum / len(reference)


if __name__ == "__main__":
    valid_answer = json.load(open(VALID_ANSWER_PATH, "r"))
    batch = 256
    text2vec = BertEncode(graph_path=None)
    dataset = Dataset(batch=batch)
    model = Model()
    saver = tf.train.Saver(max_to_keep=50)
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint(USER_MODEL_DATA_PATH))
        valid_datas = pd.read_csv(VALID_PATH, sep="\t")
        for name, group in valid_datas.groupby("query_id"):
            distances = {}
            for index, tup in enumerate(group.itertuples()):
                product_id = tup.product_id
                image_h = tup.image_h
                image_w = tup.image_w
                num_boxes = tup.num_boxes
                # Decoded the same way as in Dataset.deal_data; the source
                # snippet is truncated after this statement.
                boxes = np.frombuffer(base64.b64decode(tup.boxes),
                                      dtype=np.float32).reshape(num_boxes, 4)
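# --- hypothetical implementation (not part of the source) -------------------
# get_ndcg is called above but not defined in this excerpt; a standard NDCG@k
# over the binary relevance vectors built above might look like this.
def get_ndcg(pred_vec, ref_vec, k):
    def dcg(relevances, k):
        relevances = np.asarray(relevances, dtype=float)[:k]
        if relevances.size == 0:
            return 0.0
        # Rank positions are 1-indexed, so index i gets a log2(i + 2) discount.
        return float(np.sum(relevances / np.log2(np.arange(relevances.size) + 2)))

    ideal = dcg(ref_vec, k)  # ref_vec is all ones, so this is the ideal DCG
    return dcg(pred_vec, k) / ideal if ideal > 0 else 0.0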
import json
import os
import platform
import sys

from flask import Flask

sys.path.append("../../")
from common.config import logger

# Automatically pick the GPU with the most free memory (Linux only). Note the
# original rebound `sys = platform.system()`, which shadowed the sys module.
if platform.system() == "Linux":
    os.system("nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp")
    memory_gpu = [int(x.split()[2]) for x in open("tmp", "r").readlines()]
    gpu_id = memory_gpu.index(max(memory_gpu))
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
    logger.info(f"\033[1;32musing GPU {gpu_id}\033[0m")

app = Flask(__name__)
text2vec = BertEncode(graph_path=None)


def flask_content_type(requests):
    """Parse the request payload according to its content type."""
    if requests.method == "POST":
        if requests.content_type == "application/x-www-form-urlencoded":
            data = requests.form
        elif requests.content_type == "application/json":
            data = requests.json
        else:
            # Fall back to raw JSON for content types Flask does not parse.
            data = json.loads(requests.data)
        return data
    elif requests.method == "GET":
        return requests.args
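# --- hypothetical route (not part of the source) -----------------------------
# Example of how flask_content_type and the module-level text2vec are meant to
# be used together; the URL, payload shape, and response format are assumptions.
from flask import jsonify, request

import numpy as np


@app.route("/encode", methods=["GET", "POST"])
def encode():
    data = flask_content_type(request)
    sentences = data.get("sentences", [])
    vectors = text2vec.encode(sentences)
    # np.ndarray is not JSON-serialisable, hence the tolist() conversion.
    return jsonify({"vectors": np.asarray(vectors).tolist()})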
class BertQueue(object):
    # Class attribute shared across instances: maps request token -> vectors.
    output_queue = {}

    def __init__(self, maxsize=32):
        self.input_queue = Queue(maxsize=maxsize)
        self.text2vec = BertEncode(graph_path=None)
        t = threading.Thread(target=self.run)
        t.daemon = True  # setDaemon() is deprecated; set the attribute instead
        t.start()
        logger.info("\033[1;32mbert initialize ok\033[0m")

    def put(self, sentences):
        # Enqueue a batch of sentences; the returned token retrieves the result.
        # Respect(token, values) is a lightweight record defined elsewhere.
        token = uuid.uuid1().hex
        self.input_queue.put(Respect(token=token, values=sentences))
        return token

    def get(self, token):
        # Poll until the worker thread has published vectors for this token.
        while True:
            if token in self.output_queue:
                return self.output_queue.pop(token)
            time.sleep(0.005)

    def run(self):
        inputs = []
        tokens = []
        start = time.time()
        while True:
            try:
                respect = self.input_queue.get(block=True, timeout=0.001)
                inputs.extend(respect.values)
                tokens.extend([respect.token] * len(respect.values))
            except Empty:
                continue  # no new work, but the finally block still checks the flush condition
            except Exception as e:
                logger.error(str(e))
                continue
            finally:
                interval = time.time() - start
                # Flush when the batch is big enough or has waited long enough.
                if len(inputs) > 64 or (interval > 0.1 and len(inputs) > 0):
                    logger.info(f"batch size: {len(inputs)}, time: {interval}")
                    if len(inputs) > 512:
                        # Very large batches are encoded in chunks of 64 to bound memory.
                        vectors = []
                        n = int(len(inputs) // 64) + 1
                        for i in range(n):
                            sentences = inputs[i * 64:(i + 1) * 64]
                            if len(sentences) == 0:
                                continue
                            vectors.append(self.text2vec.encode(sentences))
                        vectors = np.concatenate(vectors, axis=0)
                    else:
                        vectors = self.text2vec.encode(inputs)
                    # Group vector indices by token so each caller gets back
                    # only its own slice of the batch.
                    info = pd.DataFrame(np.array([tokens, inputs]).T,
                                        columns=["tokens", "inputs"
                                                 ]).groupby("tokens").indices
                    self.output_queue.update(
                        {k: [vectors[i] for i in v] for k, v in info.items()})
                    inputs = []
                    tokens = []
                    start = time.time()
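# --- hypothetical usage (not part of the source) -----------------------------
# BertQueue decouples request threads from the single encoder: callers put()
# their sentences and block on get() while the daemon thread batches work
# across callers. Wired into the Flask app shown earlier (route, payload shape,
# and the import of BertQueue into that module are all assumptions):
bert_queue = BertQueue(maxsize=32)


@app.route("/encode_queued", methods=["POST"])
def encode_queued():
    data = flask_content_type(request)
    token = bert_queue.put(data.get("sentences", []))
    vectors = bert_queue.get(token)  # blocks until the worker publishes results
    return jsonify({"vectors": np.asarray(vectors).tolist()})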