Example #1
0
 def test_new_msg(self):
     """Mutate a ``gnes_pb2.Message`` step by step, printing it after each change."""
     message = gnes_pb2.Message()
     message.response.index.status = gnes_pb2.Response.SUCCESS
     print(message)

     # Attach two empty documents to the train request.
     first_batch = [gnes_pb2.Document() for _ in range(2)]
     message.request.train.docs.extend(first_batch)
     print(message)

     # Drop those documents and attach three fresh ones instead.
     message.request.train.ClearField('docs')
     second_batch = [gnes_pb2.Document() for _ in range(3)]
     message.request.train.docs.extend(second_batch)
     print(message)
Example #2
0
def img_process_for_test(dirname):
    """
    Load the test images and run them through two preprocessing pipelines.

    :param dirname: directory that contains ``imgs/test.zip``
    :return: a list with one entry per pipeline; each entry is the list of
        arrays obtained with ``blob2array`` from every chunk of every image
    """
    # Open the archive as a context manager so its file handle is released
    # even if reading a member fails; ``ZipFile.read`` also avoids leaving
    # per-member file objects open.
    with zipfile.ZipFile(os.path.join(dirname, 'imgs/test.zip')) as archive:
        all_bytes = [archive.read(name) for name in archive.namelist()]

    test_img = []
    for raw_bytes in all_bytes:
        d = gnes_pb2.Document()
        d.raw_bytes = raw_bytes
        test_img.append(d)

    # ``components`` is assigned a zero-argument callable returning the stages.
    unary_pipeline = PipelinePreprocessor()
    unary_pipeline.components = lambda: [
        UnaryPreprocessor(doc_type=gnes_pb2.Document.IMAGE),
        ResizeChunkPreprocessor()
    ]
    sliding_pipeline = PipelinePreprocessor()
    sliding_pipeline.components = lambda: [
        VanillaSlidingPreprocessor(),
        ResizeChunkPreprocessor()
    ]

    test_img_all_preprocessor = []
    for preprocessor in [unary_pipeline, sliding_pipeline]:
        # ``apply`` fills in the chunks of the document it is given, so each
        # pipeline works on its own deep copy of the untouched documents.
        test_img_copy = copy.deepcopy(test_img)
        for img in test_img_copy:
            preprocessor.apply(img)
        test_img_all_preprocessor.append([
            blob2array(chunk.blob) for img in test_img_copy
            for chunk in img.chunks
        ])
    return test_img_all_preprocessor
Example #3
0
    def query(self, keys: List[int], *args,
              **kwargs) -> List['gnes_pb2.Document']:
        """
        Look up documents stored as directories under ``self.data_path``.

        :param keys: list of doc id
        :return: list of documents; each found document carries the content of
            its ``.meta`` file in ``meta_info`` and one chunk per non-directory
            entry of its directory. Chunk payloads are NOT populated (an error
            is logged for every chunk instead). For a missing directory,
            ``self._NOT_FOUND`` is returned in its place when
            ``self.keep_na_doc`` is true; otherwise the key is skipped.
        """
        res = []
        for k in keys:
            target_dirs = os.path.join(self.data_path, str(k))

            if not os.path.exists(target_dirs):
                if self.keep_na_doc:
                    res.append(self._NOT_FOUND)
                continue

            doc = gnes_pb2.Document()
            with open(os.path.join(target_dirs, '.meta'), 'rb') as f:
                doc.meta_info = f.read()
            for raw_file in os.listdir(target_dirs):
                # ``os.listdir`` yields bare names, so the entry must be joined
                # with its directory before testing it; checking the bare name
                # would resolve it against the current working directory.
                if not os.path.isdir(os.path.join(target_dirs, raw_file)):
                    c = doc.chunks.add()
                    c.doc_id = k
                    self.logger.error(
                        "the query method has not been implemented!")
                    # TODO: load the entry's bytes into the chunk, e.g.
                    # with open(os.path.join(target_dirs, raw_file),
                    #           'rb') as raw:
                    #     c.raw = raw.read()
                    # NOTE(review): '.meta' is itself a listed entry and so
                    # also yields a chunk -- confirm whether that is intended.
            res.append(doc)
        return res
Example #4
0
 def test_preprocessor_service_echo(self):
     """Send index and train documents to a PreprocessorService and print each reply."""
     service_args = set_preprocessor_service_parser().parse_args([])
     # The client's ports are the service's ports swapped (in <-> out).
     client_argv = [
         '--port_in', str(service_args.port_out),
         '--port_out', str(service_args.port_in),
     ]
     client_args = _set_client_parser().parse_args(client_argv)

     with PreprocessorService(service_args), ZmqClient(client_args) as client:
         outgoing = gnes_pb2.Message()

         index_docs = [gnes_pb2.Document() for _ in range(5)]
         outgoing.request.index.docs.extend(index_docs)
         client.send_message(outgoing)
         print(client.recv_message())

         # The same message object is reused for the train request.
         train_docs = [gnes_pb2.Document() for _ in range(5)]
         outgoing.request.train.docs.extend(train_docs)
         client.send_message(outgoing)
         print(client.recv_message())
Example #5
0
    def test_webp_encoder(self):
        """Check the encoder yields equal bytes for a raw-video doc and a chunked doc."""
        frames_blob = array2blob(self.video_frames)

        # Case 1: the frames live in the document's ``raw_video`` field.
        video_doc = gnes_pb2.Document()
        video_doc.doc_type = gnes_pb2.Document.VIDEO
        video_doc.raw_video.CopyFrom(frames_blob)
        self.webp_encoder.apply(video_doc)
        encoded_from_video = copy.deepcopy(video_doc)

        # Case 2: the same frames live in the blob of a single chunk.
        chunked_doc = gnes_pb2.Document()
        chunked_doc.doc_type = gnes_pb2.Document.VIDEO
        chunked_doc.chunks.add().blob.CopyFrom(frames_blob)
        self.webp_encoder.apply(chunked_doc)
        encoded_from_chunk = copy.deepcopy(chunked_doc)

        self.assertEqual(encoded_from_video.raw_bytes,
                         encoded_from_chunk.chunks[0].raw)
Example #6
0
    def test_videoshot_indexer(self):
        """Add a single-chunk video document to the indexer under key 0."""
        frames_blob = array2blob(self.video_frames)

        video_doc = gnes_pb2.Document()
        video_doc.doc_type = gnes_pb2.Document.VIDEO
        # The document's only chunk carries the video frames as its blob.
        video_doc.chunks.add().blob.CopyFrom(frames_blob)

        self.indexer.add([0], [video_doc])
Example #7
0
    def init_db(self):
        """Create the directory indexer and add one preprocessed document to it."""
        self.db = DirectoryIndexer(self.data_path)

        # Build the document from the first video and keep it on the instance.
        doc = self.d = gnes_pb2.Document()
        doc.doc_id = 0
        doc.raw_bytes = self.video_bytes[0]

        pipeline = BasePreprocessor.load_yaml(self.pipeline_yml_path)
        pipeline.apply(doc)

        # One key per available video, but only the single document above.
        keys = list(range(len(self.video_bytes)))
        self.db.add(keys, [doc])
Example #8
0
    def test_dump_load(self):
        """Check an encoder reloaded from a dump yields the same bytes as the original."""
        frames_blob = array2blob(self.video_frames)

        # Encode with the original encoder, frames held in ``raw_video``.
        video_doc = gnes_pb2.Document()
        video_doc.doc_type = gnes_pb2.Document.VIDEO
        video_doc.raw_video.CopyFrom(frames_blob)
        self.mp4_encoder.apply(video_doc)
        encoded_before_dump = copy.deepcopy(video_doc)

        # Round-trip the encoder through its dump file.
        self.mp4_encoder.dump(self.dump_path)
        restored_encoder = BaseVideoPreprocessor.load(self.dump_path)

        # Encode with the restored encoder, frames held in a chunk blob.
        chunked_doc = gnes_pb2.Document()
        chunked_doc.doc_type = gnes_pb2.Document.VIDEO
        chunked_doc.chunks.add().blob.CopyFrom(frames_blob)
        restored_encoder.apply(chunked_doc)
        encoded_after_load = copy.deepcopy(chunked_doc)

        self.assertEqual(encoded_before_dump.raw_bytes,
                         encoded_after_load.chunks[0].raw)
Example #9
0
    def test_gif_pipelinepreproces(self):
        """Check a standalone segmentor and a YAML pipeline produce equally many chunks."""
        segmentor_doc = gnes_pb2.Document()
        segmentor_doc.raw_bytes = self.video_bytes[0]
        # Independent copy of the same input for the pipeline run.
        pipeline_doc = copy.deepcopy(segmentor_doc)

        segmentor = FFmpegVideoSegmentor.load_yaml(self.ffmpeg_yaml_path)
        segmentor.apply(segmentor_doc)

        pipeline = BasePreprocessor.load_yaml(self.pipeline_path)
        pipeline.apply(pipeline_doc)

        self.assertEqual(len(segmentor_doc.chunks), len(pipeline_doc.chunks))
Example #10
0
    def setUp(self) -> None:
        """Create ``self.doc`` with three chunks whose blobs have 2, 3 and 4 rows."""
        self.doc = gnes_pb2.Document()

        # Every chunk blob is a prefix of this alternating row pattern.
        rows = [[1, 2, 3], [2, 3, 4], [1, 2, 3], [2, 3, 4]]
        for row_count in (2, 3, 4):
            chunk = self.doc.chunks.add()
            chunk.blob.CopyFrom(array2blob(np.array(rows[:row_count])))
Example #11
0
    def test_dump_load(self):
        """Dump the indexer, load it back, and add a video document to the loaded copy."""
        frames_blob = array2blob(self.video_frames)

        video_doc = gnes_pb2.Document()
        video_doc.doc_type = gnes_pb2.Document.VIDEO
        video_doc.chunks.add().blob.CopyFrom(frames_blob)
        doc_copy = copy.deepcopy(video_doc)

        # Round-trip the indexer through its dump file.
        self.indexer.dump(self.dump_path)
        restored_indexer = BaseDocIndexer.load(self.dump_path)

        restored_indexer.add([0], [doc_copy])
    def test_pipelinepreproces(self):
        """Apply a pipeline of P1 and P2, then reload it from YAML and apply it again."""
        pipeline = PipelinePreprocessor()
        pipeline.components = lambda: [P1(), P2()]

        doc = gnes_pb2.Document()
        doc.doc_id = 1
        pipeline.apply(doc)
        self.assertEqual(doc.doc_id, 6)

        # Persist the pipeline under the name chosen by the test fixture.
        pipeline.name = self.p3_name
        pipeline.dump_yaml()
        pipeline.dump()

        # The reloaded pipeline is applied to the already-processed document.
        reloaded = BasePreprocessor.load_yaml(pipeline.yaml_full_path)
        reloaded.apply(doc)
        self.assertEqual(doc.doc_id, 21)
Example #13
0
def img_process_for_test(dirname):
    """
    Load the test images and run them through two preprocessors.

    :param dirname: directory that contains ``imgs/test.zip``
    :return: a list with one entry per preprocessor; each entry is the list of
        arrays obtained with ``blob2array`` from every chunk of every image
    """
    # Open the archive as a context manager so its file handle is released
    # even if reading a member fails; ``ZipFile.read`` also avoids leaving
    # per-member file objects open.
    with zipfile.ZipFile(os.path.join(dirname, 'imgs/test.zip'), "r") as archive:
        all_bytes = [archive.read(name) for name in archive.namelist()]

    test_img = []
    for raw_bytes in all_bytes:
        d = gnes_pb2.Document()
        d.raw_bytes = raw_bytes
        test_img.append(d)

    test_img_all_preprocessor = []
    for preprocessor in [BaseSingletonPreprocessor(doc_type=gnes_pb2.Document.IMAGE),
                         VanillaSlidingPreprocessor()]:
        # ``apply`` fills in the chunks of the document it is given, so each
        # preprocessor works on its own deep copy of the untouched documents.
        test_img_copy = copy.deepcopy(test_img)
        for img in test_img_copy:
            preprocessor.apply(img)
        test_img_all_preprocessor.append([blob2array(chunk.blob)
                                          for img in test_img_copy for chunk in img.chunks])
    return test_img_all_preprocessor
Example #14
0
 def test_map_router(self):
     """Send five docs through the batch router and expect replies of 2, 2 and 1 docs."""
     router_args = set_router_parser().parse_args([
         '--yaml_path', self.batch_router_yaml,
     ])
     # The client's ports are the router's ports swapped (in <-> out).
     client_args = _set_client_parser().parse_args([
         '--port_in', str(router_args.port_out),
         '--port_out', str(router_args.port_in),
     ])
     with RouterService(router_args), ZmqClient(client_args) as client:
         outgoing = gnes_pb2.Message()
         outgoing.request.index.docs.extend(
             [gnes_pb2.Document() for _ in range(5)])
         client.send_message(outgoing)

         # The replies are expected to arrive in this order of batch sizes.
         for expected_size in (2, 2, 1):
             reply = client.recv_message()
             self.assertEqual(len(reply.request.index.docs), expected_size)
Example #15
0
def line2pb_doc(line: str, doc_id: int = 0, deliminator: str = r'[.。!?!?]+') -> 'gnes_pb2.Document':
    """
    Convert a line of text into a TEXT document.

    :param line: the text; its encoded bytes are stored as ``meta_info``
    :param doc_id: id assigned to the document and to each of its chunks
    :param deliminator: regex used to split ``line`` into chunks; when falsy
        the whole line becomes a single chunk at offset 0
    :return: the document; with a deliminator, every split piece that is not
        blank becomes a chunk whose ``offset_1d`` is the piece's index in the
        split result (blank pieces still consume an index)
    """
    doc = gnes_pb2.Document()
    doc.doc_id = doc_id
    doc.doc_type = gnes_pb2.Document.TEXT
    doc.meta_info = line.encode()

    if not deliminator:
        # No splitting requested: the line is kept whole, even if blank.
        pieces = [(0, line)]
    else:
        pieces = [
            (position, piece)
            for position, piece in enumerate(re.split(deliminator, line))
            if piece.strip()
        ]

    for position, piece in pieces:
        chunk = doc.chunks.add()
        chunk.doc_id = doc_id
        chunk.text = piece
        chunk.offset_1d = position
    return doc
Example #16
0
 def test_publish_router(self):
     """Publish one message through the router and expect both subscribers to get it."""
     router_args = set_router_parser().parse_args([
         '--yaml_path', self.publish_router_yaml,
         '--socket_out', str(SocketType.PUB_BIND)
     ])
     # The client's ports are the router's ports swapped (in <-> out).
     client_args = _set_client_parser().parse_args([
         '--port_in', str(router_args.port_out),
         '--port_out', str(router_args.port_in),
         '--socket_in', str(SocketType.SUB_CONNECT)
     ])
     with RouterService(router_args), ZmqClient(client_args) as c1, ZmqClient(client_args) as c2:
         outgoing = gnes_pb2.Message()
         outgoing.request.index.docs.extend(
             [gnes_pb2.Document() for _ in range(5)])
         outgoing.envelope.num_part.append(1)
         c1.send_message(outgoing)

         # Each subscriber should see ``num_part`` extended from [1] to [1, 2].
         for subscriber in (c1, c2):
             reply = subscriber.recv_message()
             self.assertSequenceEqual(reply.envelope.num_part, [1, 2])
Example #17
0
 def query(self, keys: List[int], *args,
           **kwargs) -> List['gnes_pb2.Document']:
     """
     Return one document per key, parsed from ``self._content``.

     :param keys: keys to look up in ``self._content``
     :return: list with the result of ``Parse`` for every key, in key order
     """
     self.logger.error(keys)
     docs = []
     for key in keys:
         # Each stored entry is parsed into a fresh, empty document.
         docs.append(Parse(self._content[key], gnes_pb2.Document()))
     return docs
Example #18
0
 def test_empty_doc(self):
     """Apply the mp4 encoder to a video document that carries no content."""
     empty_video_doc = gnes_pb2.Document()
     empty_video_doc.doc_type = gnes_pb2.Document.VIDEO
     # No assertion follows: the call itself is the whole test.
     self.mp4_encoder.apply(empty_video_doc)
Example #19
0
 def test_emtpy_document(self):
     """Apply a frame selector configured with ``sframes=-1`` to an empty document."""
     selector = FrameSelectPreprocessor(sframes=-1)
     empty_doc = gnes_pb2.Document()
     # No assertion follows: the call itself is the whole test.
     selector.apply(empty_doc)
Example #20
0
    def test_doc_combine_score_fn(self):
        """
        Route two partial chunk-level search responses through a
        ``Chunk2DocTopkReducer`` router, then query a ``DictIndexer`` (using
        ``CoordDocScoreFn``) with the top-k results of the received message.
        """
        from gnes.indexer.doc.dict import DictIndexer

        document_list = []
        document_id_list = []

        # Build documents with ids 1..3, each holding three chunks of weight 1/3.
        for j in range(1, 4):
            d = gnes_pb2.Document()
            for i in range(1, 4):
                c = d.chunks.add()
                c.doc_id = j
                c.offset = i
                c.weight = 1 / 3
            document_id_list.append(j)
            document_list.append(d)

        self.chunk_router_yaml = 'Chunk2DocTopkReducer'

        args = set_router_parser().parse_args([
            '--yaml_path', self.chunk_router_yaml, '--socket_out',
            str(SocketType.PUB_BIND)
        ])
        # The client's ports are the router's ports swapped (in <-> out).
        c_args = _set_client_parser().parse_args([
            '--port_in',
            str(args.port_out), '--port_out',
            str(args.port_in), '--socket_in',
            str(SocketType.SUB_CONNECT)
        ])
        with RouterService(args), ZmqClient(c_args) as c1:
            # First message: three chunk results, for docs 1, 2 and 1.
            msg = gnes_pb2.Message()
            s = msg.response.search.topk_results.add()
            s.score.value = 0.1
            s.score.explained = '"1-c1"'
            s.chunk.doc_id = 1

            s = msg.response.search.topk_results.add()
            s.score.value = 0.2
            s.score.explained = '"1-c2"'
            s.chunk.doc_id = 2

            s = msg.response.search.topk_results.add()

            s.score.value = 0.3
            s.score.explained = '"1-c3"'
            s.chunk.doc_id = 1

            msg.envelope.num_part.extend([1, 2])
            c1.send_message(msg)

            # Reuse the same message (envelope included) for the second send;
            # only the top-k results are replaced.
            msg.response.search.ClearField('topk_results')

            # Second message: three chunk results, for docs 1, 2 and 3.
            s = msg.response.search.topk_results.add()
            s.score.value = 0.2
            s.score.explained = '"2-c1"'
            s.chunk.doc_id = 1

            s = msg.response.search.topk_results.add()
            s.score.value = 0.2
            s.score.explained = '"2-c2"'
            s.chunk.doc_id = 2

            s = msg.response.search.topk_results.add()
            s.score.value = 0.3
            s.score.explained = '"2-c3"'
            s.chunk.doc_id = 3
            c1.send_message(msg)
            # A single message is received after the two sends.
            # NOTE(review): presumably the router merges both parts into it --
            # confirm against the reducer implementation.
            r = c1.recv_message()
            doc_indexer = DictIndexer(score_fn=CoordDocScoreFn())
            doc_indexer.add(keys=document_id_list, docs=document_list)

            queried_result = doc_indexer.query_and_score(
                docs=r.response.search.topk_results, top_k=2)