Ejemplo n.º 1
0
def create_document(doc_id, text, weight, length):
    """Build a Document carrying UTF-8 encoded ``text`` plus ranking metadata.

    :param doc_id: value whose string form is used to derive the document id
    :param text: string stored, UTF-8 encoded, in the document buffer
    :param weight: value assigned to the document's ``weight``
    :param length: value assigned to the document's ``length``
    :return: the populated Document
    """
    doc = Document()
    # Repeat the stringified id and truncate, so any non-empty id becomes
    # exactly 16 characters long.
    repeated_id = str(doc_id) * 16
    doc._document.id = repeated_id[:16]
    encoded_text = text.encode('utf8')
    doc.buffer = encoded_text
    doc.weight = weight
    doc.length = length
    return doc
Ejemplo n.º 2
0
def search_generator(path: str, buffer: bytes):
    """Yield a single Document filled from ``buffer`` and/or ``path``.

    :param path: if truthy, assigned to the document's ``content``
    :param buffer: if truthy, assigned to the document's ``buffer``
    :return: a generator producing exactly one Document

    When both arguments are truthy, ``buffer`` is assigned first and
    ``content`` second.
    """
    query_doc = Document()
    if buffer:
        query_doc.buffer = buffer
    if path:
        query_doc.content = path
    yield query_doc
Ejemplo n.º 3
0
def input_index_data(num_docs=None, batch_size=8, dataset_type='f30k'):
    """Yield an image Document and a caption Document for each dataset sample.

    The data is read from the ``test`` split through ``get_data_loader``.
    For every (image, caption) pair two Documents are yielded, image first:
    the image Document holds the raw image in its buffer, and the caption
    Document uses the SHA-1 hex digest of the image as its id.

    :param num_docs: if truthy, iteration stops after the first batch for
        which ``batches_seen * batch_size >= num_docs``. Whole batches are
        always emitted and each sample yields two Documents, so more than
        ``num_docs`` Documents may be produced.
    :param batch_size: batch size handed to the data loader
    :param dataset_type: dataset folder name; ``'f30k'`` selects the
        ``dataset_flickr30k.json`` captions file, any other value
        ``captions.txt``; ``'toy-data'`` is read from ``.`` instead of ``data``
    :return: a generator of Documents
    """
    if dataset_type == 'f30k':
        captions_file = 'dataset_flickr30k.json'
    else:
        captions_file = 'captions.txt'
    base_folder = '.' if dataset_type == 'toy-data' else 'data'

    images_root = os.path.join(cur_dir, f'{base_folder}/{dataset_type}/images')
    captions_path = os.path.join(
        cur_dir, f'{base_folder}/{dataset_type}/{captions_file}')
    data_loader = get_data_loader(
        root=images_root,
        captions=captions_path,
        split='test',
        batch_size=batch_size,
        dataset_type=dataset_type)

    for batches_seen, (images, batch_captions) in enumerate(data_loader, start=1):
        for image, caption in zip(images, batch_captions):
            # The caption document is keyed by the hash of its image.
            image_digest = hashlib.sha1(image).hexdigest()

            image_doc = Document()
            image_doc.buffer = image
            image_doc.modality = 'image'
            image_doc.mime_type = 'image/jpeg'

            caption_doc = Document(id=image_digest)
            caption_doc.text = caption
            caption_doc.modality = 'text'
            caption_doc.mime_type = 'text/plain'
            caption_doc.tags['id'] = caption

            yield image_doc
            yield caption_doc

        # Checked only after a full batch has been emitted.
        if num_docs and batches_seen * batch_size >= num_docs:
            break
Ejemplo n.º 4
0
def create_document(doc_id, text, weight, length):
    """Build a Document with the given id, UTF-8 encoded text and metadata.

    :param doc_id: value assigned unchanged to the document's ``id``
    :param text: string stored, UTF-8 encoded, in the document buffer
    :param weight: value assigned to the document's ``weight``
    :param length: value assigned to the document's ``length``
    :return: the populated Document
    """
    doc = Document()
    doc.id = doc_id
    encoded_text = text.encode('utf8')
    doc.buffer = encoded_text
    doc.weight = weight
    doc.length = length
    return doc
Ejemplo n.º 5
0
 def create(self):
     """Return a Document whose content field is chosen by ``field_type``.

     ``field_type`` is not a parameter; it is read from the surrounding
     scope, which is not visible in this snippet. If it matches none of
     ``'text'``, ``'buffer'`` or ``'blob'``, the Document is returned
     without any of these fields being set here.
     """
     groundtruth = Document()
     if field_type == 'text':
         groundtruth.text = 'aaaa'
     elif field_type == 'buffer':
         # ``\04`` is an octal escape: the literal is the bytes 1, 2, 3, 4.
         groundtruth.buffer = b'\x01\x02\x03\04'
     elif field_type == 'blob':
         groundtruth.blob = np.array([1, 1, 1, 1])
     return groundtruth
Ejemplo n.º 6
0
 def create(self):
     """Return a Document whose content field is chosen by ``field_type``.

     ``field_type`` is not a parameter; it is read from the surrounding
     scope, which is not visible in this snippet. If it matches none of
     ``'text'``, ``'buffer'`` or ``'blob'``, the Document is returned
     without any of these fields being set here.
     """
     document = Document()
     if field_type == 'text':
         document.text = 'aaa'
     elif field_type == 'buffer':
         document.buffer = b'\x01\x02\x03'
     elif field_type == 'blob':
         document.blob = np.array([1, 1, 1])
     return document
Ejemplo n.º 7
0
 def request(field_type):
     """Build an index request holding ten docs and ten groundtruths.

     Each doc and each groundtruth receives one chunk with granularity 1.
     Depending on ``field_type`` the chunk pair is filled with a text,
     buffer or blob payload; the groundtruth payload is always one element
     longer than the doc payload. Any other ``field_type`` leaves the
     chunks without a payload.

     :param field_type: one of ``'text'``, ``'buffer'`` or ``'blob'``
     :return: the request converted with ``as_typed_request('index')``
     """
     doc_count = 10
     proto = jina_pb2.RequestProto()
     for _ in range(doc_count):
         doc_proto = proto.index.docs.add()
         gt_proto = proto.index.groundtruths.add()
         chunk_doc = Document(doc_proto.chunks.add())
         chunk_gt = Document(gt_proto.chunks.add())
         for chunk in (chunk_doc, chunk_gt):
             chunk.granularity = 1
         if field_type == 'text':
             chunk_doc.text = 'aaa'
             chunk_gt.text = 'aaaa'
         elif field_type == 'buffer':
             chunk_doc.buffer = b'\x01\x02\x03'
             chunk_gt.buffer = b'\x01\x02\x03\x04'
         elif field_type == 'blob':
             chunk_doc.blob = np.array([1, 1, 1])
             chunk_gt.blob = np.array([1, 1, 1, 1])
     typed_request = Request(proto).as_typed_request('index')
     return typed_request
Ejemplo n.º 8
0
def create_document(doc_id, text, weight):
    """Build a Document with a string id, UTF-8 encoded text and a weight.

    :param doc_id: value converted with ``str`` and used as the document id
    :param text: string stored, UTF-8 encoded, in the document buffer
    :param weight: value assigned to the document's ``weight``
    :return: the populated Document
    """
    doc = Document()
    doc.id = str(doc_id)
    encoded_text = text.encode('utf8')
    doc.buffer = encoded_text
    doc.weight = weight
    return doc