Ejemplo n.º 1
0
def upload_image():
    """Index an image given by URL and store it in the document database.

    Reads a JSON request body with the keys:

    * ``image_url`` -- required; the image to download and index.
    * ``doc_id`` -- optional; a random UUID integer is generated when absent.
    * ``source`` -- optional; defaults to ``'tattle-admin'``.

    The image is converted to RGB and passed to ``resnet18.extract_feature``;
    text detected in the image is embedded with ``doc2vec``. The document is
    inserted into ``db.docs`` and the image, combined and (when ``has_text``
    is set) text search indexes are updated.

    Returns:
        A JSON response: ``{'doc_id': ..., 'failed': 0}`` on success, or
        ``{'failed': 1, 'error': 'No image_url found'}`` when ``image_url``
        is missing.
    """
    data = request.get_json(force=True)
    image_url = data.get('image_url')
    doc_id = data.get('doc_id', None)
    source = data.get('source', 'tattle-admin')
    if image_url is None:
        return jsonify({'failed': 1, 'error': 'No image_url found'})

    image_dict = image_from_url(image_url)
    # Convert to RGB so that PNGs carrying an alpha channel (RGBA) work too.
    image = image_dict['image'].convert('RGB')
    image_vec = resnet18.extract_feature(image)

    # Normalise a missing/None text to '' so that every later step sees a
    # string (the original `== '' or None` test never caught None).
    detected_text = detect_text(image_dict['image_bytes']).get('text') or ''
    lang = detect_lang(detected_text)

    # Placeholder embedding used whenever no usable text vector exists.
    zero_text_vec = np.zeros(300).tolist()

    if not detected_text:
        text_vec = zero_text_vec
        has_text = False
    else:
        text_vec = doc2vec(detected_text)
        has_text = True

    if lang is None or text_vec is None:
        # NOTE(review): has_text is set to True here even though the vector
        # is all zeros; this mirrors the previous behaviour but looks
        # unintended -- confirm before changing.
        text_vec = zero_text_vec
        has_text = True

    # Combined image + text vector used by the document-level index.
    vec = np.hstack((image_vec, text_vec)).tolist()

    date = datetime.datetime.now()
    if doc_id is None:
        doc_id = uuid.uuid4().int
    db.docs.insert_one({
        "doc_id": doc_id,
        "source": source,
        "version": "1.1",
        "has_image": True,
        "has_text": has_text,
        "text": detected_text,
        "tags": [],
        "date_added": date,
        "date_updated": date,
        "image_vec": image_vec.tolist(),
        "text_vec": text_vec,
        "vec": vec,
    })

    # Update the in-memory search indexes with the new document.
    imagesearch.update(doc_id, image_vec)
    docsearch.update(doc_id, vec)
    if has_text:
        textsearch.update(doc_id, text_vec)

    return jsonify({'doc_id': doc_id, 'failed': 0})
Ejemplo n.º 2
0
def find_duplicate():
    """Report whether the submitted text or image duplicates a stored document.

    Reads a JSON request body with the keys ``text``, ``image_url`` and an
    optional ``threshold``. When ``image_url`` is present it takes precedence
    over ``text``.

    For images, the RGB-converted image is embedded with
    ``resnet18.extract_feature`` and looked up in ``imagesearch``. For text,
    an exact match on the stored ``text`` field is tried first; only when
    that fails is the text embedded with ``doc2vec`` and looked up in
    ``textsearch``.

    Returns:
        A JSON response. ``{'failed': 1, 'error': ...}`` when neither input
        is given; otherwise ``{'failed': 0, 'duplicate': 0}`` or
        ``{'failed': 0, 'duplicate': 1, 'doc_id': ...}`` (with ``'distance'``
        when the match came from a vector search).
    """
    data = request.get_json(force=True)
    text = data.get('text', None)
    thresh = data.get('threshold')
    image_url = data.get('image_url', None)
    if text is None and image_url is None:
        return jsonify({'failed': 1, 'error': 'No text or image_url found'})

    if image_url is not None:
        image_dict = image_from_url(image_url)
        # Convert to RGB so that PNGs carrying an alpha channel work too.
        image = image_dict['image'].convert('RGB')
        vec = resnet18.extract_feature(image)
        search = imagesearch.search
    else:
        # An exact text match is conclusive, so skip the embedding and the
        # vector search entirely in that case.
        duplicate_doc = db.docs.find_one({"text": text})
        if duplicate_doc is not None:
            return jsonify({
                'failed': 0,
                'duplicate': 1,
                'doc_id': duplicate_doc.get('doc_id')
            })
        vec = doc2vec(text)
        if vec is None:
            # No embedding could be produced for this text, so there is
            # nothing to compare against the index.
            return jsonify({'failed': 0, 'duplicate': 0})
        search = textsearch.search

    # Only pass a threshold when the caller supplied a truthy one; otherwise
    # the index's own default applies.
    if thresh:
        doc_id, dist = search(vec, thresh)
    else:
        doc_id, dist = search(vec)

    if doc_id is None:
        return jsonify({'failed': 0, 'duplicate': 0})
    return jsonify({
        'failed': 0,
        'duplicate': 1,
        'doc_id': doc_id,
        'distance': dist
    })
Ejemplo n.º 3
0
def get_random_image(opt=None,
                     image_url=None,
                     images=None,
                     db=None,
                     DATA_SOURCE=None):
    """Pick an image from the configured source and download it.

    ``opt`` is checked first (``'from_sample'`` or ``'from_url'``); when it
    matches neither, ``DATA_SOURCE`` selects the source.

    Args:
        opt: ``'from_sample'`` to draw from ``images``, ``'from_url'`` to
            load ``image_url`` directly.
        image_url: URL used by the ``'from_url'`` option.
        images: Sequence whose items hold the image URL at index 1.
        db: Collection-like object supporting ``aggregate``; used by the
            ``'scraping_database'`` source.
        DATA_SOURCE: ``'single_file'``, ``'api_database'`` or
            ``'scraping_database'``.

    Returns:
        An ``(index, image)`` tuple, or ``None`` when the source is
        ``'api_database'`` (not implemented) or is not recognised.
    """
    from numpy.random import randint

    def _load(url):
        # Download the URL and keep only the decoded image.
        return image_from_url(url)['image']

    if opt == 'from_sample':
        # A single-image sample needs no random draw.
        idx = 0 if len(images) == 1 else randint(len(images))
        return idx, _load(images[idx][1])

    if opt == 'from_url':
        return 0, _load(image_url)

    if DATA_SOURCE == 'single_file':
        picked = randint(len(images))
        return picked, _load(images[picked][1])

    if DATA_SOURCE == 'scraping_database':
        sampled = list(db.aggregate([{"$sample": {"size": 1}}]))
        stored_url = sampled[0]['s3URL']
        return 0, _load(stored_url.replace('/home/ubuntu/Downloads//', ''))

    # 'api_database' is not implemented yet; like any unrecognised source it
    # produces no image.
    return None
Ejemplo n.º 4
0
def imageTesting():
    """Visually check that transformed images are still matched by the index.

    Builds an ``ImageSearch`` over one sample image, applies each transform
    in turn, searches for the transformed image, prints the result and shows
    a three-row figure: original, transformed and (when a match was found)
    the matched image with its distance.
    """
    images = [(1, 'https://picsum.photos/id/448/1024/768')]
    imageSearch = ImageSearch('testing', images)
    imageSearch.thresh = 20  # to pass the following tests

    assert len(imageSearch.vecs) > 0

    transforms = [
        'crop', 'rotate', 'invert', 'mirror', 'BLUR', 'CONTOUR', 'DETAIL',
        'EDGE_ENHANCE', 'EDGE_ENHANCE', 'EDGE_ENHANCE_MORE', 'EMBOSS',
        'FIND_EDGES', 'SMOOTH', 'SMOOTH_MORE', 'SHARPEN', 'GaussianBlur',
        'UnsharpMask', 'MedianFilter', 'MinFilter', 'MaxFilter', 'ModeFilter'
    ]

    for image_id, url in images:
        original = image_from_url(url)['image']

        for tf in transforms:
            transformed = imageTransforms(original, type=tf)
            transformed_vec = img2vec(transformed, type='image')

            hit = imageSearch.search(transformed_vec)
            match_id = hit[0]
            distance = hit[1]
            print(f'{image_id}=>{match_id}, {tf}: {distance}')

            found = match_id is not None
            if found:
                # Ids in `images` start at 1, list positions at 0.
                matched = image_from_url(images[match_id - 1][1])['image']

            _, axes = plt.subplots(nrows=3, ncols=1)

            axes[0].imshow(original)
            axes[0].set_title('original image')
            axes[1].imshow(transformed)
            axes[1].set_title(f'{tf} image')
            if found:
                axes[2].imshow(matched)
                axes[2].set_title(f'similar image: {distance:.2f}')

            # Maximise the plot window before blocking on show().
            manager = plt.get_current_fig_manager()
            manager.window.state('zoomed')
            plt.show()
0
def get_similar(image, num_similar, imageSearch, images):
    """Find the images most similar to ``image`` and download them.

    Args:
        image: Image to embed with ``img2vec``.
        num_similar: Number of results requested from the search (passed as
            ``n``; previously ignored in favour of a hard-coded 3).
        imageSearch: Index object exposing ``search(vec, n=...)``, which is
            iterated here as ``(doc_id, value)`` pairs.
        images: Sequence indexed by ``doc_id`` whose items hold the image
            URL at index 1.

    Returns:
        A tuple ``(ret_image, ret)``: the list of downloaded images (``None``
        for entries whose ``doc_id`` is ``None``) and the raw search result.
    """
    image_vec = img2vec(image, type='image')
    ret = imageSearch.search(image_vec, n=num_similar)

    ret_image = [
        image_from_url(images[doc_id][1])['image']
        if doc_id is not None else None
        for doc_id, _ in ret
    ]

    return ret_image, ret
Ejemplo n.º 6
0
def get_transform(image, tf, param, imageSearch, doc_id=0, images=None):
    """Apply one transform to ``image`` and look the result up in the index.

    Args:
        image: Source image handed to ``imageTransforms``.
        tf: Transform name, passed as ``type``.
        param: Transform parameter, passed as ``param``.
        imageSearch: Index object exposing ``search(vec)``.
        doc_id: Id of the source image; only used in the printed report.
        images: Sequence indexed by doc id whose items hold the image URL at
            index 1; needed whenever the search returns a match.

    Returns:
        ``(transformed_image, matched_image, matched_doc_id, distance)``,
        where ``matched_image`` is ``None`` if the top result has no doc id.
    """
    from transforms import imageTransforms

    transformed = imageTransforms(image, type=tf, param=param)
    hits = imageSearch.search(img2vec(transformed, type='image'))

    # Only the top result is inspected; by default the search returns one.
    best = hits[0]
    best_id = best[0]
    best_dist = best[1]

    print(f'doc_id {doc_id} => {best_id}')
    print(f'Transform {tf}: Distance={best_dist}')

    matched = None
    if best_id is not None:
        matched = image_from_url(images[best_id][1])['image']

    return transformed, matched, best_id, best_dist
Ejemplo n.º 7
0
def s3ToDB(objs, url_prefix, img_model, docs):
    """Import the objects of a bucket listing into the document collection.

    Each object is fetched from ``url_prefix + Key`` and dispatched on its
    ``Content-Type`` header: images are embedded with
    ``img_model.extract_feature``, non-empty texts with ``doc2vec``. Objects
    of any other type are skipped.

    Args:
        objs: Bucket listing; its ``'Contents'`` entry is a list of dicts
            with a ``'Key'``. A listing without ``'Contents'`` is treated as
            empty.
        url_prefix: String prepended to each key to form the object URL.
        img_model: Object exposing ``extract_feature(image)``.
        docs: Collection exposing ``insert_one(doc)``.

    Returns:
        None. Progress and per-image errors are printed.
    """
    contents = objs.get('Contents', [])
    if not contents:
        # Nothing to import; avoid loading the analyzer module at all.
        return

    from analyzer import image_from_url, doc2vec

    for f in contents:
        url = url_prefix + f['Key']

        # Fetch once and reuse the response for both the header check and
        # (for text objects) the body.
        response = requests.get(url)
        content_type = response.headers.get('Content-Type', '')
        print(f['Key'], content_type)

        if content_type.startswith('image'):
            try:
                # Previously noted as failing with PNGs; such objects are
                # reported and skipped rather than aborting the import.
                img = image_from_url(url)
                image_vec = img_model.extract_feature(img['image'])

                doc = default_db_doc(
                    has_image=True, image_vec=image_vec.tolist())
                docs.insert_one(doc)
            except Exception as e:
                print('error', e)
                continue

            print('added image: ', doc['doc_id'])

        elif content_type.startswith('text'):
            text = response.text
            if not text:
                continue

            textvec, lang = doc2vec(text)

            doc = default_db_doc(has_text=True, text=text,
                                 lang=lang, text_vec=textvec.tolist())
            docs.insert_one(doc)

            print('added text: ', doc['doc_id'])
Ejemplo n.º 8
0
def find_text():
    """Run text detection on the image referenced by the request.

    Reads ``image_url`` from the JSON request body, downloads the image and
    passes its raw bytes to ``detect_text``.

    Returns:
        A JSON response with the ``detect_text`` result, or
        ``{'failed': 1, 'error': 'No image_url found'}`` when ``image_url``
        is missing.
    """
    data = request.get_json(force=True)
    image_url = data.get('image_url')
    if image_url is None:
        # Same error payload as the upload handler for a missing URL.
        return jsonify({'failed': 1, 'error': 'No image_url found'})

    image_dict = image_from_url(image_url)
    return jsonify(detect_text(image_dict['image_bytes']))
Ejemplo n.º 9
0
def _ranked_matches(doc_ids, dists, limit=10):
    """Build response entries for the first ``limit`` search hits."""
    if doc_ids is None or len(doc_ids) == 0:
        return []

    top_ids = doc_ids[:limit]
    # Look up the source of each hit in a single query.
    sources = {
        d.get('doc_id'): d.get('source')
        for d in db.docs.find({"doc_id": {
            "$in": top_ids
        }})
    }
    return [{
        'doc_id': doc_id,
        'dist': dist,
        # A hit may be missing from the collection; report no source then
        # instead of failing the whole request.
        'source': sources.get(doc_id)
    } for doc_id, dist in zip(top_ids, dists)]


def find_duplicate():
    """Return stored documents similar to the submitted text or image.

    Reads a JSON request body with the keys ``text``, ``image_url`` and an
    optional ``threshold`` (used for image searches only). When ``image_url``
    is present it takes precedence over ``text``.

    Returns:
        A JSON response:

        * ``{'failed': 1, 'error': ...}`` when neither input is given, or
          when the text can neither be embedded nor matched exactly.
        * Image query: ``{'failed': 0, 'result': [...]}``.
        * Text query: ``{'failed': 0, 'duplicate': 1, 'result': [...]}``,
          with an exact text match (``dist`` 0.0) placed first.

        Each result entry has ``doc_id``, ``dist`` and ``source``; at most
        10 search hits are returned.
    """
    data = request.get_json(force=True)
    text = data.get('text', None)
    thresh = data.get('threshold')
    image_url = data.get('image_url', None)
    # NOTE(review): the request's 'sources' field used to be read here but
    # was never applied as a filter; confirm whether filtering is wanted.
    if text is None and image_url is None:
        return jsonify({'failed': 1, 'error': 'No text or image_url found'})

    if image_url is not None:
        image_dict = image_from_url(image_url)
        # Convert to RGB so that PNGs carrying an alpha channel work too.
        image = image_dict['image'].convert('RGB')
        vec = resnet18.extract_feature(image)
        if thresh:
            doc_ids, dists = imagesearch.search(vec, thresh)
        else:
            doc_ids, dists = imagesearch.search(vec)
        return jsonify({
            'failed': 0,
            'result': _ranked_matches(doc_ids, dists)
        })

    duplicate_doc = db.docs.find_one({"text": text})
    vec = doc2vec(text)
    if vec is None and duplicate_doc is None:
        # No embedding and no exact match: nothing can be reported.
        return jsonify({'failed': 1, 'error': 'query words not found in db'})

    result = []
    if vec is not None:
        doc_ids, dists = textsearch.search(vec)
        result = _ranked_matches(doc_ids, dists)

    if duplicate_doc is not None:
        result = [{
            'doc_id': duplicate_doc.get('doc_id'),
            'dist': 0.0,
            'source': duplicate_doc.get('source')
        }] + result

    # NOTE(review): 'duplicate' is reported as 1 even when `result` is
    # empty, as before; confirm whether clients rely on this.
    return jsonify({'failed': 0, 'duplicate': 1, 'result': result})