Ejemplo n.º 1
0
def delete_pic_and_index(label_file,
                         bucket_name,
                         key_prefix,
                         index,
                         es_ip,
                         es_port,
                         s3_only=False):
    """
    Remove from s3 every picture referenced by a label file and, unless told otherwise, drop an Elastic index.

    The keys of the label dictionary loaded from the label file are used as picture ids; each one is turned
    into an s3 key with the given bucket name and key prefix before deletion.

    :param label_file:      [str]   Path to the label file, json format
    :param bucket_name:     [str]   Name of the bucket holding the pictures
    :param key_prefix:      [str]   Key prefix used to locate the pictures in the bucket
    :param index:           [str]   Name of the index to delete
    :param es_ip:           [str]   Ip of the Elastic host
    :param es_port:         [str]   Port opened for Elastic
    :param s3_only:         [bool]  If True, the index is left alone and only the pictures are deleted from s3
    """
    labels = utils_fct.get_label_dict_from_file(label_file)

    # Third element of the formatted path is the s3 key of the picture.
    s3_keys = []
    for pic_id in list(labels.keys()):
        formatted_path = s3_utils.get_s3_formatted_bucket_path(
            bucket_name, key_prefix, pic_id)
        s3_keys.append(formatted_path[2])

    log.info(f'Deleting {len(s3_keys)} picture(s) in "{bucket_name}"')
    s3_utils.delete_object_s3(bucket_name, s3_keys)

    if s3_only:
        return

    log.info(f'Deleting index "{index}" from {es_ip}:{es_port}')
    es_utils.delete_index(index, es_ip, es_port)
Ejemplo n.º 2
0
def remove_label_to_delete_from_dict(d_label):
    """
    Strip, in place, every label flagged with a truthy "to_delete" field from a dictionary of labels.

    Labels carrying a falsy "to_delete" field are kept, but the field itself is removed from them. Labels
    without the field are left untouched.

    :param d_label:     [dict]          Dictionary of labels where the key is the img_id the label points to and the
                                        value is the label itself:
                                        {
                                          "20200115T15-45-55-123456": {...},
                                          ...
                                        }
    :return:            [list of dict]  One entry per label removed from the input dictionary:
                                        [{"img_id": "xxx", "s3_key": "xxx", "label_fingerprint": "xxx"}, {...}, ...]
    """
    removed = []
    # Iterate over a snapshot since entries are popped from d_label along the way.
    for img_id, label in list(d_label.items()):
        try:
            flagged = label["to_delete"]
        except KeyError:
            # No flag at all: nothing to clean, nothing to remove.
            continue

        if not flagged:
            # Flag present but falsy: keep the label, drop the now useless field.
            label.pop("to_delete")
            continue

        d_label.pop(img_id)
        _, _, s3_key = s3_utils.get_s3_formatted_bucket_path(
            label["s3_bucket"], "", label["img_id"])
        entry = {
            "img_id": img_id,
            "s3_key": s3_key,
            "label_fingerprint": label["label_fingerprint"],
        }
        removed.append(entry)
    return removed
Ejemplo n.º 3
0
def test_get_s3_formatted_bucket_path_double_nokey_nofilename():
    """A bucket name with a trailing slash and an empty key prefix must give an empty key."""
    expected = ("my-bucket/", "my-bucket", "")
    formatted = s3_utils.get_s3_formatted_bucket_path("my-bucket/", "")
    assert formatted == expected
Ejemplo n.º 4
0
def test_get_s3_formatted_bucket_path_double_slash():
    """Leading, trailing and doubled slashes in the inputs must be collapsed in the formatted path."""
    bucket = "my-bucket/"
    prefix_with_typo = "/sub/bucket//directory/with/typo"
    expected = (
        "my-bucket/sub/bucket/directory/with/typo/",
        "my-bucket",
        "sub/bucket/directory/with/typo/",
    )
    formatted = s3_utils.get_s3_formatted_bucket_path(bucket, prefix_with_typo)
    assert formatted == expected
Ejemplo n.º 5
0
def test_get_s3_formatted_bucket_key_prefix_in_bucket_plus_file():
    """A key prefix embedded in the bucket argument must be split out and joined with the file name."""
    expected = ("my-bucket/key/prefix/file", "my-bucket", "key/prefix/file")
    formatted = s3_utils.get_s3_formatted_bucket_path(
        "my-bucket/key/prefix", "", "/file")
    assert formatted == expected
Ejemplo n.º 6
0
def test_get_s3_formatted_bucket_path_double_nokey_with_filename():
    """With an empty key prefix, a file name containing a path must be used as the whole key."""
    file_name = "key_prefix/file.jpg"
    expected = ("my-bucket/key_prefix/file.jpg", "my-bucket",
                "key_prefix/file.jpg")
    formatted = s3_utils.get_s3_formatted_bucket_path("my-bucket", "", file_name)
    assert formatted == expected
Ejemplo n.º 7
0
def upload_to_db(label_file,
                 es_host_ip,
                 es_port,
                 es_index,
                 bucket_name=None,
                 key_prefix=None,
                 overwrite=False):
    """
    Upload picture(s) to the DataBase according to the label file, json format.
    Labels are uploaded to Elasticsearch cluster ; pictures are uploaded to S3 bucket. The label file and the pictures
    shall be in the same folder.
    Label file shall be in json format. It can contain one document or a list of document. Each document shall at least
    have the following fields:
    [
      {
        "img_id": "xxx",
        "file_name": "file-name.jpg",
        "label_fingerprint": "c072a1b9a16b633d6b3004c3edab7553",
        "event": "event_name"
      }
    ]
    then you can add any field you wish to label the picture.
    Labels flagged as "to_delete" are removed from the set before anything is uploaded.
    Note: if the picture already exists in s3 and is not overwritten, upload to ES will be tried anyway.
    :param label_file:      [string]    path to the file containing the labels in json format
    :param es_host_ip:      [string]    Public ip of the Elasticsearch host server
    :param es_port:         [int]       Port open for Elasticsearch on host server (typically 9200)
    :param es_index:        [string]    Name of the index to use
    :param bucket_name:     [string]    Name of the s3 bucket. If None, picture won't be uploaded to S3.
    :param key_prefix:      [string]    If None, default key is used. Default key is as follow:
                                        {event_name}/{upload_date}/
                                        So the picture will be uploaded to:
                                        "https://s3.amazonaws.com/{my-bucket}/{event_name}/{picture_date}/"
    :param overwrite:       [bool]      If True, new picture will overwrite existing ones in S3 with same img_id and new
                                        label will overwrite existing one in ES with same label_fingerprint
                                        If False (default), only non existing picture and label will be uploaded
    :return:                [tuple]     (int) s3 success upload, (int) ES success upload, (int) total nb of failed upload
                                        (failed ES uploads + pictures already in s3 + missing pictures).
                                        (0, 0, 0) is returned when no label could be loaded, and
                                        (0, 0, total nb of labels) when no picture is left to upload or no key prefix
                                        could be generated.
    """
    picture_folder = Path(label_file).parent
    d_label = utils_fct.get_label_dict_from_file(label_file)

    # Check for a missing label dictionary BEFORE touching it: both the "to_delete" cleanup and the len() in the
    # debug message below would raise on None, which made this guard unreachable when it was placed after them.
    if d_label is None:
        return 0, 0, 0

    utils_fct.remove_label_to_delete_from_dict(d_label)
    log.debug(
        f'Label file "{label_file}" loaded. {len(d_label)} picture(s) and/or label(s) to upload.'
    )
    total_label = len(d_label)

    if bucket_name is not None:
        log.debug('Looking for pictures...')
        missing_pic = _remove_missing_pic_from_dic(d_label,
                                                   picture_dir=picture_folder)
        if not d_label:
            # Every label was dropped because its picture is missing: count them all as failed.
            return 0, 0, total_label
        if key_prefix is None:
            key_prefix = generate_key_prefix(d_label)
            if key_prefix is None:
                return 0, 0, total_label
        upload_bucket_dir, bucket_name, key_prefix = s3_utils.get_s3_formatted_bucket_path(
            bucket_name, key_prefix)
        log.debug('Uploading to s3...')
        s3_upload_success, already_exist_pic = s3_utils.upload_to_s3_from_label(
            d_label,
            picture_dir=picture_folder,
            s3_bucket_name=bucket_name,
            prefix=key_prefix,
            overwrite=overwrite)
        utils_fct.edit_label(d_label, "s3_bucket", upload_bucket_dir)
        utils_fct.edit_label(d_label, "upload_date",
                             datetime.now().strftime("%Y%m%dT%H-%M-%S-%f"))
    else:
        upload_bucket_dir = None
        # Three distinct lists: a chained assignment would bind the same list object to all three names.
        s3_upload_success, missing_pic, already_exist_pic = [], [], []

    log.debug('Uploading to Elasticsearch cluster...')
    failed_es_upload = es_utils.upload_to_es(d_label=d_label,
                                             index=es_index,
                                             host_ip=es_host_ip,
                                             port=es_port,
                                             overwrite=overwrite)
    es_success = len(d_label) - len(failed_es_upload)
    _print_upload_synthesis(upload_bucket_dir,
                            es_index, es_success, failed_es_upload,
                            len(s3_upload_success), missing_pic,
                            already_exist_pic)
    total_failed = len(failed_es_upload) + len(already_exist_pic) + len(
        missing_pic)
    return len(s3_upload_success), es_success, total_failed