import os
import glob
import tqdm
from nltk.corpus import wordnet
from PyUtils.pickle_utils import loadpickle
from PyUtils.json_utils import load_json_list
# get_image_cid_from_url and remove_hat are project-local helpers whose import
# path is not shown in this excerpt.

raw_annotation_files = glob.glob(os.path.join(
    '/home/zwei/Dev/AttributeNet3/AdobeStockSelection/RetrieveSelected778/selected_keywords_retrieve_v2',
    '*.json'))
predefined_vocabularies = set(loadpickle(
    '/home/zwei/Dev/AttributeNet3/AdobeStockSelection/RetrieveSelected778/data_v2/tag_frequencies_selected.pkl').keys())
valid_annotation_list = loadpickle(
    '/home/zwei/Dev/AttributeNet3/AdobeStockSelection/RetrieveSelected778/data_v2/CNNsplit_train.pkl')

# Collect the CIDs of all training images.
train_cid_list = []
for s_item in tqdm.tqdm(valid_annotation_list, desc="Processing image cids"):
    train_cid_list.append(int(get_image_cid_from_url(s_item[0], location=1)))
train_cid_set = set(train_cid_list)

processedCIDs = set()
vocabularies = set()
bad_vocabularies = set()
for s_file in tqdm.tqdm(raw_annotation_files):
    keyword_raw_annotations = load_json_list(s_file)
    for s_annotation in keyword_raw_annotations:
        s_cid = s_annotation['cid']
        # Skip images outside the training split and images already processed.
        if s_cid not in train_cid_set or s_cid in processedCIDs:
            continue
        processedCIDs.add(s_cid)
        s_tags = remove_hat(s_annotation['tags'])
        for s_tag in s_tags:
            s_tag = s_tag.lower()
            if s_tag in vocabularies or s_tag in bad_vocabularies:
                continue
            if s_tag in predefined_vocabularies:
                vocabularies.add(s_tag)
            elif len(wordnet.synsets(s_tag)) >= 1:
                # Assumed completion (the source truncates here): a tag with at
                # least one WordNet synset is treated as a valid vocabulary word.
                vocabularies.add(s_tag)
            else:
                bad_vocabularies.add(s_tag)
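# The WordNet gate above can be sanity-checked in isolation. A minimal sketch
# using NLTK's corpus reader; it assumes the corpus was fetched once via
# nltk.download('wordnet').
from nltk.corpus import wordnet

def has_synset(tag):
    # A tag counts as a known English word if WordNet lists >= 1 synset for it.
    return len(wordnet.synsets(tag.lower())) >= 1

print(has_synset('happiness'))   # True: 'happiness' has WordNet synsets
print(has_synset('qwertyuiop'))  # False: no synsets, would go to bad_vocabularies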
import os
import glob
import tqdm
from PyUtils.json_utils import load_json_list
# image_cid_list, get_image_cid_from_url, and remove_hat come from earlier
# project code that is not shown in this excerpt.

# Build the set of CIDs covered by this split.
image_cids = []
for s_image_annotation in tqdm.tqdm(image_cid_list, desc="Create CID set"):
    image_cids.append(int(get_image_cid_from_url(s_image_annotation[0], location=1)))
image_cids = set(image_cids)

raw_annotation_files = glob.glob(os.path.join(
    '/home/zwei/Dev/AttributeNet3/AdobeStockSelection/RetrieveSelected778/selected_keywords_retrieve_v2',
    '*.json'))

# Count tag frequencies over the unique images and track the longest tag list.
processed_images = {}
tag_counts = {}
max_len = 0
for s_raw_annotation_file in tqdm.tqdm(raw_annotation_files):
    s_raw_annotations = load_json_list(s_raw_annotation_file)
    for s_raw_annotation in s_raw_annotations:
        s_cid = s_raw_annotation['cid']
        # Skip images already seen and images outside the CID set.
        if s_cid in processed_images or s_cid not in image_cids:
            continue
        s_tags = remove_hat(s_raw_annotation['tags'])
        for s_tag in s_tags:
            if s_tag in tag_counts:
                tag_counts[s_tag] += 1
            else:
                tag_counts[s_tag] = 1
        if len(s_tags) > max_len:
            max_len = len(s_tags)
        processed_images[s_cid] = s_tags
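# The manual += 1 / = 1 counting above can be expressed more compactly with
# collections.Counter; a behavior-equivalent sketch, where the hypothetical
# tags_per_image stands in for the per-image tag lists collected above.
from collections import Counter

tag_counts = Counter()
max_len = 0
for s_tags in tags_per_image:
    tag_counts.update(s_tags)          # same effect as the two dict branches
    max_len = max(max_len, len(s_tags))
print(tag_counts.most_common(10))      # the ten most frequent tags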
import os
import tqdm
from collections import Counter
from PyUtils.json_utils import load_json_list
from PyUtils.file_utils import get_stem

dataset_directory = '/home/zwei/datasets/PublicEmotion/EMOTIC'
data_split = 'test'
text_annotation_file = os.path.join(dataset_directory, 'annotations/samples',
                                    '{}.txt'.format(data_split))

annotations_person_crop = []
annotations_whole_image = []
raw_annotation_list = load_json_list(text_annotation_file)[0]
for s_annotation in tqdm.tqdm(raw_annotation_list, desc="Processing data"):
    s_file_name = os.path.join(s_annotation['folder'], s_annotation['filename'])
    s_file_name_stem = get_stem(s_file_name)
    s_file_extension = s_file_name.split('.')[-1]
    # Normalize 'person' to a list so single- and multi-person images are
    # handled uniformly.
    if not isinstance(s_annotation['person'], list):
        s_annotation['person'] = [s_annotation['person']]
    for s_person_idx, s_person in enumerate(s_annotation['person']):
        s_bbox = s_person['body_bbox']
        annotated_categories = []
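# Downstream, each body_bbox presumably drives a person crop. A minimal sketch
# with Pillow, assuming the EMOTIC box is stored as [x1, y1, x2, y2] in pixel
# coordinates and that images live under dataset_directory/images.
from PIL import Image

def crop_person(image_path, bbox):
    # PIL's crop takes (left, upper, right, lower), matching [x1, y1, x2, y2].
    with Image.open(image_path) as img:
        return img.crop((bbox[0], bbox[1], bbox[2], bbox[3]))

# e.g. crop_person(os.path.join(dataset_directory, 'images', s_file_name), s_bbox)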
# from PyUtils.pickle_utils import loadpickle
import os
import json
import argparse
import glob
from PyUtils.json_utils import load_json_list

input_file = '/home/zwei/Dev/PastDatasets/EmotionStock/Urls/list_for_download.json'

parser = argparse.ArgumentParser()
parser.add_argument('--start_idx', '-b', default=0, type=int, help="Start index")
parser.add_argument('--end_idx', '-e', default=None, type=int, help="End index")
parser.add_argument('--input_file', '-i', default=input_file, type=str, help="Annotation file")
parser.add_argument('--output_directory', '-t', default='tmp', type=str, help="Output directory")
parser.add_argument('--nworkers', '-n', default=2, type=int, help="Number of workers")
args = parser.parse_args()

annotations = load_json_list(args.input_file)
assert 'url' in annotations[0], "url is not an attribute of the JSON records"

start_idx = args.start_idx or 0
end_idx = args.end_idx or len(annotations)
if end_idx > len(annotations):
    end_idx = len(annotations)
selected_annotations = annotations[start_idx:end_idx]
print("Downloading {0} JSON file records".format(len(selected_annotations)))

MAX_attempt = 3
if not os.path.exists(args.output_directory):
    os.makedirs(args.output_directory)
log_file = os.path.join(args.output_directory,
                        'errlog-{0}-{1}-256.txt'.format(start_idx, end_idx))
download_tuples = []
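# The download loop itself is cut off above. A hedged sketch of a worker that
# respects the MAX_attempt retry budget and the --nworkers pool size, using
# requests and multiprocessing; the (url, save_path) layout of download_tuples
# is an assumption.
import requests
from multiprocessing import Pool

def download_one(task):
    url, save_path = task
    last_err = None
    for _ in range(MAX_attempt):
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(r.content)
            return None                       # success: nothing to log
        except Exception as e:
            last_err = e
    return '{0}\t{1}'.format(url, last_err)   # failed all attempts; append to log_file

# errors = [e for e in Pool(args.nworkers).map(download_one, download_tuples) if e]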