def tbl_log(db):
    """Report stats for the LOG table: total rows, unique and total rel_filenames."""
    log.info("-----------------------------")
    collection = db.get_collection(annonutils.get_tblname('LOG'))
    unique_names = set()
    all_names = []
    for row in collection.find({}):
        unique_names.add(row['rel_filename'])
        all_names.append(row['rel_filename'])
    rpt = {
        'total_items': len(all_names),
        'unique_rel_filenames': unique_names,
        'all_rel_filenames': all_names,
        'total_unique_rel_filenames': len(unique_names),
        'total_rel_filenames': len(all_names)
    }
    log.debug("=> len(unique_rel_filenames): {}".format(len(rpt['unique_rel_filenames'])))
    log.debug("=> total_unique_rel_filenames: {}".format(rpt['total_unique_rel_filenames']))
    log.debug("total_rel_filenames: {}".format(rpt['total_rel_filenames']))
    log.debug("total_items: {}".format(rpt['total_items']))
    log.debug("---x---x---x---\n")
    return rpt
def create_modelinfo(args, cfg, db):
    """Load a modelinfo config file and upsert it into the MODELINFO table.

    Raises when `args.from_path` is missing or is not an existing file.
    """
    log.info("----------------------------->")
    from_path = args.from_path
    if not from_path:
        raise Exception('from_path not defined')
    if not os.path.exists(from_path) or not os.path.isfile(from_path):
        raise Exception('File does not exists: {}'.format(from_path))
    ## TODO: for the entire directory
    data = common.loadcfg(from_path)
    if data and len(data) > 0:
        data = {k.lower(): v for k, v in data.items()}
    ## TODO: empty data and other sanity checks
    created_on = common.now()
    data['uuid'] = common.createUUID('uuid')
    data['created_on'] = created_on
    data['timestamp'] = common.timestamp_from_datestring(created_on)
    tblname = annonutils.get_tblname('MODELINFO')
    # annonutils.create_unique_index(db, tblname, 'created_on')
    annonutils.create_unique_index(db, tblname, 'weights_path')
    collection = db.get_collection(tblname)
    collection.update_one(
        {'created_on': data['created_on']},
        {'$setOnInsert': data},
        upsert=True)
def tbl_release(db):
    """Report stats for the RELEASE table: rel_id->timestamp pairs and files processed."""
    log.info("-----------------------------")
    releases = db.get_collection(annonutils.get_tblname('RELEASE'))
    rpt = {'total_items': 0, 'rel_ids': [], 'total_annon_file_processed': 0}
    if releases:
        cur = releases.find({}, {'_id': 0})
        if cur:
            for row in cur:
                rpt['rel_ids'].append({row['rel_id']: row['timestamp']})
                rpt['total_annon_file_processed'] += row['total_annon_file_processed']
                rpt['total_items'] += 1
    log.debug("=> len(rel_ids): {}".format(len(rpt['rel_ids'])))
    log.debug("rel_ids: {}".format(rpt['rel_ids']))
    log.debug("total_annon_file_processed: {}".format(rpt['total_annon_file_processed']))
    log.debug("total_items: {}".format(rpt['total_items']))
    log.debug("---x---x---x---\n")
    return rpt
def tbl_modelinfo(db):
    """Report stats for the MODELINFO table: model uuids and weights paths."""
    log.info("-----------------------------")
    modelinfo = db.get_collection(annonutils.get_tblname('MODELINFO'))
    rpt = {
        'total_items': 0,
        'total_annon_file_processed': 0,
        'model_ids': None,
        'weights_path': None
    }
    if modelinfo:
        items = list(modelinfo.find({}, {'_id': 0}))
        rpt['total_items'] = len(items)
        rpt['model_ids'] = [rec['uuid'] for rec in items]
        rpt['weights_path'] = [rec['weights_path'] for rec in items]
        log.debug("items: {}".format(items))
        log.debug("len(items): {}".format(rpt['total_items']))
        log.debug("model_ids: {}".format(rpt['model_ids']))
        log.debug("weights_path: {}".format(rpt['weights_path']))
        log.debug("---x---x---x---\n")
    return rpt
def tbl_classinfo(db):
    """Report stats for the CLASSINFO table: unique and total label ids.

    Returns a dict with the raw sets/lists plus their counts.
    """
    log.info("-----------------------------")
    tblname = annonutils.get_tblname('CLASSINFO')
    classinfo = db.get_collection(tblname)
    cur = classinfo.find()
    rpt = {
        'total_items': 0,
        'unique_lbl_ids': set(),
        'lbl_ids': [],
        'total_unique_lbl_ids': 0,
        'total_lbl_ids': 0
    }
    # NOTE: dropped an unused local list (`lbl_ids = []`) the original declared
    # but never read or wrote.
    for item in cur:
        rpt['unique_lbl_ids'].add(item['lbl_id'])
        rpt['lbl_ids'].append(item['lbl_id'])
        rpt['total_items'] += 1
    rpt['total_unique_lbl_ids'] = len(rpt['unique_lbl_ids'])
    rpt['total_lbl_ids'] = len(rpt['lbl_ids'])
    log.debug("=> len(unique_lbl_ids): {}".format(len(rpt['unique_lbl_ids'])))
    log.debug("=> total_unique_lbl_ids: {}".format(rpt['total_unique_lbl_ids']))
    log.debug("total_lbl_ids: {}".format(rpt['total_lbl_ids']))
    log.debug("total_items: {}".format(rpt['total_items']))
    log.debug("---x---x---x---\n")
    return rpt
def save_Label(cfg, Label, dst_dir=None, db=None):
    """Persist label data: to JSON files when cfg['SAVE_TO_FILE'], else to the CLASSINFO table."""
    if len(Label) > 0:
        ## TODO
        # tblname = 'LABELS'
        # annonutils.write2db(db, tblname, list(Label.values()))
        tblname = annonutils.get_tblname('CLASSINFO')
        colors = common.random_colors(len(Label))
        class_info = annonutils.get_class_info(Label, colors=colors)
        log.info("len(Label): {}".format(len(Label)))
        log.info("Label: {}".format(Label))
        if cfg['SAVE_TO_FILE']:
            base = os.path.basename(dst_dir)
            lbl_filename = os.path.join(dst_dir, base + '-' + cfg['FILES']['LABELS'])
            log.info("lbl_filename: {}".format(lbl_filename))
            # db[tblname].insert_many(list(Label.values()))
            with open(lbl_filename, 'w') as fw:
                fw.write(json.dumps(Label))
            classinfo_filename = os.path.join(dst_dir, base + '-' + cfg['FILES']['CLASSINFO'])
            log.info("classinfo_filename, tblname: {}, {}".format(classinfo_filename, tblname))
            with open(classinfo_filename, 'w') as fw:
                json.dump(class_info, fw)
        else:
            log.info("tblname: {}".format(tblname))
            annonutils.write2db(db, tblname, class_info, idx_col='lbl_id')
def _create_modelinfo(from_path, dbname, db):
    """Load a modelinfo config, stamp identity/provenance fields, upsert into MODELINFO.

    Returns the generated uuid for the record.
    """
    data = common.loadcfg(from_path)
    if data and len(data) > 0:
        data = {k.lower(): v for k, v in data.items()}
    ## TODO: empty data and other sanity checks
    created_on = common.now()
    timestamp = common.timestamp_from_datestring(created_on)
    uuid = common.createUUID('uuid')
    data['uuid'] = uuid
    data['created_on'] = created_on
    data['timestamp'] = timestamp
    data['filename'] = from_path.split(os.path.sep)[-1]
    data['filepath'] = from_path
    data['dbname'] = dbname
    data['rel_num'] = str(data['rel_num'])
    try:
        tblname = annonutils.get_tblname('MODELINFO')
        # annonutils.create_unique_index(db, tblname, 'created_on')
        annonutils.create_unique_index(db, tblname, 'weights_path')
        collection = db.get_collection(tblname)
        collection.update_one(
            {'created_on': data['created_on']},
            {'$setOnInsert': data},
            upsert=True)
    except pymongo.errors.PyMongoError as e:
        # BUGFIX: the base PyMongoError does not guarantee a `.details`
        # attribute (only OperationFailure carries it), so the original
        # `print(e.details)` could itself raise AttributeError. Log the
        # details when present, otherwise the exception itself.
        log.error("MODELINFO upsert failed: {}".format(getattr(e, 'details', e)))
    return uuid
def get_info(args, cfg, db):
    """Return all AIDS rows (Mongo `_id` excluded), or None if the table is unavailable."""
    log.info("----------------------------->")
    aids_data = None
    tblname = annonutils.get_tblname('AIDS')
    collection = db.get_collection(tblname)
    if collection:
        # BUGFIX: the original issued two find() calls and immediately
        # discarded the first (field-projected) result; only the query whose
        # result is actually returned is kept.
        aids_data = list(collection.find({}, {'_id': False}))
    return aids_data
def tbl_aids(db):
    """
    TODO
    - cmd as user input which can be train, evaluate, predict, publish, report
    """
    log.info("-----------------------------")
    aids = db.get_collection(annonutils.get_tblname('AIDS'))
    rpt = {
        'total_items': 0,
        'dbnames': [],
        'dbnames_with_exp_id': {},
        'items': [],
        'total_annon_file_processed': 0,
        'stats': None
    }
    cmd = 'train'
    rpt['dbnames_with_exp_id'].setdefault(cmd, [])
    if aids:
        # cur = aids.find({},{'_id':0, 'anndb_id':1, 'anndb_id':1, 'dbname':1, 'classes':1})
        cur = aids.find({}, {'_id': 0})
        ## https://stackoverflow.com/questions/36229123/return-only-matched-sub-document-elements-within-a-nested-array
        ## query = {'train':{'$elemMatch':{'uuid':'exp-1e329cfa-2156-491f-b41a-171e62284cf6'}}},{'_id':0,'dbname':1,'train.$':1}
        ## aids.find(query)
        if cur:
            for doc in cur:
                rpt['items'].append(doc)
                rpt['dbnames'].append(doc['dbname'])
                exp = doc[cmd]
                exp_id = [o['uuid'] for o in exp] if len(exp) > 0 else None
                rpt['dbnames_with_exp_id'][cmd].append({doc['dbname']: exp_id})
                rpt['total_items'] += 1
    log.debug("=> len(dbnames): {}".format(len(rpt['dbnames'])))
    # log.debug('items: {}'.format(items))
    log.debug("dbnames: {}".format(rpt['dbnames']))
    log.debug("total_items: {}".format(rpt['total_items']))
    log.debug("cmd, dbnames_with_exp_id: {}, {}".format(cmd, rpt['dbnames_with_exp_id'][cmd]))
    log.debug("---x---x---x---\n")
    return rpt
def save_Image(cfg, Image, dst_dir=None, db=None):
    """Persist image records: to a JSON file when cfg['SAVE_TO_FILE'], else to the IMAGES table."""
    if len(Image) > 0:
        tblname = annonutils.get_tblname('IMAGES')
        if cfg['SAVE_TO_FILE']:
            img_filename = os.path.join(
                dst_dir, os.path.basename(dst_dir) + '-' + cfg['FILES']['IMAGES'])
            log.info("img_filename, tblname: {}, {}".format(img_filename, tblname))
            with open(img_filename, 'w') as fw:
                fw.write(json.dumps(Image))
        else:
            log.info("tblname: {}".format(tblname))
            annonutils.write2db(db, tblname, list(Image.values()), idx_col='img_id')
def classinfo_from_modelinfo(mongodb, dbname, filepath):
    """Build CLASSINFO records from a modelinfo YAML's class list and write them to the DB."""
    modelinfo = common.yaml_load(filepath)
    log.info("modelinfo: {}".format(modelinfo))
    lbl_ids = modelinfo['classes']
    # The first entry of `classes` (index 0) is skipped; presumably a
    # background/placeholder class — TODO confirm against the modelinfo schema.
    classinfo = []
    for lbl_id in lbl_ids[1:]:
        classinfo.append({
            'lbl_id': lbl_id,
            'source': 'hmd',
            'name': lbl_id
        })
    tblname = annonutils.get_tblname('CLASSINFO')
    mongodb.connect(dbname)
    mongodb.write(tblname, classinfo, idx_col='lbl_id')
def save_Annotation_Info(cfg, Annotation_Info, dst_dir=None, db=None):
    """Persist annotations (numpy-safe JSON): to file when cfg['SAVE_TO_FILE'], else to ANNOTATIONS table."""
    if len(Annotation_Info) > 0:
        tblname = annonutils.get_tblname('ANNOTATIONS')
        json_str = common.numpy_to_json(Annotation_Info)
        # log.info("json_str: {}".format(json_str))
        if cfg['SAVE_TO_FILE']:
            ant_filename = os.path.join(
                dst_dir, os.path.basename(dst_dir) + '-' + cfg['FILES']['ANNOTATIONS'])
            log.info("ant_filename, tblname: {}, {}".format(ant_filename, tblname))
            with open(ant_filename, 'w') as fw:
                # fw.write(json.dumps(Annotation_Info))
                fw.write(json_str)
        else:
            log.info("tblname: {}".format(tblname))
            annonutils.write2db(db, tblname, list(json.loads(json_str).values()), idx_col='ant_id')
def save_Error(cfg, Error, dst_dir=None, db=None):
    """Persist error records: to a JSON file when cfg['SAVE_TO_FILE'], else to the ERRORS table."""
    if len(Error) > 0:
        # log.info("Error:\n{}".format(Error))
        tblname = annonutils.get_tblname('ERRORS')
        if cfg['SAVE_TO_FILE']:
            err_filename = os.path.join(
                dst_dir, os.path.basename(dst_dir) + '-' + cfg['FILES']['ERRORS'])
            log.info("err_filename, tblname: {}, {}".format(err_filename, tblname))
            # db[tblname].insert_many(list(Error.values()))
            with open(err_filename, 'w') as fw:
                fw.write(json.dumps(Error))
        else:
            log.info("tblname: {}".format(tblname))
            # annonutils.write2db(db, tblname, list(Error.values()), idx_col='rel_filename')
            annonutils.write2db(db, tblname, list(Error.values()), idx_col='rel_filepath')
def save_to_annon_db(cfg, aidsdata):
    """Save to Annotation DB
    """
    ## NOTE(review): another `save_to_annon_db` with a different DBCFG layout
    ## is defined later in this file and shadows this one at import time —
    ## confirm which definition is intended to survive.
    DBCFG = cfg['DBCFG']
    mclient = MongoClient('mongodb://' + DBCFG['HOST'] + ':' + str(DBCFG['PORT']))
    db = mclient[DBCFG['DBNAME']]
    tblname = annonutils.get_tblname('AIDS')
    annonutils.create_unique_index(db, tblname, 'created_on')
    collection = db.get_collection(tblname)
    collection.update_one(
        {'created_on': aidsdata['created_on']},
        {'$setOnInsert': aidsdata},
        upsert=True)
    mclient.close()
def save_to_annon_db(cfg, aidsdata):
    """Save to Annotation DB
    """
    ANNONCFG = cfg['DBCFG']['ANNONCFG']
    mclient = MongoClient('mongodb://' + ANNONCFG['host'] + ':' + str(ANNONCFG['port']))
    dbname = ANNONCFG['dbname']
    log.info("ANNONCFG['dbname']: {}".format(dbname))
    db = mclient[dbname]
    tblname = annonutils.get_tblname('AIDS')
    annonutils.create_unique_index(db, tblname, 'created_on')
    collection = db.get_collection(tblname)
    collection.update_one(
        {'created_on': aidsdata['created_on']},
        {'$setOnInsert': aidsdata},
        upsert=True)
    mclient.close()
def tbl_annotations(db):
    """Report stats for the ANNOTATIONS table.

    NOTE: the set-valued keys ('unique_labels', 'unique_images',
    'total_images') are replaced by their counts before returning.
    """
    log.info("-----------------------------")
    tblname = annonutils.get_tblname('ANNOTATIONS')
    stats = db.get_collection(tblname)
    rpt = {
        'total_items': 0,
        'total_ant': 0,
        'unique_labels': set(),
        'unique_images': set(),
        'total_images': set(),
        'unique_rel_filenames': set(),
        'total_unique_rel_filenames': 0
    }
    for ant in stats.find({}):
        rpt['unique_rel_filenames'].add(ant['rel_filename'])
        rpt['unique_labels'].add(ant['lbl_id'])
        rpt['unique_images'].add(ant['image_name'])
        rpt['total_images'].add(ant['img_id'])
        rpt['total_ant'] += 1
        rpt['total_items'] += 1
    rpt['unique_labels'] = len(rpt['unique_labels'])
    rpt['unique_images'] = len(rpt['unique_images'])
    rpt['total_images'] = len(rpt['total_images'])
    rpt['total_unique_rel_filenames'] = len(rpt['unique_rel_filenames'])
    log.debug("=> len(unique_rel_filenames): {}".format(len(rpt['unique_rel_filenames'])))
    log.debug("=> total_unique_rel_filenames: {}".format(rpt['total_unique_rel_filenames']))
    log.debug("* total_ant: {}".format(rpt['total_ant']))
    log.debug("** len(total_images): {}".format(rpt['total_images']))
    log.debug("** len(unique_images): {}".format(rpt['unique_images']))
    log.debug("** len(unique_labels): {}".format(rpt['unique_labels']))
    log.debug("---x---x---x---\n")
    return rpt
def save_Stats(cfg, Stats, Total_Stats, dataset=None, annon_filepath=None, dst_dir=None, db=None):
    """Persist stats: write stats files and archive the annotation file when
    cfg['SAVE_TO_FILE'], else write the merged stats record to the STATS table."""
    stats_data = json.loads(common.numpy_to_json(Stats))
    total_stats_data = json.loads(common.numpy_to_json(Total_Stats))
    stats_total_stats_data = common.merge_dict([stats_data, total_stats_data])
    tblname = annonutils.get_tblname('STATS')
    if cfg['SAVE_TO_FILE']:
        ## Stats files
        create_stats_files(cfg, Stats, Total_Stats, dst_dir)
        ## Move processed annotation file to archive folder
        log.info("annon_filepath, tblname: {}, {}".format(annon_filepath, tblname))
        rel_dir = cfg['BASE_PATH']['RELEASE_DIR']
        with open(os.path.join(rel_dir, os.path.basename(annon_filepath)), 'w') as fw:
            json.dump(dataset, fw)
    else:
        log.info("tblname: {}".format(tblname))
        # annonutils.write2db(db, tblname, [stats_total_stats_data], idx_col='rel_filename')
        annonutils.write2db(db, tblname, [stats_total_stats_data], idx_col='rel_filepath')
def tbl_images(db):
    """Report stats for the IMAGES table.

    NOTE: 'unique_images' and 'total_images' are replaced by their counts
    before returning.
    """
    log.info("-----------------------------")
    tblname = annonutils.get_tblname('IMAGES')
    images = db.get_collection(tblname)
    rpt = {
        'total_items': 0,
        'unique_rel_filenames': set(),
        'total_unique_rel_filenames': 0,
        'unique_images': set(),
        'total_images': set(),
        'total_img': 0
    }
    for img in images.find({}):
        rpt['unique_rel_filenames'].add(img['rel_filename'])
        rpt['unique_images'].add(img['filename'])
        rpt['total_images'].add(img['img_id'])
        rpt['total_img'] += 1
        rpt['total_items'] += 1
    rpt['unique_images'] = len(rpt['unique_images'])
    rpt['total_images'] = len(rpt['total_images'])
    rpt['total_unique_rel_filenames'] = len(rpt['unique_rel_filenames'])
    log.debug("=> len(unique_rel_filenames): {}".format(len(rpt['unique_rel_filenames'])))
    log.debug("=> total_unique_rel_filenames: {}".format(rpt['total_unique_rel_filenames']))
    log.debug('** total_img: {}'.format(rpt['total_img']))
    log.debug("len(unique_images): {}".format(rpt['unique_images']))
    log.debug("len(total_images): {}".format(rpt['total_images']))
    log.debug("---x---x---x---\n")
    return rpt
def create_db(cfg, args, datacfg, aids):
    """release the AIDS database i.e. creates the PXL DB (AI Datasets)
    and create respective entries in AIDS table in annon database
    """
    log.info("-----------------------------")
    by = args.by
    splits = datacfg['splits']
    DBCFG = cfg['DBCFG']
    PXLCFG = DBCFG['PXLCFG']
    mclient = MongoClient('mongodb://' + PXLCFG['host'] + ':' + str(PXLCFG['port']))
    dbname = 'PXL-' + cfg['TIMESTAMP']
    log.info("dbname: {}".format(dbname))
    db = mclient[dbname]
    uuid_aids = None
    if len(aids) > 0:
        uuid_aids = common.createUUID('aids')
        ## Save aids - AI Datasets
        for split in splits:
            for tbl in aids[split]:
                # log.info("aids[{}][{}]".format(split, tbl))
                log.info("split: {}".format(split))
                if aids[split][tbl] is not None:
                    tblname = annonutils.get_tblname(tbl)
                    log.info("tblname: {}".format(tblname))
                    log.info("aids[split][tbl]: {}".format(type(aids[split][tbl])))
                    if isinstance(aids[split][tbl], dict):
                        log.info('dict')
                        data = list(aids[split][tbl].values())
                        # log.info(aids[split][tbl]['img-19a68326-3468-4b1e-9fc6-5a739523c6f6'])
                    elif isinstance(aids[split][tbl], list):
                        log.info('list')
                        data = aids[split][tbl]
                    log.info("tblname, type(data), len(data): {}, {}, {}".format(
                        tblname, type(data), len(data)))
                    if len(data) > 0:
                        for doc in data:
                            doc['dbid'] = uuid_aids
                            doc['timestamp'] = cfg['TIMESTAMP']
                            doc['subset'] = split
                            if tblname == 'STATS':
                                log.info('doc: {}'.format(doc))
                            # log.debug('doc: {}'.format(doc))
                            annonutils.write2db(db, tblname, doc)
        created_on = common.now()
        uuid_rel = common.createUUID('rel')
        datacfg['dbid'] = uuid_aids
        datacfg['dbname'] = dbname
        datacfg['created_on'] = created_on
        datacfg['modified_on'] = None
        datacfg['anndb_id'] = dbname
        datacfg['timestamp'] = cfg['TIMESTAMP']
        datacfg['anndb_rel_id'] = None
        datacfg['rel_id'] = uuid_rel
        datacfg['log_dir'] = dbname
        datacfg['rel_type'] = 'aids'
        datacfg['creator'] = by.upper()
        log.info("datacfg: {}".format(datacfg))
        tblname = annonutils.get_tblname('AIDS')
        annonutils.create_unique_index(db, tblname, 'created_on')
        collection = db.get_collection(tblname)
        collection.update_one(
            {'created_on': datacfg['created_on']},
            {'$setOnInsert': datacfg},
            upsert=True)
        tblname = annonutils.get_tblname('CLASSINFO')
        collection = db.get_collection(tblname)
        annonutils.write2db(db, tblname, datacfg['classinfo'], idx_col='lbl_id')
        save_to_annon_db(cfg, aidsdata=datacfg)
    ## TODO:
    ## generate STATS, STATSLABEL and respective SUMMARY csv files for AIDS
    mclient.close()
    return dbname
def release_db(cfg, args):
    """Entry point to parse VIA based annotations for creating and saving
    basic data structures - IMAGES, ANNOTATIONS, LABELS and related data

    Implements the DRC - Design Rule Checks and acts as a gatekeeper,
    also reports any possible errors
    Create data structures to be parsed in 2nd pass to create the AIDS -
    AI Datasets with the actual splits

    Test Cases:
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/images-p1-230119_AT1_via205_250119.json
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/
    """
    ## Check required args
    for d in ['from_path']:
        if d not in args:
            log.info("'{}' is not present.\n".format(d))
            sys.exit(-1)
    if not os.path.exists(args.from_path):
        raise NotADirectoryError("{}".format(args.from_path))
    dbname = None
    if 'to_path' in args and not os.path.exists(args.to_path):
        dbname = args.to_path
    from_path = args.from_path
    tic = time.time()
    log.info("\nrelease_db:-----------------------------")
    base_from_path = common.getBasePath(from_path)
    log.info("base_from_path: {}".format(base_from_path))
    uuid_rel = common.createUUID('rel')
    timestamp = cfg['RELEASE']['COLS']['timestamp'] = cfg['LOG']['COLS']['timestamp'] = cfg['TIMESTAMP']
    cfg['RELEASE']['COLS']['rel_id'] = cfg['LOG']['COLS']['rel_id'] = uuid_rel
    cfg['SAVE_TO_FILE'] = False
    log.info("-------")
    log.info("cfg: {}".format(cfg))
    if os.path.isdir(from_path):
        ## normalizes and takes care of path ending with slash or not as the user input
        files = glob.glob(os.path.join(base_from_path, cfg['ANNON_FILENAME_PREFIX'] + '*.json'))
    else:
        files = [from_path]
    total_files = len(files)
    log.info("-------")
    log.debug("\nfiles: {}".format(files))
    log.info("-------")
    log.info("\nTotal files to process =======>: {}".format(total_files))
    total_annon_file_processed = 0
    total_annon_file_existed = 0
    DBCFG = cfg['DBCFG']
    ANNONCFG = DBCFG['ANNONCFG']
    mclient = MongoClient('mongodb://' + ANNONCFG['host'] + ':' + str(ANNONCFG['port']))
    dbname = ANNONCFG['dbname'] if not dbname else dbname
    log.info("dbname: {}".format(dbname))
    db = mclient[dbname]
    rel_tblname = annonutils.get_tblname('RELEASE')
    annonutils.create_unique_index(db, rel_tblname, 'rel_id')
    rel_collection = db.get_collection(rel_tblname)
    log_tblname = annonutils.get_tblname('LOG')
    annonutils.create_unique_index(db, log_tblname, 'created_on')
    log_collection = db.get_collection(log_tblname)
    for annon_filepath in files:
        log.info("-------")
        tic2 = time.time()
        annon_filename = os.path.basename(annon_filepath)
        ## check if the file is parsed: skip the processing in normal mode of the already parsed file
        # res = log_collection.find_one({'rel_filename': annon_filename})
        res = log_collection.find_one({'rel_filepath': annon_filepath})
        ## TODO: in update mode
        ## delete the entries of annotations and images before inserting the values of the same file again
        if not res:
            log.info(" annon_filename: {} \n annon_filepath: {}".format(annon_filename, annon_filepath))
            created_on = common.now()
            cfg['RELEASE']['COLS']['created_on'] = cfg['LOG']['COLS']['created_on'] = created_on
            log.info("created_on: {}".format(created_on))
            cfg['LOG']['COLS']['rel_filename'] = annon_filename
            cfg['LOG']['COLS']['rel_filepath'] = annon_filepath
            annondata = annon_parser.parse_annon_file(cfg, annon_filepath, base_from_path)
            total_annon_file_processed += 1
            save_parsed_data(cfg, annondata, db=db)
            cfg['LOG']['COLS']['modified_on'] = None
            toc2 = time.time()
            cfg['LOG']['COLS']['total_exec_time'] = '{:0.2f}s'.format(toc2 - tic)
            ## if exception occurs or terminate, save what has been processed so for in the log instead of one-shot update of log out of for loop
            ## this helps to recover from the abrupt termination and start from previous successfuly processed file
            log_collection.update_one(
                {'created_on': created_on},
                {'$setOnInsert': cfg['LOG']['COLS']},
                upsert=True)
            log.info("=======> Total Execution Time: {:0.2f}s, Processed files: {}, Remaning files: {}".format(
                toc2 - tic2, total_annon_file_processed, total_files - total_annon_file_processed))
            ## Update the LOG table here itself
        else:
            log.info("Already Exist in Database: annon_filename: {} \n annon_filepath: {}".format(annon_filename, annon_filepath))
            log.info("Use update / delete command to process this file again")
            total_annon_file_existed += 1
    cfg['RELEASE']['COLS']['total_annon_file_processed'] = total_annon_file_processed
    # cfg['RELEASE']['COLS']['total_exec_time'] = '{:0.2f}s'.format(time.time() - tic)
    cfg['RELEASE']['COLS']['total_exec_time_in_sec'] = '{:0.2f}'.format(time.time() - tic)
    if total_annon_file_processed:
        rel_collection.update_one(
            {'rel_id': uuid_rel},
            {'$setOnInsert': cfg['RELEASE']['COLS']},
            upsert=True)
    log.info("total_files, total_annon_file_processed, total_annon_file_existed: {} = {} + {}".format(
        total_files, total_annon_file_processed, total_annon_file_existed))
    mclient.close()
    return timestamp
def create_db(cfg, args, datacfg):
    """release the AIDS database i.e. creates the PXL DB (AI Datasets)
    and create respective entries in AIDS table in annon database
    """
    log.info("-----------------------------")
    by = args.by
    db_images, db_annon, latest_release_info, lbl_ids = get_annon_data(cfg)
    aids, datacfg = prepare_aids(cfg, db_images, db_annon, lbl_ids, datacfg)
    DBCFG = cfg['DBCFG']
    mclient = MongoClient('mongodb://' + DBCFG['HOST'] + ':' + str(DBCFG['PORT']))
    rel_timestamp = latest_release_info['timestamp']
    DBNAME = 'PXL-' + rel_timestamp + '_' + cfg['TIMESTAMP']
    log.info("DBNAME: {}".format(DBNAME))
    db = mclient[DBNAME]
    uuid_aids = None
    if len(aids) > 0:
        uuid_aids = common.createUUID('aids')
        AIDS_SPLITS_CRITERIA = cfg['AIDS_SPLITS_CRITERIA'][cfg['AIDS_SPLITS_CRITERIA']['USE']]
        splits = AIDS_SPLITS_CRITERIA[0]  ## directory names
        ## Save aids - AI Datasets
        for split in splits:
            for tbl in aids[split]:
                log.info("aids[{}][{}]".format(split, tbl))
                tblname = annonutils.get_tblname(tbl)
                log.info("tblname: {}".format(tblname))
                log.info("aids[split][tbl]: {}".format(type(aids[split][tbl])))
                if isinstance(aids[split][tbl], dict):
                    log.info('dict')
                    data = list(aids[split][tbl].values())
                    # log.info(aids[split][tbl]['img-19a68326-3468-4b1e-9fc6-5a739523c6f6'])
                elif isinstance(aids[split][tbl], list):
                    log.info('list')
                    data = aids[split][tbl]
                log.info("tblname, type(data): {}, {}".format(tblname, type(data)))
                for doc in data:
                    # if tblname == 'STATS':
                    #     log.info(doc)
                    doc['dbid'] = uuid_aids
                    doc['timestamp'] = cfg['TIMESTAMP']
                    doc['subset'] = split
                    annonutils.write2db(db, tblname, doc)
        created_on = common.now()
        uuid_rel = common.createUUID('rel')
        datacfg['dbid'] = uuid_aids
        datacfg['dbname'] = DBNAME
        datacfg['created_on'] = created_on
        datacfg['modified_on'] = None
        datacfg['anndb_id'] = rel_timestamp
        datacfg['timestamp'] = cfg['TIMESTAMP']
        datacfg['anndb_rel_id'] = latest_release_info['rel_id']
        datacfg['rel_id'] = uuid_rel
        datacfg['log_dir'] = DBNAME
        datacfg['rel_type'] = 'aids'
        datacfg['creator'] = by.upper()
        tblname = annonutils.get_tblname('AIDS')
        annonutils.create_unique_index(db, tblname, 'created_on')
        collection = db.get_collection(tblname)
        collection.update_one(
            {'created_on': datacfg['created_on']},
            {'$setOnInsert': datacfg},
            upsert=True)
        tblname = annonutils.get_tblname('CLASSINFO')
        collection = db.get_collection(tblname)
        annonutils.write2db(db, tblname, datacfg['classinfo'], idx_col='lbl_id')
        save_to_annon_db(cfg, aidsdata=datacfg)
    ## TODO:
    ## generate STATS, STATSLABEL and respective SUMMARY csv files for AIDS
    mclient.close()
    return uuid_aids
def release_files(cfg, args):
    """Entry point to parse VIA based annotations for creating and saving
    basic data structures - IMAGES, ANNOTATIONS, LABELS and related data

    Implements the DRC - Design Rule Checks and acts as a gatekeeper,
    also reports any possible errors
    Create data structures to be parsed in 2nd pass to create the AIDS -
    AI Datasets with the actual splits

    Test Cases:
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/images-p1-230119_AT1_via205_250119.json
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/
    """
    ## Check required args
    for d in ['from_path', 'to_path']:
        if d not in args:
            log.info("'{}' is not present.\n".format(d))
            sys.exit(-1)
    if not os.path.exists(args.from_path):
        raise NotADirectoryError("{}".format(args.from_path))
    if not os.path.exists(args.to_path):
        raise NotADirectoryError("{}".format(args.to_path))
    from_path, to_path = args.from_path, args.to_path
    tic = time.time()
    log.info("\nrelease_db:-----------------------------")
    cfg['TIMESTAMP'] = ("{:%d%m%y_%H%M%S}").format(datetime.datetime.now())
    base_from_path = common.getBasePath(from_path)
    log.info("base_from_path: {}".format(base_from_path))
    base_to_path = common.getBasePath(to_path)
    log.info("base_to_path: {}".format(base_to_path))
    cfg['LOG']['COLS']['timestamp'] = cfg['TIMESTAMP']
    ## Create Base Directories
    db_dir = os.path.join(base_to_path, cfg['BASEDIR_NAME']['DB'])
    log.info("db_dir: {}".format(db_dir))
    common.mkdir_p(db_dir)
    db_data_dir = os.path.join(db_dir, cfg['TIMESTAMP'])
    log.info("ANNDB db_data_dir: {}".format(db_data_dir))
    common.mkdir_p(db_data_dir)
    rel_dir = os.path.join(base_to_path, cfg['BASEDIR_NAME']['RELEASE'], cfg['TIMESTAMP'])
    log.info("rel_dir: {}".format(rel_dir))
    common.mkdir_p(rel_dir)
    log_dir = os.path.join(base_to_path, cfg['BASEDIR_NAME']['LOG'])
    log.info("log_dir: {}".format(log_dir))
    common.mkdir_p(log_dir)
    ant_data_dir = os.path.join(db_data_dir, cfg["BASEDIR_NAME"]["ANNON"])
    log.info("ant_data_dir: {}".format(ant_data_dir))
    common.mkdir_p(ant_data_dir)
    cfg['BASE_PATH']['DB_DIR'] = db_dir
    cfg['BASE_PATH']['DB_DATA_DIR'] = db_data_dir
    cfg['BASE_PATH']['RELEASE_DIR'] = rel_dir
    cfg['BASE_PATH']['LOG_DIR'] = log_dir
    cfg['BASE_PATH']['ANT_DATA_DIR'] = ant_data_dir
    log.info("-------")
    log.info("cfg: {}".format(cfg))
    if os.path.isdir(from_path):
        ## normalizes and takes care of path ending with slash or not as the user input
        files = glob.glob(os.path.join(base_from_path, cfg['ANNON_FILENAME_PREFIX'] + '*.json'))
    else:
        files = [from_path]
    log.info("-------")
    log.info("\nfiles: {}".format(files))
    log.info("-------")
    log.info("\nTotal files to process =======>: {}".format(len(files)))
    total_annon_file_processed = 0
    # NOTE: removed an unused local (`log_tblname`) the original computed but
    # never used in this function.
    for annon_filepath in files:
        log.info("-------")
        tic2 = time.time()
        annon_filename = os.path.basename(annon_filepath)
        ## TODO: check if the file is parsed: skip the processing in normal mode of the already parsed file
        res = False
        ## TODO: in update mode
        ## delete the entries of annotations and images before inserting the values of the same file again
        if not res:
            log.info(" annon_filename: {} \n annon_filepath: {}".format(annon_filename, annon_filepath))
            created_on = cfg['LOG']['COLS']['created_on'] = common.now()
            log.info("created_on: {}".format(created_on))
            cfg['LOG']['COLS']['rel_filename'] = annon_filename
            ## BUGFIX: the original assigned the filepath to the
            ## 'rel_filename' key a second time, clobbering the filename;
            ## store the full path under 'rel_filepath' as release_db does.
            cfg['LOG']['COLS']['rel_filepath'] = annon_filepath
            annondata = annon_parser.parse_annon_file(cfg, annon_filepath, base_from_path)
            total_annon_file_processed += 1
            ## if the annon_filepath is absolute path, base_bast gets ignored and thus the dst_dir is the file's directory
            ## dst_dir = os.path.join(base_from_path, os.path.splitext(annon_filepath)[0])
            ## dst_dir = os.path.join(db_dir, os.path.splitext(annon_filepath)[0])
            dst_dir = os.path.join(rel_dir, os.path.splitext(annon_filename)[0])
            ## log.info("dst_dir: {}".format(dst_dir))
            common.mkdir_p(dst_dir)
            save_parsed_data(cfg, annondata, dst_dir=dst_dir, ant_data_dir=ant_data_dir, annon_filepath=annon_filepath)
            cfg['LOG']['COLS']['modified_on'] = None
            toc2 = time.time()
            total_exec_time = '{:0.2f}s'.format(toc2 - tic)
            cfg['LOG']['COLS']['total_exec_time'] = total_exec_time
            ## TODO:
            ## if exception occurs or terminate, save what has been processed so for in the log instead of one-shot update of log out of for loop
            ## this helps to recover from the abrupt termination and start from previous successfuly processed file
            log.info("=======> Total Execution Time: {:0.2f}s, Processed files: {}, Remaning files: {}".format(
                toc2 - tic2, total_annon_file_processed, len(files) - total_annon_file_processed))
            ## Update the LOG table here itself
        else:
            log.info("Already Exist in Database: annon_filename: {} \n annon_filepath: {}".format(annon_filename, annon_filepath))
            log.info("Use update / delete command to process this file again")
    ## Every execution of the script is a release
    ## For every release, recreate the entire database either for directory or specific file release
    ## create and save db data i.e. consolidated data with index structure
    db_data = create_db_data(cfg, rel_dir)
    save_db_data(cfg, db_dir, db_data)
    log.info("total_annon_file_processed: {}".format(total_annon_file_processed))
    return db_data_dir
def tbl_errors(db):
    """Report aggregate error stats from the ERRORS table (rows with has_error=True)."""
    log.debug("\nErrors::")
    log.debug("--------")
    tblname = annonutils.get_tblname('ERRORS')
    stats = db.get_collection(tblname)
    cur = stats.find({'has_error': True})
    rpt = {
        'total_items': 0,
        'total_error_ant': 0,
        'total_error_empty_ant': 0,
        'total_error_img_notfound': 0,
        'total_error_img_reading': 0,
        'total_error_unlabeled_ant': 0,
        'total_error_in_rel_filename': 0,
        'errors_for_reporting': None,
        'unique_rel_filenames': set(),
        'all_rel_filenames': [],
        'total_unique_rel_filenames': 0,
        'total_rel_filenames': 0
    }
    # rpt keys that are accumulated 1:1 from the matching item keys
    counter_keys = (
        'total_error_ant',
        'total_error_empty_ant',
        'total_error_img_notfound',
        'total_error_img_reading',
        'total_error_unlabeled_ant',
    )
    errors_for_reporting = {}
    for item in cur:
        rpt['unique_rel_filenames'].add(item['rel_filename'])
        rpt['all_rel_filenames'].append(item['rel_filename'])
        for key in counter_keys:
            rpt[key] += item[key]
        rpt['total_error_in_rel_filename'] += 1
        rpt['total_items'] += 1
        # always true given the query filter; kept as a guard
        if item['has_error']:
            error_types = appcfg['ERROR_TYPES']
            rel_filename = os.path.splitext(item['rel_filename'])[0]
            if rel_filename not in errors_for_reporting:
                errors_for_reporting[rel_filename] = {}
            errors_for_reporting[rel_filename]['rel_filename'] = item['rel_filename']
            for et in error_types:
                if et in item and len(item[et]) > 0:
                    errors_for_reporting[rel_filename][et] = item[et]
    rpt['errors_for_reporting'] = errors_for_reporting
    rpt['total_unique_rel_filenames'] = len(rpt['unique_rel_filenames'])
    rpt['total_rel_filenames'] = len(rpt['all_rel_filenames'])
    log.debug("=> len(unique_rel_filenames): {}".format(len(rpt['unique_rel_filenames'])))
    log.debug("=> total_unique_rel_filenames: {}".format(rpt['total_unique_rel_filenames']))
    log.debug("total_rel_filenames: {}".format(rpt['total_rel_filenames']))
    log.debug('total_error_ant: {}'.format(rpt['total_error_ant']))
    log.debug('total_error_empty_ant: {}'.format(rpt['total_error_empty_ant']))
    log.debug('total_error_img_notfound: {}'.format(rpt['total_error_img_notfound']))
    log.debug('total_error_img_reading: {}'.format(rpt['total_error_img_reading']))
    log.debug('total_error_unlabeled_ant: {}'.format(rpt['total_error_unlabeled_ant']))
    log.debug('total_error_in_rel_filename: {}'.format(rpt['total_error_in_rel_filename']))
    log.debug("---x---x---x---\n")
    return rpt
def create_experiment(args, cfg, db):
    """Create an experiment entry and push it into the AIDS collection.

    Loads the experiment config from ``args.from_path``, lower-cases its
    keys, stamps it with uuid/created_on/timestamp/log_dir, and pushes it
    under the ``args.exp`` key onto the AIDS document matching ``args.to``.

    Returns a dict ``{'dbid': ..., 'exp_id': <uuid>}``.

    Raises:
        Exception: if any required argument is missing, the file does not
            exist, or the loaded config is empty.

    BUGFIX: previously an empty config fell through the ``if data`` guards
    and the final ``res`` dict referenced an unassigned ``uuid`` (NameError).
    Empty data now raises an explicit, descriptive Exception instead.
    """
    log.info("----------------------------->")
    from_path = args.from_path
    if not from_path:
        raise Exception('--from not defined')
    dbid = args.to
    if not dbid:
        raise Exception('--to not defined')
    exp_type = args.exp
    if not exp_type:
        raise Exception('--exp not defined')
    if not os.path.exists(from_path) or not os.path.isfile(from_path):
        raise Exception('File does not exists: {}'.format(from_path))

    ## TODO: for the entire directory
    data = common.loadcfg(from_path)
    if not data or len(data) == 0:
        raise Exception('Empty experiment config: {}'.format(from_path))
    data = {k.lower(): v for k, v in data.items()}

    # When the config nests the experiment under its type key, use that
    # sub-section; otherwise use the whole config as-is.
    if exp_type in data:
        data = data[exp_type]
    if not data or len(data) == 0:
        raise Exception(
            'Empty experiment config for exp type: {}'.format(exp_type))
    data = {k.lower(): v for k, v in data.items()}

    created_on = common.now()
    timestamp = common.timestamp_from_datestring(created_on)
    uuid = common.createUUID('uuid')
    data['uuid'] = uuid
    data['created_on'] = created_on
    data['timestamp'] = timestamp
    # Log directory is derived from the network architecture and timestamp.
    data['log_dir'] = os.path.join(data['dnnarch'], timestamp)

    tblname = annonutils.get_tblname('AIDS')
    collection = db.get_collection(tblname)
    expdata = {}
    expdata[exp_type] = data  ## {'train':data}
    log.info("data:{}".format(expdata))
    ## TODO if collection does not exist raise error
    collection.update_one(
        {'dbid': dbid}
        ,{'$push': expdata}
    )
    res = {
        'dbid': dbid
        ,'exp_id': uuid
    }
    return res
def create_experiment(args, cfg):
    """Create an experiment (plus its model-info) and register it in MongoDB.

    NOTE(review): this shadows the earlier ``create_experiment(args, cfg, db)``
    if both live at module level in the same file — confirm only one is
    exported/used.

    Loads the experiment config from ``args.from_path``, resolves and loads
    the referenced model-info file from ``$AI_CFG/model``, stamps both with
    the same uuid/created_on/timestamp, upserts the experiment document into
    the collection named after ``args.exp``, and pushes the uuid onto the
    AIDS document for ``args.to``.

    Side effects: mutates ``args.from_path`` (repointed at the model-info
    file before the second ``check_args`` call) and writes to two
    collections.

    Returns the generated experiment uuid.
    """
    log.info("----------------------------->")
    from_path = args.from_path
    dbname = args.to
    exp_type = args.exp
    DBCFG = cfg['DBCFG']
    PXLCFG = DBCFG['PXLCFG']
    mclient = MongoClient('mongodb://'+PXLCFG['host']+':'+str(PXLCFG['port']))
    # Validates the experiment args/file; presumably raises on failure —
    # TODO confirm check_args contract.
    check_args('experiment', args, cfg)
    expdata = common.loadcfg(from_path)
    if expdata and len(expdata) > 0:
        # Normalize top-level keys to lowercase.
        expdata = {k.lower():v for k,v in expdata.items()}
    creator = 'AIE3'  # default creator when the config does not name one
    if 'creator' in expdata:
        creator = expdata['creator']
    if exp_type in expdata:
        # Use the exp-type sub-section when the config nests it.
        expdata = expdata[exp_type]
        if expdata and len(expdata) > 0:
            expdata = {k.lower():v for k,v in expdata.items()}
    # Model-info file lives under $AI_CFG/model; 'model_info' is its filename.
    modelinfo_abspath = os.path.join(os.getenv('AI_CFG'), 'model')
    modelinfo_filepath = os.path.join(modelinfo_abspath, expdata['model_info'])
    # Repoint args at the model-info file so check_args validates it too.
    args.from_path = modelinfo_filepath
    check_args('modelinfo', args, cfg)
    created_on = common.now()
    timestamp = common.timestamp_from_datestring(created_on)
    uuid = common.createUUID(exp_type)
    expdata['uuid'] = uuid
    expdata['created_on'] = created_on
    expdata['timestamp'] = timestamp
    expdata['creator'] = creator
    expdata['filename'] = from_path.split(os.path.sep)[-1]
    expdata['filepath'] = from_path
    expdata['dbname'] = dbname
    log_dir = os.path.join(expdata['dnnarch'], timestamp)
    expdata['log_dir'] = log_dir
    # The model-info shares the experiment's uuid and timestamps so the two
    # documents can be correlated later.
    modelinfo = common.loadcfg(modelinfo_filepath)
    if modelinfo and len(modelinfo) > 0:
        modelinfo = {k.lower():v for k,v in modelinfo.items()}
    modelinfo['uuid'] = uuid
    modelinfo['created_on'] = created_on
    modelinfo['timestamp'] = timestamp
    modelinfo['filename'] = expdata['model_info']
    modelinfo['filepath'] = modelinfo_filepath
    expdata['modelinfo'] = modelinfo
    log.info("expdata:{}".format(expdata))
    db = mclient[dbname]
    # Collection is named after the experiment type (e.g. TRAIN/EVALUATE).
    tblname = annonutils.get_tblname(exp_type.upper())
    collection = db.get_collection(tblname)
    # Upsert keyed on created_on: insert-only, never overwrites an existing doc.
    collection.update_one(
        {'created_on': expdata['created_on']}
        ,{'$setOnInsert': expdata}
        ,upsert=True
    )
    # Register the experiment uuid on the AIDS document for this dbname.
    aidsdata = {}
    aidsdata[exp_type] = uuid
    tblname = annonutils.get_tblname('AIDS')
    collection = db.get_collection(tblname)
    collection.update_one(
        {'dbname': dbname}
        ,{'$push': aidsdata}
    )
    mclient.close()
    return uuid
def load_data_from_db(self):
    """Populate ``self.dataset`` from the annotation database.

    Loads the ANNOTATIONS, IMAGES and CLASSINFO collections (annotations
    and images optionally filtered by ``self.subset``) plus the RELEASE
    metadata, storing them under the 'annotations', 'images', 'categories'
    and 'release' keys of ``self.dataset``.

    TODO: 'split' is replaced with 'subset'.
    """
    log.info("-------------------------------->")
    import pymongo
    from pymongo import MongoClient
    import arrow

    cfg = self.dbcfg
    log.debug("dbcfg: {}".format(cfg))
    client = MongoClient('mongodb://' + cfg['host'] + ':' + str(cfg['port']))
    db = client[cfg['dbname']]

    # Restrict annotations and images to the requested subset when one is set.
    subset_filter = {'subset': self.subset} if self.subset else {}
    ann_query = dict(subset_filter)
    img_query = dict(subset_filter)
    cls_query = {}

    ann_coll = db.get_collection(annonutils.get_tblname('ANNOTATIONS'))
    self.dataset['annotations'] = list(ann_coll.find(ann_query, {'_id': 0}))

    img_coll = db.get_collection(annonutils.get_tblname('IMAGES'))
    self.dataset['images'] = list(img_coll.find(img_query, {'_id': 0}))

    ## sorting is critical to avoid label mismatch issues
    ## https://stackoverflow.com/questions/8109122/how-to-sort-mongodb-with-pymongo
    cls_coll = db.get_collection(annonutils.get_tblname('CLASSINFO'))
    categories = list(
        cls_coll.find(cls_query, {'_id': 0}).sort('lbl_id', pymongo.ASCENDING))
    log.info("classinfo: {}".format(categories))
    self.dataset['categories'] = categories

    ## get RELEASE data, newest first by created_on
    self.dataset['release'] = None
    rel_coll = db.get_collection(annonutils.get_tblname('RELEASE'))
    if rel_coll:
        releases = list(rel_coll.find({'rel_type': 'annon'}, {'_id': False}))
        log.info("len(release): {}".format(len(releases)))
        ## 'YYYY-MM-DD HH:mm:ss ZZ'
        releases.sort(
            key=lambda r: arrow.get(r['created_on'],
                                    common._date_format_).date(),
            reverse=True)
        self.dataset['release'] = releases

    client.close()
def tbl_stats(db):
    """Aggregate per-file statistics from the STATS collection into a report.

    Totals annotation/image/label counts and error counters over all
    documents, tracks unique and duplicate rel_filenames, records files
    with zero annotations in 'error_rel_filenames', and sums per-type
    annotation counts into dynamically created 'total_ant_<type>' keys.
    Finally adjusts 'total_ant' down by the unlabeled and error counts.

    Returns the report dict.
    """
    log.info("-----------------------------")
    tblname = annonutils.get_tblname('STATS')
    stats = db.get_collection(tblname)
    cur = stats.find({})
    rpt = {
        'total_items': 0,
        'total_ant': 0,
        'total_error_ant': 0,
        'total_error_img_notfound': 0,
        'total_error_img_reading': 0,
        'total_error_unlabeled_ant': 0,
        'total_img': 0,
        'total_lbl': 0,
        'unique_rel_filenames': set(),
        'all_rel_filenames': [],
        'total_unique_rel_filenames': 0,
        'total_rel_filenames': 0,
        'error_rel_filenames': set()
    }
    for item in cur:
        rpt['unique_rel_filenames'].add(item['rel_filename'])
        rpt['all_rel_filenames'].append(item['rel_filename'])
        rpt['total_error_ant'] += item['total_error_ant']
        rpt['total_error_img_notfound'] += item['total_error_img_notfound']
        rpt['total_error_img_reading'] += item['total_error_img_reading']
        rpt['total_error_unlabeled_ant'] += item['total_error_unlabeled_ant']
        rpt['total_img'] += item['total_img']
        rpt['total_lbl'] += item['total_lbl']
        rpt['total_ant'] += item['total_ant']
        rpt['total_items'] += 1
        # A file with zero annotations is itself an error condition.
        if item['total_ant'] == 0:
            rpt['error_rel_filenames'].add(item['rel_filename'])
        # total_ant_type is a single-element list holding a {type: count}
        # map; accumulate each type under a dynamic 'total_ant_<type>' key.
        total_ant_type = item['total_ant_type'][0]
        for ant_type in total_ant_type.keys():
            key = 'total_ant_' + ant_type
            if key not in rpt:
                rpt[key] = 0
            rpt[key] += total_ant_type[ant_type]

    # BUGFIX: the original computed
    #   total_ant -= total_error_unlabeled_ant - total_error_ant
    # which, by precedence, *added* total_error_ant back in. The debug
    # message below states the intended formula
    # (total_ant - total_error_unlabeled_ant - total_error_ant), so both
    # error counts are now subtracted.
    rpt['total_ant'] -= (rpt['total_error_unlabeled_ant']
                         + rpt['total_error_ant'])

    rpt['total_unique_rel_filenames'] = len(rpt['unique_rel_filenames'])
    rpt['total_rel_filenames'] = len(rpt['all_rel_filenames'])
    log.debug("=> len(unique_rel_filenames): {}".format(
        len(rpt['unique_rel_filenames'])))
    log.debug("=> total_unique_rel_filenames: {}".format(
        rpt['total_unique_rel_filenames']))
    log.debug("total_rel_filenames: {}".format(rpt['total_rel_filenames']))
    log.debug(
        '* total_ant - total_error_unlabeled_ant - total_error_ant: {}'.format(
            rpt['total_ant']))
    log.debug('** total_img: {}'.format(rpt['total_img']))
    log.debug('total_error_ant: {}'.format(rpt['total_error_ant']))
    log.debug('total_error_img_notfound: {}'.format(
        rpt['total_error_img_notfound']))
    log.debug('total_error_img_reading: {}'.format(
        rpt['total_error_img_reading']))
    log.debug('total_error_unlabeled_ant: {}'.format(
        rpt['total_error_unlabeled_ant']))
    log.debug('total_lbl: {}'.format(rpt['total_lbl']))
    log.debug("---x---x---x---\n")
    return rpt
def get_annon_data_bk(cfg):
    """Fetch images, annotations, latest release info and label ids from DB.

    (Backup variant, per the ``_bk`` suffix — presumably superseded by a
    newer implementation; verify before reuse.)

    Returns a 4-tuple ``(images, annon, latest_release_info, lbl_ids)``
    where ``images`` is a numpy array of image documents (optionally
    filtered via ``aids_filter``), ``annon`` maps ant_id -> annotation
    document, ``latest_release_info`` is the newest 'annon' release
    document by created_on, and ``lbl_ids`` is the sorted list of label
    ids from CLASSINFO.
    """
    query_images = {}
    query_annotations = {}
    query_classinfo = {}
    filter_enable = cfg['AIDS_FILTER']['ENABLE']
    if filter_enable:
        # filter_by = cfg['AIDS_FILTER']['LABELS']
        # The 'BY' key selects which filter list to use (indirect lookup).
        filter_by = cfg['AIDS_FILTER'][ cfg['AIDS_FILTER']['BY'] ]
    # NOTE(review): filter_by is only bound when filter_enable is truthy;
    # the final `if filter_enable and filter_by` short-circuit keeps this
    # from raising NameError, but the coupling is fragile.
    DBCFG = cfg['DBCFG']
    mclient = MongoClient('mongodb://'+DBCFG['HOST']+':'+str(DBCFG['PORT']))
    db = mclient[DBCFG['DBNAME']]
    ## get IMAGES data
    tblname = annonutils.get_tblname('IMAGES')
    collection = db.get_collection(tblname)
    images = np.array(list(collection.find(query_images, {'_id':False})))
    log.info("len(images): {}".format(len(images)))
    # images = {item['img_id']:item for item in images}
    ## get ANNOTATIONS data
    tblname = annonutils.get_tblname('ANNOTATIONS')
    collection = db.get_collection(tblname)
    annotations = list(collection.find(query_annotations, {'_id':False}))
    log.info("len(annotations): {}".format(len(annotations)))
    # Index annotations by their ant_id for O(1) lookup by callers.
    annon = {item['ant_id']:item for item in annotations}
    ## get RELEASE data
    tblname = annonutils.get_tblname('RELEASE')
    collection = db.get_collection(tblname)
    release = list(collection.find({'rel_type':'annon'}, {'_id':False}))
    log.info("len(release): {}".format(len(release)))
    ## 'YYYY-MM-DD HH:mm:ss ZZ'
    # Newest release first; only the latest is returned.
    release.sort(key = lambda x: arrow.get(x['created_on'], common._date_format_).date(), reverse=True)
    latest_release_info = release[0]
    # wanted_keys = ['rel_id','timestamp','created_on']
    # release_info = {k: latest_release_info[k] for k in set(wanted_keys) & set(latest_release_info.keys())}
    ## get CLASSINFO (labels) data
    tblname = annonutils.get_tblname('CLASSINFO')
    collection = db.get_collection(tblname)
    classinfo = list(collection.find(query_classinfo, {'_id':False}))
    lbl_ids = []
    for item in classinfo:
        lbl_ids.append(item['lbl_id'])
    log.info('lbl_ids: {}'.format(lbl_ids))
    # Sorted label ids keep downstream label/index mapping deterministic.
    lbl_ids.sort()
    log.info('len(lbl_ids): {}'.format(len(lbl_ids)))
    log.info('lbl_ids: {}'.format(lbl_ids))
    mclient.close()
    if filter_enable and filter_by and len(filter_by)>0:
        images = aids_filter(cfg, images, filter_by)
    return images, annon, latest_release_info, lbl_ids