class DBHandler(object):
    def __init__(self, rootDir, fsmapFn,
                 db_host='localhost', db_port=27017,
                 xml_config=None):
        self.rootDir = os.path.realpath(os.path.abspath(rootDir))
        self.fsmapFn = fsmapFn
        self.db_host = db_host
        self.db_port = db_port

        self.parser = Parser(xml_config) if xml_config is not None else None
        self.extensions = ['.xml', '.jpg', '.tiff']

        # to ensure safe operation on fsmap
        self.fsmap_lock = threading.Lock()

        self.fsMap = self._load()
        self._traverse()
        self._save()  # for debugging...

        # lazy connection to MongoDB server
        # Must ensure mongod is running!
        self.client = pymongo.MongoClient(self.db_host, self.db_port)
        self.clientPool = {}

        # streaming queues
        self.fs_event_q = Queue()
        self.stream_q = Queue()

        # old map
        # This keeps the old fsmap information when the file system is changed
        # manually, e.g. a folder is moved or renamed.
        # If it is not an empty dictionary, there is a bug...
        self._old_fsmap = {}

    def __del__(self):
        for _, h in self.clientPool.items():
            h.close()
        self.client.close()

    def _load(self):
        if not os.path.exists(self.fsmapFn):
            return {}

        try:
            with open(self.fsmapFn) as f:
                data = json.load(f)
        except (FileNotFoundError, TypeError, json.decoder.JSONDecodeError):
            print('[WARN] Failed to load saved fsmap, {}!!!'.format(self.fsmapFn))
            print('[WARN] Previous fsmap will be ignored, if there is one.')
            return {}

        def __recursive_flatten(fsmap: dict, flattened: dict):
            # children are stored hierarchically on disk; flatten them into a
            # list of absolute paths and index every item by its path
            item = dict(fsmap)
            item['children'] = [
                __recursive_flatten(child, flattened)
                for child in item['children']
            ]
            flattened[item['path']] = item
            return item['path']

        t = {}
        for key, value in data.items():
            __recursive_flatten(value, t)
        return t

    def _save(self):
        def __convert_to_hierarchical_format(key: str, fsmap: dict):
            item = dict(fsmap[key])
            item['children'] = [
                __convert_to_hierarchical_format(c, fsmap)
                for c in item['children']
            ]
            return item

        t = {}
        p_keys = [key for key, value in self.fsMap.items()
                  if value['parent'] is None]
        for key in p_keys:
            t[key] = __convert_to_hierarchical_format(key, self.fsMap)

        with open(self.fsmapFn, 'w') as f:
            json.dump(t, f, indent=2, sort_keys=True)

    def _traverse(self, save_old=False):
        """Traverse the root directory"""
        fsmap = {}
        for dirpath, _, _ in os.walk(self.rootDir, followlinks=True):
            path = dirpath.replace(self.rootDir, '')
            tokens = path.split(os.sep)[1:]
            parent_path = os.path.join(self.rootDir, *tokens[:-1])
            real_path = os.path.realpath(dirpath)

            if len(path) == 0:
                name = dirpath
                parent = None
            else:
                name = os.path.basename(path)
                fsmap[parent_path]['children'].append(dirpath)
                parent = fsmap[parent_path]['path']

            fsmap[dirpath] = {
                'path': dirpath,        # absolute path to current directory
                'realpath': real_path,  # realpath for symlink
                'name': name,           # name of current directory for display
                'children': [],         # absolute paths of direct child directories
                'parent': parent,       # absolute path to direct parent directory
                'link': None,           # linked path

                # valid path flag
                # It turns into False if the given path no longer exists when
                # compared with the fsmap loaded from the file.
                'valid': True,

                # related database (db, collection)
                # `fixed` is set to True once a client sets the `db` field.
                # After that, the `db` field can only be modified manually via
                # the fsmap file, and such a modification requires re-running
                # the web server.
                'db': None,
                'fixed': False,         # can modify?
                # used for syncing
                'file': None,       # sample file name used to determine group name
                'sep': None,        # separator used to parse group name from the file
                'group': None,      # group name in this folder
                'last_sync': None,  # the last date and time sync was applied
            }

        # update for symlinks
        for key, value in fsmap.items():
            if key != value['realpath']:
                if value['realpath'] in fsmap:
                    fsmap[value['realpath']]['link'] = key
                    value['link'] = fsmap[value['realpath']]['path']

        # save unregistered fsmap items from the old one
        if save_old:
            for key, value in self.fsMap.items():
                if key not in fsmap:
                    self._old_fsmap[key] = dict(value)

        _keys_to_copy = ['valid', 'db', 'fixed', 'file', 'sep', 'group', 'last_sync']

        def __merge_fsmap(dstMap: dict, srcMap: dict):
            for _path, _srcItem in srcMap.items():
                if _path in dstMap:
                    # The parent must be identical, as the key is the absolute
                    # path. But the children could differ; for example, one
                    # might have deleted/moved/added sub-directories. We do
                    # not care about that here.
                    _dstItem = dstMap[_path]
                    for _k in _keys_to_copy:
                        _dstItem[_k] = _srcItem[_k]
                else:
                    # This branch can happen when one deletes/moves/adds
                    # sub-directories. Keep the item, so that one can fix it
                    # manually in the json file.
                    _srcItem['children'] = []
                    _srcItem['parent'] = None
                    _srcItem['valid'] = False
                    #_srcItem['inSync'] = False
                    dstMap[_path] = _srcItem

        __merge_fsmap(fsmap, self.fsMap)
        self.fsMap = fsmap

    def _update_fsmap(self, event_type, src_path, dst_path):
        """Invoked when the filesystem changes (directory changes only)"""
        with self.fsmap_lock:
            if event_type in ['created', 'deleted']:
                # on create and delete operations, refresh the entire fsmap
                self._traverse()
                self._save()
            elif event_type in ['moved'] and dst_path is not None:
                # a moved event covers both 'rename' and 'relocate a folder'
                cp_key = ['db', 'file', 'fixed', 'group', 'last_sync', 'sep']
                self._traverse(True)
                if src_path in self._old_fsmap and dst_path in self.fsMap:
                    old_item = self._old_fsmap[src_path]
                    new_item = self.fsMap[dst_path]
                    for k, v in old_item.items():
                        if k in cp_key:
                            new_item[k] = v
                    del self._old_fsmap[src_path]
                else:
                    print('Error in handling DirMovedEvent: ', src_path, dst_path)

    def _db_key(self, _db, _col, _fs):
        _key = '{:s}::{:s}::{:s}'.format(_db, _col, _fs)
        return _key

    def _db_key_list(self, path, recursive, isUnique=False):
        _key_list = []

        def __recursive_db(_path, fsmap):
            if _path not in fsmap:
                return
            _db = fsmap[_path]['db']
            if _db is None:
                return

            _key = self._db_key(_db[0], _db[1], _db[2])
            if not isUnique:
                _key_list.append((_path, _key))
            else:
                if _key not in _key_list:
                    _key_list.append(_key)

            if recursive:
                for _c_path in fsmap[_path]['children']:
                    __recursive_db(_c_path, fsmap)

        __recursive_db(path, self.fsMap)
        return _key_list

    def _get_db_handler(self, db_col_fs):
        _db, _col, _fs = db_col_fs
        _key = self._db_key(_db, _col, _fs)
        if _key in self.clientPool:
            return self.clientPool[_key]
        else:
            _h = MultiViewMongo(connection=self.client,
                                db_name=_db,
                                collection_name=_col,
                                fs_name=_fs)
            self.clientPool[_key] = _h
            return _h

    def _get_db_handler_by_key(self, key: str):
        if key in self.clientPool:
            return self.clientPool[key]
        else:
            tokens = key.split('::')
            _h = MultiViewMongo(connection=self.client,
                                db_name=tokens[0],
                                collection_name=tokens[1],
                                fs_name=tokens[2])
            self.clientPool[key] = _h
            return _h
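    # Note on the client pool (illustrative sketch, values hypothetical):
    # handlers are cached under the composite key '<db>::<collection>::<fs>'
    # built by _db_key(), so both lookup styles below return the same cached
    # MultiViewMongo handle:
    #
    #   h1 = self._get_db_handler(('mydb', 'mycol', 'fs'))
    #   h2 = self._get_db_handler_by_key('mydb::mycol::fs')
    #   assert h1 is h2  # same cached handler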
    def _update_file(self, event_type, src_path, dst_path):
        """Invoked when files change

        By watchdog:
        By syncer:
        """
        if self.parser is None:
            print('parser is not set.')
            return None

        if dst_path is None:
            _path = src_path
            path, filename = os.path.split(src_path)
        else:
            _path = dst_path
            path, filename = os.path.split(dst_path)

        if len(filename) == 0:
            print('failed to detect filename.')
            return None

        ext = os.path.splitext(filename)[1]
        if len(ext) == 0 or ext not in self.extensions:
            print('Unsupported extension type. {:s}'.format(ext))
            return None

        if path not in self.fsMap:
            print("Path is not in fsmap. {:s}".format(path))
            return None

        if self.fsMap[path]['db'] is None:
            print("DB is not set on this path. {:s}".format(path))
            return None

        if self.fsMap[path]['group'] is None:
            print("Group name is not set on this path. {:s}".format(path))
            return None

        db = self.fsMap[path]['db']
        group = self.fsMap[path]['group']

        if event_type in ['created', 'modified', 'syncing', 'moved']:
            doc = self.parser.run(_path, ext, group)
            if doc is None:
                return None

            h = self._get_db_handler(db)
            if h.save_one(doc, ext) == 0:
                return None

            if ext == '.xml':
                query = {"sample": group, "item": doc['item']}
                res = h.load(query=query, fields={}, getarrays=False)
                res = self.after_query(res)
                return json.dumps(res)
        elif event_type in ['deleted']:
            # currently we do not delete any documents in the db (should we?)
            pass
        else:
            # unknown event_type
            pass

        return None

    def _add_fs_event(self, what, event_type, src_path, dst_path):
        """Invoked by the observer and syncers"""
        self.fs_event_q.put((what, event_type, src_path, dst_path))

    def get_fsmap_as_list(self):
        """
        Used to return the latest file system information.

        Always scan the file system first to detect any changes made to it
        by someone else.
        """
        with self.fsmap_lock:
            self._traverse()
            fsmap_list = [[key, value] for key, value in self.fsMap.items()
                          if value['valid']]
            return fsmap_list

    def set_fsmap(self, fsmap_list):
        """Used to set the db config by a client"""
        with self.fsmap_lock:
            for path, value in fsmap_list:
                # path is not found
                # (can happen when the file system is changed manually)
                if path not in self.fsMap:
                    continue

                # db is already set by another client; ignore this.
                # Only an administrator can change it manually.
                if self.fsMap[path]['fixed']:
                    continue

                # check the db config the client sent
                if value['db'] is None:
                    continue  # db is not set
                if len(value['db']) != 3:
                    continue  # must be a 3-element array

                new_db = value['db'][0]
                new_col = value['db'][1]
                if len(new_db) == 0 or len(new_col) == 0:
                    continue  # incomplete setting
                if new_db == 'null' or new_col == 'null':
                    continue  # incomplete setting

                # update db config
                item = self.fsMap[path]
                item['db'] = [new_db, new_col, 'fs']
                item['fixed'] = True

            self._save()
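    # Illustrative sketch (hypothetical paths and names) of the payload
    # set_fsmap() expects from a client: a list of [path, item] pairs, where
    # item['db'] holds [database, collection] names. Only value['db'] is
    # read; any other fields in the item are ignored, and the GridFS name is
    # always forced to 'fs':
    #
    #   handler.set_fsmap([
    #       ['/data/experiments/saxs', {'db': ['scattering', 'saxs', 'fs']}],
    #   ])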
    # def get_sync_samples(self, path, recursive):
    #     """
    #     This is called to initiate a syncing operation.
    #
    #     Args:
    #         path:
    #         recursive:
    #
    #     Returns:
    #
    #     """
    #     if path not in self.fsMap: return []
    #     if not os.path.exists(path): return []
    #
    #     sample_files = {}
    #     for dirpath, _, files in os.walk(path, followlinks=True):
    #         for f in files:
    #             name, ext = os.path.splitext(f)
    #             if ext in self.extensions:
    #                 sample_files[dirpath] = name
    #                 break
    #         if not recursive: break
    #     return sample_files

    # def set_sync_info(self, info: dict):
    #     """update `inSync` and `sep` fields in fsmap"""
    #     with self.fsmap_lock:
    #         responses = {}
    #         for path, sep in info.items():
    #             resp = {
    #                 'valid': Syncer.CAN_SYNC
    #             }
    #             if path in self.fsMap:
    #                 item = self.fsMap[path]
    #                 if item['inSync']:
    #                     resp['valid'] = Syncer.CANNOT_SYNC
    #                 elif item['db'] is None or len(item['db']) != 3:
    #                     resp['valid'] = Syncer.NO_DB
    #                 else:
    #                     item['inSync'] = True
    #                     item['sep'] = sep
    #             else:
    #                 resp['valid'] = Syncer.NO_PATH
    #             responses[path] = resp
    #
    #         self._save()
    #
    #     return responses

    # def run_syncer(self, resp: dict):
    #     """run syncer; some information will be added to resp"""
    #     files_to_sync = []
    #     for path, info in resp.items():
    #         if info['valid']:
    #             item = {
    #                 'path': path,
    #                 'files': [],
    #                 'client': self.get_client(self.get_db(path))
    #             }
    #             for _, _, files in os.walk(path):
    #                 item['files'] = [f for f in files
    #                                  if os.path.splitext(f)[1] in self.extensions]
    #                 break
    #             files_to_sync.append(item)
    #             info['total'] = len(item['files'])
    #         else:
    #             info['total'] = 0
    #         info['progressed'] = 0
    #
    #     # create syncer
    #     syncer_id = Syncer.generate_syncer_id()
    #     #syncer = Syncer(items_to_sync=files_to_sync)
    #
    #     # update pool
    #     #self.syncerPool[syncer_id] = syncer
    #
    #     # run syncer
    #     #syncer.start()
    #
    #     return syncer_id, resp

    # def get_client(self, db_collection_fs):
    #     if db_collection_fs is None or len(db_collection_fs) != 3:
    #         return None
    #
    #     db = db_collection_fs[0]
    #     col = db_collection_fs[1]
    #     fs = db_collection_fs[2]
    #     key = '{}:{}:{}'.format(db, col, fs)
    #     if key in self.clientPool:
    #         h = self.clientPool[key]
    #     else:
    #         h = MultiViewMongo(
    #             connection=self.client,
    #             db_name=db,
    #             collection_name=col,
    #             fs_name=fs
    #         )
    #         self.clientPool[key] = h
    #     return h

    # def set_db(self, path, db, col):
    #     if path not in self.fsMap:
    #         return False
    #
    #     def __recursive_update(key: str, fsmap: dict):
    #         item = fsmap[key]
    #         if item['db'] is None: item['db'] = [db, col, 'fs']
    #         for child in item['children']:
    #             __recursive_update(child, fsmap)
    #
    #     # update db setting recursively
    #     # If a path was already set before (maybe by another client), it is
    #     # not modified, so the given path may not end up set the way the
    #     # client wants.
    #     with self.fsmap_lock:
    #         __recursive_update(path, self.fsMap)
    #         self._save()
    #
    #     return True

    # def get_db(self, path):
    #     db = None
    #     with self.fsmap_lock:
    #         if path in self.fsMap:
    #             db = self.fsMap[path]['db']
    #     return db

    def after_query(self, res):
        """Post-processor for queried results"""
        if not isinstance(res, list):
            res = [res]
        res = [replace_objid_to_str(doc) for doc in res]
        res = [flatten_dict(doc) for doc in res]
        # for doc in res:
        #     doc['sample'] = '[{:s}][{:s}]{:s}'.format(db, col, doc['sample'])
        #     doc['_id'] = '[{:s}][{:s}]{:s}'.format(db, col, doc['_id'])
        return res

    def get_samplelist(self, path, recursive):
        if path not in self.fsMap:
            return []

        samplelist = {}
        db_key_list = self._db_key_list(path, recursive)
        _db_list = self.client.list_database_names()
        for _path, _key in db_key_list:
            _db, _col, _fs = _key.split("::")
            if _db not in _db_list:
                continue
            _col_list = self.client[_db].list_collection_names()
            if _col not in _col_list:
                continue

            h = self._get_db_handler_by_key(_key)
            pipeline = [
                {"$match": {"path": _path}},
                {"$match": {"sample": {"$exists": True, "$ne": None}}},
                {"$group": {"_id": "$sample", "count": {"$sum": 1}}}
            ]
            res = list(h.collection.aggregate(pipeline))

            for r in res:
                _id = r['_id']
                _count = r['count']
                if _id in samplelist:
                    samplelist[_id] += _count
                else:
                    samplelist[_id] = _count
        return samplelist

    def get_samples(self, names, path, recursive):
        if path not in self.fsMap:
            return {}

        sampleData = {}
        db_key_list = self._db_key_list(path, recursive, False)
        _db_list = self.client.list_database_names()
        for _path, _key in db_key_list:
            _db, _col, _fs = _key.split("::")
            if _db not in _db_list:
                continue
            _col_list = self.client[_db].list_collection_names()
            if _col not in _col_list:
                continue

            h = self._get_db_handler_by_key(_key)
            for name in names:
                query = {"sample": name, "path": _path}
                res = h.load(query=query, fields={}, getarrays=False)
                if res is None:
                    continue
                res = self.after_query(res)
                if name in sampleData:
                    sampleData[name].extend(res)  # after_query returns a list
                else:
                    sampleData[name] = res
        return sampleData

    def get_tiff(self, id, path):
        if path not in self.fsMap:
            return []
        if self.fsMap[path]['db'] is None:
            return []

        db = self.fsMap[path]['db']
        h = self._get_db_handler(db)

        try:
            _id = ObjectId(id)
        except InvalidId:
            return []

        query = {'_id': _id, 'tiff': {'$exists': True}}
        fields = {'tiff': 1, '_id': 0}
        res = h.load(query, fields, getarrays=True)
        if res is None:
            return []

        data = res['tiff']['data']
        res['tiff']['data'] = data.tolist()
        return res['tiff']
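# ---------------------------------------------------------------------------
# Minimal usage sketch (paths and config below are hypothetical, not part of
# the actual deployment). A web server would construct one DBHandler and
# forward filesystem events to it, e.g. from a watchdog observer:
#
#   handler = DBHandler(
#       rootDir='/data/experiments',   # hypothetical root directory
#       fsmapFn='fsmap.json',          # persisted fsmap file
#       db_host='localhost',
#       db_port=27017,
#       xml_config='xml_config.xml',   # parser configuration
#   )
#   # directory-level event (rename/relocate):
#   handler._update_fsmap('moved', '/data/experiments/old', '/data/experiments/new')
#   # file-level event queued for processing:
#   handler._add_fs_event('watchdog', 'created',
#                         '/data/experiments/new/run_0001_saxs.xml', None)
# ---------------------------------------------------------------------------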
]
test_files = [
    'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.xml',
    'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.jpg',
    'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.tiff'
]

print('Add data to DB')
for dir, file in zip(data_dir, test_files):
    print(file)
    ext = os.path.splitext(file)[1][1:]
    doc = parser.run(os.path.join(dir, file),
                     kind=ext,
                     sample_name='test_sample',
                     project_name='test_project')
    if ext == 'xml':
        save_document(colCursor, doc)
    else:
        save_image_document(colCursor, fsCursor, doc, ext)

print('Retrieve xml data')
xml = load_xml(colCursor, 'test_sample', 'test_project')
pp.pprint(xml)

print('Retrieve jpg data')
jpg = load_image(colCursor, fsCursor, "5bad2bdcd511eb8fef5ccd3f", 'jpg')
pp.pprint(jpg)