Beispiel #1
0
class DBHandler(object):
    def __init__(self,
                 rootDir,
                 fsmapFn,
                 db_host='localhost',
                 db_port=27017,
                 xml_config=None):
        """Build the handler: load and refresh the fsmap, then set up DB access.

        Args:
            rootDir: directory to scan; stored as an absolute, symlink-resolved
                path.
            fsmapFn: path of the JSON file the fsmap is loaded from and saved
                to.
            db_host: host passed to ``pymongo.MongoClient``.
            db_port: port passed to ``pymongo.MongoClient``.
            xml_config: argument for ``Parser``; when None no parser is created
                and ``self.parser`` is None.
        """
        self.rootDir = os.path.realpath(os.path.abspath(rootDir))
        self.fsmapFn = fsmapFn
        self.db_host, self.db_port = db_host, db_port

        if xml_config is None:
            self.parser = None
        else:
            self.parser = Parser(xml_config)

        # File extensions accepted by _update_file.
        self.extensions = ['.xml', '.jpg', '.tiff']

        # Guards every read-modify-write cycle on the fsmap.
        self.fsmap_lock = threading.Lock()

        # Load the previous map, merge it with the current directory tree and
        # write the result back (the original author notes the save is for
        # debugging).
        self.fsMap = self._load()
        self._traverse()
        self._save()

        # Per the original author's note the connection is lazy, and mongod
        # must already be running.
        self.client = pymongo.MongoClient(self.db_host, self.db_port)
        self.clientPool = {}

        # Streaming queues.
        self.fs_event_q = Queue()
        self.stream_q = Queue()

        # Entries that disappeared from the file system (folder moved, renamed,
        # ...) are parked here by _traverse(save_old=True). The original author
        # notes that a non-empty dict here indicates a bug.
        self._old_fsmap = {}

    def __del__(self):
        for _, h in self.clientPool.items():
            h.close()
        self.client.close()

    def _load(self):
        if not os.path.exists(self.fsmapFn): return {}

        try:
            with open(self.fsmapFn) as f:
                data = json.load(f)
        except (FileNotFoundError, TypeError, json.decoder.JSONDecodeError):
            print('[WARN] Failed to load saved fsmap, {}!!!'.format(
                self.fsmapFn))
            print('[WARN] Previous fsmap will be ignored, if there is.')
            return {}

        def __recursive_flatten(fsmap: dict, flattened: dict):
            item = dict(fsmap)
            item['children'] = [
                __recursive_flatten(child, flattened)
                for child in item['children']
            ]
            flattened[item['path']] = item

        t = {}
        for key, value in data.items():
            __recursive_flatten(value, t)
        return t

    def _save(self):
        def __convert_to_hierarchical_format(key: str, fsmap: dict):
            item = dict(fsmap[key])
            item['children'] = [
                __convert_to_hierarchical_format(c, fsmap)
                for c in item['children']
            ]
            return item

        t = {}
        p_keys = [
            key for key, value in self.fsMap.items() if value['parent'] is None
        ]

        for key in p_keys:
            t[key] = __convert_to_hierarchical_format(key, self.fsMap)
        with open(self.fsmapFn, 'w') as f:
            json.dump(t, f, indent=2, sort_keys=True)

    def _traverse(self, save_old=False):
        """Traverse root directory"""
        fsmap = {}
        for dirpath, _, _ in os.walk(self.rootDir, followlinks=True):
            path = dirpath.replace(self.rootDir, '')
            tokens = path.split(os.sep)[1:]
            parent_path = os.path.join(self.rootDir, *tokens[:-1])

            real_path = os.path.realpath(dirpath)
            if len(path) == 0:
                name = dirpath
                parent = None
            else:
                name = os.path.basename(path)
                fsmap[parent_path]['children'].append(dirpath)
                parent = fsmap[parent_path]['path']

            fsmap[dirpath] = {
                'path': dirpath,  # absolute path to current directory
                'realpath': real_path,  # realpath for symlink
                'name': name,  # name of current directory for display
                'children':
                [],  # list of absolute pathes of direct children directories
                'parent': parent,  # absolute path to direct parent directory
                'link': None,  # linked path

                # valid path flag
                # It will turn into False, if the given path doesn't exist by
                #  comparing with fsmap in the file.
                'valid': True,  # valid path flag

                # This set to Ture, once a client set the `db` field.
                # Then, `db` filed can be modified only manually via fsmap file.
                # Such modification requires to re-run the web server.
                'db': None,  # related database (db, collection)
                'fixed': False,  # can modify?

                # used for syncing
                'file': None,  # sample file name used to determine group name
                'sep':
                None,  # separator used to parse group name from the file
                'group': None,  # group name in this folder
                'last_sync': None,  # the last date and time sync is applied
            }

        # update for symlink
        for key, value in fsmap.items():
            if not (key == value['realpath']):
                if value['realpath'] in fsmap:
                    fsmap[value['realpath']]['link'] = key
                    value['link'] = fsmap[value['realpath']]['path']

        # save unregistered fsmap from old one
        if save_old:
            for key, value in self.fsMap.items():
                if key not in fsmap:
                    self._old_fsmap[key] = dict(value)

        _keys_to_copy = [
            'valid', 'db', 'fixed', 'file', 'sep', 'group', 'last_sync'
        ]

        def __merge_fsmap(dstMap: dict, srcMap: dict):
            for _path, _srcItem in srcMap.items():
                if _path in dstMap:
                    # Is parent same? yes, it must be same as key is the absolute path.
                    # But children could be different. For example, one might delete/move/add
                    # sub-directories. But, we do not care, here.
                    _dstItem = dstMap[_path]
                    for _k in _keys_to_copy:
                        _dstItem[_k] = _srcItem[_k]
                else:
                    # This branch can happen when one delete/move/add subdirectories.
                    # Keep it, so that one can fix it manually in the json file.
                    _srcItem['children'] = []
                    _srcItem['parent'] = None
                    _srcItem['valid'] = False
                    #srcItem['inSync'] = False
                    dstMap[key] = _srcItem

        __merge_fsmap(fsmap, self.fsMap)
        self.fsMap = fsmap

    def _update_fsmap(self, event_type, src_path, dst_path):
        """Invoked when filesystem changes (only for directory changes)"""
        with self.fsmap_lock:
            if event_type in ['created', 'deleted']:
                # on create and delete operation, refresh entire fsmap
                self._traverse()
                self._save()
            elif event_type in ['moved'] and dst_path is not None:
                # moved event includes 'rename' and 'relocate a folder'
                cp_key = ['db', 'file', 'fixed', 'group', 'last_sync', 'sep']

                self._traverse(True)
                if src_path in self._old_fsmap and dst_path in self.fsMap:
                    old_item = self._old_fsmap[src_path]
                    new_item = self.fsMap[dst_path]
                    for k, v in old_item.items():
                        if k in cp_key:
                            new_item[k] = v
                    del self._old_fsmap[src_path]
                else:
                    print('Error in handling DirMovedEvent: ', src_path,
                          dst_path)

    def _db_key(self, _db, _col, _fs):
        _key = '{:s}::{:s}::{:s}'.format(_db, _col, _fs)
        return _key

    def _db_key_list(self, path, recursive, isUnique=False):
        _key_list = []

        def __recursive_db(_path, fsmap):
            if _path not in fsmap: return

            _db = fsmap[_path]['db']
            if _db is None: return

            _key = self._db_key(_db[0], _db[1], _db[2])
            if not isUnique:
                _key_list.append((_path, _key))
            else:
                if _key not in _key_list:
                    _key_list.append(_key)

            if recursive:
                for _c_path in fsmap[_path]['children']:
                    __recursive_db(_c_path, fsmap)

        __recursive_db(path, self.fsMap)
        return _key_list

    def _get_db_handler(self, db_col_fs):
        _db, _col, _fs = db_col_fs
        _key = self._db_key(_db, _col, _fs)
        if _key in self.clientPool:
            return self.clientPool[_key]
        else:
            _h = MultiViewMongo(connection=self.client,
                                db_name=_db,
                                collection_name=_col,
                                fs_name=_fs)
            self.clientPool[_key] = _h
            return _h

    def _get_db_handler_by_key(self, key: str):
        if key in self.clientPool:
            return self.clientPool[key]
        else:
            tokens = key.split('::')
            _h = MultiViewMongo(connection=self.client,
                                db_name=tokens[0],
                                collection_name=tokens[1],
                                fs_name=tokens[2])
            self.clientPool[key] = _h
            return _h

    def _update_file(self, event_type, src_path, dst_path):
        """Invoked when files change
            By watchdog:
            By syncer:
        """
        if self.parser is None:
            print('parser is not set.')
            return None
        if dst_path is None:
            _path = src_path
            path, filename = os.path.split(src_path)
        else:
            _path = dst_path
            path, filename = os.path.split(dst_path)

        if len(filename) == 0:
            print('fail to detect filename.')
            return None

        ext = os.path.splitext(filename)[1]
        if len(ext) == 0 or ext not in self.extensions:
            print('Unsupported extension type. {:s}'.format(ext))
            return None

        if path not in self.fsMap:
            print("Path is not in fsmap. {:s}".format(path))
            return None
        if self.fsMap[path]['db'] is None:
            print("DB is not set on this path. {:s}".format(path))
            return None
        if self.fsMap[path]['group'] is None:
            print("Group name is not set to this path. {:s}".format(path))
            return None
        db = self.fsMap[path]['db']
        group = self.fsMap[path]['group']

        if event_type in ['created', 'modified', 'syncing', 'moved']:
            doc = self.parser.run(_path, ext, group)
            if doc is None:
                return None

            h = self._get_db_handler(db)
            if h.save_one(doc, ext) == 0:
                return None

            if ext == '.xml':
                query = {"sample": group, "item": doc['item']}
                res = h.load(query=query, fields={}, getarrays=False)
                res = self.after_query(res)
                return json.dumps(res)

        elif event_type in ['deleted']:
            # currently we do not delete any document in the db (should we?)
            pass
        else:
            # unknown event_type
            pass

        return None

    def _add_fs_event(self, what, event_type, src_path, dst_path):
        """Invoked by observer and syncers"""
        self.fs_event_q.put((what, event_type, src_path, dst_path))

    def get_fsmap_as_list(self):
        """
        Used to return the lastes file system information.
        Always, first scan file system itself to detect any changes made in
        the file system by someone else.
        """
        with self.fsmap_lock:
            self._traverse()
            fsmap_list = [[key, value] for key, value in self.fsMap.items()
                          if value['valid']]
        return fsmap_list

    def set_fsmap(self, fsmap_list):
        """Used to set db config by a client"""
        with self.fsmap_lock:
            for path, value in fsmap_list:
                # path is not found
                # (can happen when file system is manually changed)
                if path not in self.fsMap: continue

                # db is already set by other clients, ignore this.
                # Only administrator can change this manually.
                if self.fsMap[path]['fixed']: continue

                # check db config a client set
                if value['db'] is None: continue  # db is not set
                if len(value['db']) != 3: continue  # must be 3-D array

                new_db = value['db'][0]
                new_col = value['db'][1]
                if len(new_db) == 0 or len(new_col) == 0:
                    continue  # in-complete setting
                if new_db == 'null' or new_col == 'null':
                    continue  # in-complete setting

                # update db config
                item = self.fsMap[path]
                item['db'] = [new_db, new_col, 'fs']
                item['fixed'] = True

            self._save()

    # def get_sync_samples(self, path, recursive):
    #     """
    #     This is called to initiate syncing operation.
    #     Args:
    #         path:
    #         recursive:
    #
    #     Returns:
    #
    #     """
    #     if path not in self.fsMap: return []
    #     if not os.path.exists(path): return []
    #
    #     sample_files = {}
    #     for dirpath, _, files in os.walk(path, followlinks=True):
    #         for f in files:
    #             name, ext = os.path.splitext(f)
    #             if ext in self.extensions:
    #                 sample_files[dirpath] = name
    #                 break
    #
    #         if not recursive: break
    #     return sample_files

    # def set_sync_info(self, info:dict):
    #     """update `inSync` and `sep` fields in fsmap"""
    #
    #     with self.fsmap_lock:
    #         responses = {}
    #         for path, sep in info.items():
    #             resp = {
    #                 'valid': Syncer.CAN_SYNC
    #             }
    #             if path in self.fsMap:
    #                 item = self.fsMap[path]
    #                 if item['inSync']:
    #                     resp['valid'] = Syncer.CANNOT_SYNC
    #                 elif item['db'] is None or len(item['db']) != 3:
    #                     resp['valid'] = Syncer.NO_DB
    #                 else:
    #                     item['inSync'] = True
    #                     item['sep'] = sep
    #             else:
    #                 resp['valid'] = Syncer.NO_PATH
    #             responses[path] = resp
    #
    #         self._save()
    #
    #     return responses

    # def run_syncer(self, resp:dict):
    #     """run syncer, some information will be added to resp"""
    #
    #     files_to_sync = []
    #     for path, info in resp.items():
    #         if info['valid']:
    #             item = {
    #                 'path': path,
    #                 'files': [],
    #                 'client': self.get_client(self.get_db(path))
    #             }
    #             for _, _, files in os.walk(path):
    #                 item['files'] = [f for f in files
    #                                  if os.path.splitext(f)[1] in self.extensions]
    #                 break
    #             files_to_sync.append(item)
    #             info['total'] = len(item['files'])
    #         else:
    #             info['total'] = 0
    #         info['progressed'] = 0
    #
    #     # create syncer
    #     syncer_id = Syncer.generate_syncer_id()
    #     #syncer = Syncer(items_to_sync=files_to_sync)
    #
    #     # update pool
    #     #self.syncerPool[syncer_id] = syncer
    #
    #     # run syncer
    #     #syncer.start()
    #
    #     return syncer_id, resp

    # def get_client(self, db_collection_fs):
    #     if db_collection_fs is None or len(db_collection_fs) != 3:
    #         return None
    #
    #     db = db_collection_fs[0]
    #     col = db_collection_fs[1]
    #     fs = db_collection_fs[2]
    #     key = '{}:{}:{}'.format(db, col, fs)
    #     if key in self.clientPool:
    #         h = self.clientPool[key]
    #     else:
    #         h = MultiViewMongo(
    #             connection=self.client,
    #             db_name=db,
    #             collection_name=col,
    #             fs_name=fs
    #         )
    #         self.clientPool[key] = h
    #     return h

    # def set_db(self, path, db, col):
    #     if path not in self.fsMap:
    #         return False
    #
    #     def __recursive_update(key: str, fsmap: dict):
    #         item = fsmap[key]
    #         if item['db'] is None: item['db'] = [db, col, 'fs']
    #         for child in item['children']:
    #             __recursive_update(child, fsmap)
    #
    #     # update db setting recursively
    #     # If a path is already set before (or maybe by other client),
    #     # it didn't modify it. Given path may be not set as a client wants.
    #     with self.fsmap_lock:
    #         __recursive_update(path, self.fsMap)
    #         self._save()
    #
    #     return True

    # def get_db(self, path):
    #     db = None
    #     with self.fsmap_lock:
    #         if path in self.fsMap:
    #             db = self.fsMap[path]['db']
    #     return db

    def after_query(self, res):
        """Post processor on queried results"""
        if not isinstance(res, list):
            res = [res]

        res = [replace_objid_to_str(doc) for doc in res]
        res = [flatten_dict(doc) for doc in res]
        # for doc in res:
        #     doc['sample'] = '[{:s}][{:s}]{:s}'.format(db, col, doc['sample'])
        #     doc['_id'] = '[{:s}][{:s}]{:s}'.format(db, col, doc['_id'])

        return res

    def get_samplelist(self, path, recursive):
        if path not in self.fsMap:
            return []

        samplelist = {}

        db_key_list = self._db_key_list(path, recursive)
        _db_list = self.client.list_database_names()
        for _path, _key in db_key_list:
            _db, _col, _fs = _key.split("::")

            if _db not in _db_list:
                continue

            _col_list = self.client[_db].collection_names()
            if _col not in _col_list:
                continue

            h = self._get_db_handler_by_key(_key)
            pipeline = [{
                "$match": {
                    "path": _path
                }
            }, {
                "$match": {
                    "sample": {
                        "$exists": True,
                        "$ne": None
                    }
                }
            }, {
                "$group": {
                    "_id": "$sample",
                    "count": {
                        "$sum": 1
                    }
                }
            }]
            res = list(h.collection.aggregate(pipeline))

            for r in res:
                _id = r['_id']
                _count = r['count']

                if _id in samplelist:
                    samplelist[_id] += _count
                else:
                    samplelist[_id] = _count

        return samplelist

    def get_samples(self, names, path, recursive):
        if path not in self.fsMap:
            return {}

        sampleData = {}
        db_key_list = self._db_key_list(path, recursive, False)
        _db_list = self.client.list_database_names()
        for _path, _key in db_key_list:
            _db, _col, _fs = _key.split("::")

            if _db not in _db_list:
                continue

            _col_list = self.client[_db].collection_names()
            if _col not in _col_list:
                continue

            h = self._get_db_handler_by_key(_key)
            for name in names:
                query = {"sample": name, "path": _path}
                res = h.load(query=query, fields={}, getarrays=False)

                if res is None:
                    continue

                res = self.after_query(res)

                if name in sampleData:
                    sampleData[name].append(res)
                else:
                    sampleData[name] = res
        return sampleData

    def get_tiff(self, id, path):
        if path not in self.fsMap:
            return []

        if self.fsMap[path]['db'] is None:
            return []

        db = self.fsMap[path]['db']
        h = self._get_db_handler(db)

        try:
            _id = ObjectId(id)
        except InvalidId:
            return []

        query = {'_id': _id, 'tiff': {'$exists': True}}
        fields = {'tiff': 1, '_id': 0}
        res = h.load(query, fields, getarrays=True)

        if res is None:
            return []

        data = res['tiff']['data']
        res['tiff']['data'] = data.tolist()
        return res['tiff']
Beispiel #2
0
    ]

    test_files = [
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.xml',
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.jpg',
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.tiff'
    ]

    print('Add data to DB')
    for dir, file in zip(data_dir, test_files):
        print(file)

        ext = os.path.splitext(file)[1][1:]

        doc = parser.run(os.path.join(dir, file),
                         kind=ext,
                         sample_name='test_sample',
                         project_name='test_project')

        if ext == 'xml':
            save_document(colCursor, doc)
        else:
            save_image_document(colCursor, fsCursor, doc, ext)

    print('Retrieve xml data')
    xml = load_xml(colCursor, 'test_sample', 'test_project')
    pp.pprint(xml)

    print('Retrieve jpg data')
    jpg = load_image(colCursor, fsCursor, "5bad2bdcd511eb8fef5ccd3f", 'jpg')
    pp.pprint(jpg)