class VideoDataset(Dataset):
    """
    Represents the video dataset (readonly).

    Frames are stored in gzipped data batches under ``root``. On access a
    batch is decompressed into ``root/tmp``, verified against its expected
    checksum, and memory-mapped; a bounded number of memory maps and of
    decompressed files are kept around.
    """

    def __init__(self, root, transform=None, max_mmap=1, max_gzcache=3):
        """
        :param root: the root directory of the dataset
        :type root: str
        :param transform: the transformations to be performed on loaded data
        :param max_mmap: the maximum number of memory map to keep
        :type max_mmap: int
        :param max_gzcache: the maximum number of extracted memory map files
               to keep on disk
        :type max_gzcache: int
        """
        self.root = root
        jsonfile = get_dset_filename_by_ext(root, '.json')
        with open(jsonfile) as infile:
            self.metainfo = json.load(infile)
        # np.sum of an empty list is a float, which would break len();
        # store a plain int instead
        self.total_frames = int(np.sum(self.metainfo['lens']))
        self.lens_cumsum = list(accumulate(self.metainfo['lens']))
        shape = self.metainfo['resolution'] + [self.metainfo['channels']]
        self.frame_shape = tuple(shape)

        # if self.validated_batches[i] is False, then batch i hasn't been
        # validated
        self.validated_batches = [False] * len(self.metainfo['lens'])
        checksumfile = get_dset_filename_by_ext(root, '.' + HASH_ALGORITHM)
        self.expected_hexes = parse_checksum_file(checksumfile)

        self.root_tmp = os.path.join(root, 'tmp')
        # several processes/instances may share the same root; exist_ok
        # avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(self.root_tmp, exist_ok=True)

        self.transform = transform
        max_mmap = max(1, max_mmap)
        # at least as many decompressed files as memory maps must be kept
        self.max_gzcache = max(max_mmap, max_gzcache)
        self.mmap_cache = LRUCache(maxsize=max_mmap)
        self.gz_cache = LRUCache(maxsize=self.max_gzcache)

        # fine granularity lock for each data batch
        lockfile_tmpl = get_dset_filename_by_ext(root, '.access{}.lock')
        # note that I use the absolute to construct the file lock, so that the
        # lock will be shared by not only different processes, but also several
        # instances of this class, as long as they have been assigned the same
        # root
        self.access_locks = [
            FileLock(lockfile_tmpl.format(bid))
            for bid in range(len(self.metainfo['lens']))
        ]

        logger = logging.getLogger(_l(__name__, self, '__init__'))
        logger.info('Instantiated: root={}'.format(self.root))

    def __len__(self):
        """
        :return: the number of frames in the video
        :rtype: int
        """
        return self.total_frames

    def __getitem__(self, frame_id):
        """
        Returns a frame of dimension HWC upon the request of a frame ID.
        Note that when calling this method without using contiguous or nearly
        contiguous indices, the efficiency will be very low.

        :param frame_id: the frame index
        :return: the frame in numpy array of dimension HWC
        :rtype: np.ndarray
        :raise IndexError: if ``frame_id`` is outside ``[0, len(self))``
        :raise RuntimeError: if the batch cannot be decompressed or still
               fails the integrity check after one retry
        """
        logger = logging.getLogger(_l(__name__, self, '__getitem__'))
        if frame_id < 0 or frame_id >= len(self):
            raise IndexError('Invalid index: {}'.format(frame_id))
        batch_id, rel_frame_id = self.locate_batch(frame_id)

        logger.debug('Waiting for lock ID {}'.format(batch_id))
        with self.access_locks[batch_id]:
            if batch_id not in self.mmap_cache:
                batchf = self._prepare_batch_file(batch_id, logger)
                # till here file "batchf" has been available
                self.gz_cache[batchf] = True
                shape = (self.metainfo['lens'][batch_id],) + self.frame_shape
                logger.debug('keys before mmap cache adjustment: {}'.format(
                    list(self.mmap_cache.keys())))
                self.mmap_cache[batch_id] = np.memmap(
                    str(batchf), mode='r', dtype=self.metainfo['dtype'],
                    shape=shape)
                logger.debug('keys after mmap cache adjustment: {}'.format(
                    list(self.mmap_cache.keys())))
            # copy so the returned frame does not reference the memory map
            frame = np.copy(self.mmap_cache[batch_id][rel_frame_id])

        if self.transform is not None:
            frame = self.transform(frame)
        # run outside the batch lock: cleanup acquires other batches' locks,
        # and holding one lock while waiting for another invites deadlock
        self.cleanup_unused_mmapfiles()
        return frame

    def _extract_batch(self, batch_id, batchf):
        """
        Decompress the gzipped batch ``batch_id`` into ``batchf``.

        :raise RuntimeError: if ``batchf`` does not exist afterwards
        """
        extract_gzip(self.batch_filename_by_id(batch_id, gzipped=True),
                     batchf)
        # explicit check rather than ``assert`` so it survives ``python -O``
        if not os.path.isfile(batchf):
            raise RuntimeError(
                '"{}" not found after decompressed'.format(batchf))

    def _prepare_batch_file(self, batch_id, logger):
        """
        Make sure the decompressed file of ``batch_id`` exists and has been
        validated once by this instance; return its filename.

        The caller must hold ``self.access_locks[batch_id]``.

        :raise RuntimeError: if the file is still corrupted after one retry
        """
        batchf = self.batch_filename_by_id(batch_id)
        if not os.path.isfile(batchf):
            logger.info('Decompressing "{}"'.format(batchf))
            self._extract_batch(batch_id, batchf)

        if not self.validated_batches[batch_id]:
            expected_hex = self.expected_hexes[batch_id]
            if not check_file_integrity(batchf, expected_hex):
                logger.warning(
                    'File integrity failed at "{}"; retrying'.format(batchf))
                # probably there's error with read last time; attempt
                # to decompress again for once
                os.remove(batchf)
                self._extract_batch(batch_id, batchf)
                if not check_file_integrity(batchf, expected_hex):
                    logger.error('File integrity failed at "{}"; '
                                 'RuntimeError raised'.format(batchf))
                    raise RuntimeError(
                        'Data batch {} corrupted'.format(batch_id))
            self.validated_batches[batch_id] = True
            logger.info(
                'File integrity check completed for batch {}'.format(
                    batch_id))
        return batchf

    def __iter__(self):
        """Yield every frame in index order."""
        for i in range(len(self)):
            yield self[i]

    def cleanup_unused_mmapfiles(self):
        """
        Remove decompressed batch files that are no longer tracked by
        ``self.gz_cache`` while the temporary directory holds more than
        ``self.max_gzcache`` entries.
        """
        logger = logging.getLogger(
            _l(__name__, self, 'cleanup_unused_mmapfiles'))
        for filename in os.listdir(self.root_tmp):
            matched = DATABATCH_FILENAME_PAT.match(filename)
            if not matched:
                continue
            batch_id = int(matched.group(1))
            with self.access_locks[batch_id]:
                batchf = os.path.join(self.root_tmp, filename)
                # Since len(self.gz_cache) >= len(self.mmap_cache) and they
                # are updated together, the latter must be a subset of the
                # former.
                if batchf in self.gz_cache:
                    continue
                # re-list on every iteration: earlier removals (by us or by
                # another process) change the count
                if len(os.listdir(self.root_tmp)) <= self.max_gzcache:
                    continue
                try:
                    os.remove(batchf)
                except OSError:
                    # due to concurrency, the file may have already been
                    # removed; due to the lock, however, no process will
                    # try to remove a file when another process is
                    # removing exactly the same file
                    pass
                else:
                    logger.info(
                        'Decompressed batch "{}" removed'.format(batchf))

    def cleanup_all_mmapfiles(self):
        """
        Be sure to call this function only if there's no opened memory-mapped
        file. Usually this function is unnecessary unless the user want to
        save some disk space.
        """
        logger = logging.getLogger(_l(__name__, self, 'cleanup_all_mmapfiles'))
        if os.path.isdir(self.root_tmp):
            shutil.rmtree(self.root_tmp)
        # exist_ok: another process may recreate the directory in between
        os.makedirs(self.root_tmp, exist_ok=True)
        logger.info('All decompressed batches removed')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.release_mmap()

    def release_mmap(self):
        """
        Release all memory mapped dataset.
        """
        logger = logging.getLogger(_l(__name__, self, 'release_mmap'))
        # snapshot the keys: the cache is mutated while we walk over it
        for k in list(self.mmap_cache.keys()):
            del self.mmap_cache[k]
        logger.info('All mmap released')

    def locate_batch(self, frame_id):
        """
        Locate the data batch the specified frame is stored.

        The relative frame ID is expressed as a *negative* offset from the
        end of the batch (``-len(batch)`` is the first frame of the batch,
        ``-1`` the last one), which indexes the batch array correctly.

        :param frame_id: the frame ID
        :type frame_id: int
        :return: the batch ID and the relative frame ID
        :rtype: Tuple[int, int]
        :raise IndexError: if ``frame_id`` lies beyond the last batch
        """
        # first batch whose cumulative length exceeds frame_id
        batch_id = bisect.bisect_left(self.lens_cumsum, frame_id + 1)
        if batch_id >= len(self.lens_cumsum):
            total = self.lens_cumsum[-1] if self.lens_cumsum else 0
            raise IndexError(
                'frame_id {} out of range: dataset holds {} frames in {} '
                'batches'.format(frame_id, total, len(self.lens_cumsum)))
        rel_frame_id = frame_id - self.lens_cumsum[batch_id]
        return batch_id, rel_frame_id

    def batch_filename_by_id(self, batch_id, gzipped=False):
        """
        Returns the data batch filename of the specified batch ID.

        :param batch_id: the batch ID
        :type batch_id: int
        :param gzipped: True to returns the gzipped file; else the extracted
               file
        :type gzipped: bool
        :return: the data batch filename
        :rtype: str
        """
        if gzipped:
            return os.path.join(self.root, 'data_batch_{}.gz'.format(batch_id))
        return os.path.join(self.root_tmp, 'data_batch_{}'.format(batch_id))
class Search:
    """Ranked keyword search with an LRU cache of paginated result lists."""

    def __init__(self, cfg):
        """Keep the configuration and set up stopwords, stemmer and cache."""
        self.cfg = cfg
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.index = []
        self.stemmer = nltk.stem.PorterStemmer()
        # cached dictionary: query key -> list of pages of doc ids
        self.searched_results = LRUCache(self.cfg['SEARCH_CACHE_SIZE'])

    def search(self, q, query_type=None):
        """Search by query.

        ``q['pageNum']`` should start from 1; page 0 is the empty page that
        is served when a query has no results. In that case ``q['pageNum']``
        is overwritten with 0 on the caller's dictionary.

        Arguments:
            q -- query dictionary; keys are "keyword", "pageNum",
                 "range" (years) and "category"

        Keyword Arguments:
            query_type -- not used by this method (default: {None})

        Returns:
            dict -- "docs": the requested page, "results": the total number
                    of cached doc ids for this query
        """
        debug = self.cfg['DEBUG_PRINT']
        if debug:
            print("Query:", q)
            print("Cache:", list(self.searched_results.keys()))

        words = preprocessing(self.stemmer, q['keyword'], self.stopwords)
        key = ''.join([' '.join(words), str(q['range']), str(q['category'])])

        if key not in self.searched_results:
            ranked_ids = self._rank_for_query(q, words)
            self.searched_results[key] = self._paginate(ranked_ids)

        pages = self.searched_results[key]
        if len(pages) == 1:
            # only the placeholder page exists: nothing was found
            q['pageNum'] = 0
        if debug:
            print(key, pages, q['pageNum'])

        doc_list = pages[q['pageNum']]
        if self.cfg['RUN_SERVER']:
            fetched = get_doc(doc_list)
            results = {doc_id: fetched[doc_id] for doc_id in doc_list}
        else:
            results = str(doc_list)
        return {"docs": results,
                "results": sum(len(page) for page in pages)}

    def _rank_for_query(self, q, words):
        """Fetch, category-filter and rank the postings of ``words``."""
        # TODO see if prepro is used correctly
        term_pls = [get_posting_list(w) for w in words]
        total_docs = get_doc_numbers()
        df = [pl.get_doc_freq() for pl in term_pls]
        pls = [pl.get_postings(q['range']) for pl in term_pls]

        # Filter categories; categories are assumed to be abbreviations
        if q['category']:
            raw_cats = q["category"].split(self.cfg['CAT_SPLIT_SYMB'])
            cats = [get_cat_tag(c.strip(',')) for c in raw_cats]
            cat_pls = [get_posting_list(c).get_postings(q['range'])
                       for c in cats]
            allowed = set().union(*(set(p) for p in cat_pls))
            pls = self.boolean_search(pls, allowed)
            # if a term does not appear in a specific category, no need to
            # search among these
            survivors = [(freq, pl) for freq, pl in zip(df, pls) if pl]
            df = [freq for freq, _ in survivors]
            pls = [pl for _, pl in survivors]

        df = np.array(df)
        idf = np.log10((total_docs - df + .5) / (df + .5))
        # Search the rest posting lists
        return self.ranked_search(pls, idf)

    def _paginate(self, doc_ids):
        """Split ``doc_ids`` into pages; index 0 is the empty page."""
        per_page = self.cfg['SEARCH_RESULTS_PER_PAGE']
        pages = [[]]  # 0 for no results, page number starts from 1
        for start in range(0, len(doc_ids), per_page):
            pages.append(doc_ids[start:start + per_page])
        return pages

    def boolean_search(self, candidate: List[List[PostingElement]],
                       must_in: Set[PostingElement]
                       ) -> List[List[PostingElement]]:
        """Keep, per posting list, only elements whose doc is in must_in.

        Arguments:
            candidate -- posting lists to filter
            must_in -- postings whose ``doc_id`` values form the allow-set

        Returns:
            List[List[PostingElement]] -- one filtered list per input list,
            in the same order (possibly empty)
        """
        wanted_docs = {posting.doc_id for posting in must_in}
        return [[ele for ele in pl if ele.doc_id in wanted_docs]
                for pl in candidate]

    def get_BM25_score(self, posting_lists: List[List[PostingElement]],
                       doc_id_to_idx, idf):
        """Return one summed per-term score for every indexed document.

        Row ``doc_id_to_idx[doc_id]`` of the result belongs to ``doc_id``;
        ``idf`` holds one weight per posting list.
        """
        n_terms = len(posting_lists)
        n_docs = len(doc_id_to_idx)
        avg_len = get_average_word_count()

        # document length relative to the average document length
        rel_len = np.zeros(n_docs)
        for doc_id, row in doc_id_to_idx.items():
            rel_len[row] = get_doc_word_count(doc_id) / avg_len

        k = self.cfg['BM25_COEFF']
        tf = np.zeros([n_docs, n_terms])
        for col, postings in enumerate(posting_lists):
            for posting in postings:
                tf[doc_id_to_idx[posting.doc_id], col] = \
                    posting.get_term_freq()

        # rel_len is broadcast over the term axis
        scaled = tf / (tf + .5 + k * rel_len[:, None])
        return (scaled * idf).sum(axis=1)

    def ranked_search(self, posting_lists: List[List[PostingElement]], idf,
                      method = 'BM25') -> List[str]:
        '''
        Rank every document that occurs in ``posting_lists``.

        :param posting_lists: one posting list per query term
        :param idf: one weight per posting list
        :param method: scoring method; every value currently selects BM25
        :return: doc ids ordered by descending score, at most
                 ``cfg["SEARCH_RESULTS_KEEP"]`` of them
        '''
        all_doc_ids = sorted({d.doc_id for p in posting_lists for d in p})
        if not all_doc_ids:
            return []

        doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(all_doc_ids)}
        doc_score = self.get_BM25_score(posting_lists, doc_id_to_idx, idf)
        keep = min(len(all_doc_ids), self.cfg["SEARCH_RESULTS_KEEP"])
        best_first = np.argsort(doc_score)[::-1][:keep].tolist()
        return [all_doc_ids[i] for i in best_first]
pool_ids = set() event_ids = set() entry_ids = set() remote_ids = set() user_ids = set() for entry in entries: if 'userId' in entry and bot_ids.get(entry['userId']) is None: pool_ids.add(str(entry.get('poolId'))) event_ids.add(str(entry.get('eventId'))) entry_ids.add(str(entry.get('_id'))) user_ids.add(entry.get('userId')) missing_pool_ids = [ objectid.ObjectId(p) for p in list(pool_ids.difference(set(all_pool_ids.keys()))) ] missing_event_ids = [ objectid.ObjectId(e) for e in list(event_ids.difference(set(all_event_ids.keys()))) ] if len(missing_pool_ids) > 0: for pool in pools_coll.find({'_id': {"$in": missing_pool_ids}}): all_pool_ids[str(pool.get("_id"))] = pool if len(missing_event_ids) > 0: for event in events_coll.find({'_id': {"$in": missing_event_ids}}): all_event_ids[str(event.get("_id"))] = event if event.get("discipline") == "HR" and "remoteId" in event: all_remote_ids[event["remoteId"]] = event.get("metaData") else:
class SynchronizerComponent:
    """
    Holds locally created/modified blocks, vlobs and the user vlob until
    they are pushed to the backend, plus caches of data already known to
    the backend.

    ``self.blocks`` / ``self.vlobs`` / ``self.user_vlob`` contain local data
    that has not been synchronized yet; the ``*_cache`` members hold data
    that was read from, or already pushed to, the backend.

    The ``perform_*`` methods are intent handlers registered in
    ``get_dispatcher``. They are wrapped with ``@do`` and obtain backend
    results with ``yield Effect(...)`` (presumably the ``effect`` library's
    ``do`` decorator -- confirm against the file's imports).
    """

    def __init__(self, cache_size):
        """
        :param cache_size: ``maxsize`` given to each of the three caches
        """
        self.block_cache = LRUCache(maxsize=cache_size)
        self.user_vlob_cache = LRUCache(maxsize=cache_size)
        self.vlob_cache = LRUCache(maxsize=cache_size)
        # local, not yet synchronized data
        self.blocks = {}
        self.vlobs = {}
        self.user_vlob = None
        # used both as the polling period and as the minimum idle time
        # before a synchronization is triggered (see
        # periodic_synchronization); presumably seconds, as it is passed to
        # asyncio.sleep
        self.synchronization_idle_interval = 1
        self.synchronization_task = None
        # refreshed by every create/update/delete handler
        self.last_modified = arrow.utcnow()

    async def startup(self, app):
        """Schedule the background periodic synchronization task."""
        self.synchronization_task = asyncio.ensure_future(self.periodic_synchronization(app))

    async def shutdown(self, app):
        """Cancel the background synchronization task, if one was started."""
        if self.synchronization_task:
            self.synchronization_task.cancel()
            self.synchronization_task = None

    @do
    def perform_block_create(self, intent):
        """Store ``intent.content`` as a new local block; return its new id."""
        self.last_modified = arrow.utcnow()
        block_id = uuid4().hex
        self.blocks[block_id] = {'id': block_id, 'content': intent.content}
        return block_id

    @do
    def perform_block_read(self, intent):
        """
        Return the block ``intent.id``, looking in the local blocks, then
        the cache, then the backend (caching a backend hit).

        :raise BlockNotFound: if the backend read raises ``BlockNotFound``
                              or ``BlockError``
        """
        try:
            return self.blocks[intent.id]
        except KeyError:
            try:
                return self.block_cache[intent.id]
            except KeyError:
                try:
                    block = yield Effect(EBackendBlockRead(intent.id))
                    block = {'id': block.id, 'content': block.content}
                except (BlockNotFound, BlockError):
                    raise BlockNotFound('Block not found.')
                try:
                    self.block_cache[intent.id] = block
                except ValueError:
                    pass  # Value too large if cache is disabled
                return block

    @do
    def perform_block_delete(self, intent):
        """
        Delete the block ``intent.id`` from the local blocks or, failing
        that, from the cache. The backend is not contacted.

        :raise BlockNotFound: if the id is in neither place
        """
        self.last_modified = arrow.utcnow()
        try:
            del self.blocks[intent.id]
        except KeyError:
            try:
                del self.block_cache[intent.id]
            except KeyError:
                raise BlockNotFound('Block not found.')

    @do
    def perform_block_list(self, intent):
        """Return the sorted ids of the local (unsynchronized) blocks."""
        return sorted([block_id for block_id in list(self.blocks.keys())])

    @do
    def perform_block_synchronize(self, intent):
        """
        Push the local block ``intent.id`` to the backend and move it from
        the local blocks to the cache.

        :return: True if a local block was pushed, False if there was none
        """
        if intent.id in self.blocks:
            block = self.blocks[intent.id]
            yield Effect(EBackendBlockCreate(intent.id, block['content']))
            try:
                self.block_cache[intent.id] = block
            except ValueError:
                pass  # Value too large if cache is disabled
            del self.blocks[intent.id]
            return True
        return False

    @do
    def perform_user_vlob_read(self, intent):
        """
        Return the user vlob as a ``{'blob', 'version'}`` dict.

        The local user vlob is returned when it exists and either no version
        is requested or the requested version matches; otherwise the cache
        (keyed by ``intent.version``) and then the backend are consulted. A
        backend result is cached under the version the backend reported.
        """
        if self.user_vlob and (not intent.version or intent.version == self.user_vlob['version']):
            return self.user_vlob
        else:
            try:
                return self.user_vlob_cache[intent.version]
            except KeyError:
                user_vlob = yield Effect(EBackendUserVlobRead(intent.version))
                user_vlob = {'blob': user_vlob.blob.decode(), 'version': user_vlob.version}
                try:
                    self.user_vlob_cache[user_vlob['version']] = user_vlob
                except ValueError:
                    pass  # Value too large if cache is disabled
                return user_vlob

    @do
    def perform_user_vlob_update(self, intent):
        """Replace the local user vlob with ``intent.blob``/``intent.version``."""
        self.last_modified = arrow.utcnow()
        self.user_vlob = {'blob': intent.blob, 'version': intent.version}

    @do
    def perform_user_vlob_delete(self, intent):
        """
        Drop the local user vlob when it matches ``intent.version`` (or no
        version is given); otherwise drop that version from the cache.

        :raise UserVlobNotFound: if the version is not in the cache either
        """
        self.last_modified = arrow.utcnow()
        if self.user_vlob and (not intent.version or intent.version == self.user_vlob['version']):
            self.user_vlob = None
        else:
            try:
                del self.user_vlob_cache[intent.version]
            except KeyError:
                raise UserVlobNotFound('User vlob not found.')

    @do
    def perform_user_vlob_exist(self, intent):
        """Return True if there is a local (unsynchronized) user vlob."""
        return self.user_vlob is not None

    @do
    def perform_user_vlob_synchronize(self, intent):
        """
        Push the local user vlob to the backend and move it into the cache.

        :return: True if a local user vlob was pushed, False otherwise
        """
        if self.user_vlob:
            yield Effect(EBackendUserVlobUpdate(self.user_vlob['version'], self.user_vlob['blob'].encode()))
            try:
                self.user_vlob_cache[self.user_vlob['version']] = self.user_vlob
            except ValueError:
                pass  # Value too large if cache is disabled
            self.user_vlob = None
            return True
        return False

    @do
    def perform_vlob_create(self, intent):
        """
        Create a local vlob at version 1 holding ``intent.blob``.

        Both trust seeds are set to the placeholder ``'42'``.

        :return: dict with the new ``id`` and the two placeholder seeds
        """
        self.last_modified = arrow.utcnow()
        vlob_id = uuid4().hex
        self.vlobs[vlob_id] = {'id': vlob_id, 'read_trust_seed': '42', 'write_trust_seed': '42', 'version': 1, 'blob': intent.blob}
        return {'id': vlob_id, 'read_trust_seed': '42', 'write_trust_seed': '42'}

    @do
    def perform_vlob_read(self, intent):
        """
        Return the vlob ``intent.id`` as an ``{'id', 'blob', 'version'}``
        dict, from the local vlobs, the cache or the backend, in that order.

        The cache is only consulted when ``intent.version`` is given; a
        backend result is cached under ``(intent.id, intent.version)``.

        NOTE(review): the trust seed is checked with ``assert``, which is
        skipped under ``python -O`` -- confirm this is acceptable.
        """
        if (intent.id in self.vlobs and
                (not intent.version or intent.version == self.vlobs[intent.id]['version'])):
            # TODO: remove this mystic 42
            # '42' is the placeholder written by perform_vlob_create and
            # perform_vlob_update; the first read replaces it with the
            # caller's seed
            if self.vlobs[intent.id]['read_trust_seed'] == '42':
                self.vlobs[intent.id]['read_trust_seed'] = intent.trust_seed
            assert intent.trust_seed == self.vlobs[intent.id]['read_trust_seed']
            return {'id': intent.id, 'blob': self.vlobs[intent.id]['blob'], 'version': self.vlobs[intent.id]['version']}
        else:
            if intent.version is not None:
                try:
                    cached_vlob = self.vlob_cache[(intent.id, intent.version)]
                    assert intent.trust_seed == cached_vlob['read_trust_seed']
                    vlob = {'id': intent.id, 'blob': cached_vlob['blob'], 'version': intent.version}
                    return vlob
                except KeyError:
                    pass  # cache miss
            vlob = yield Effect(EBackendVlobRead(intent.id, intent.trust_seed, intent.version))
            vlob = {'id': vlob.id, 'blob': vlob.blob.decode(), 'version': vlob.version}
            try:
                # NOTE(review): the key and the stored 'version' use
                # intent.version (possibly None), not the version returned
                # by the backend -- confirm this is intended
                cached_vlob = {'id': intent.id, 'read_trust_seed': intent.trust_seed, 'version': intent.version, 'blob': vlob['blob']}
                self.vlob_cache[(intent.id, intent.version)] = cached_vlob
            except ValueError:
                pass  # Value too large if cache is disabled
            return vlob

    @do
    def perform_vlob_update(self, intent):
        """
        Store ``intent.blob`` as the local vlob ``intent.id`` at
        ``intent.version``, replacing any previous local entry; the read
        trust seed is reset to the placeholder ``'42'``.
        """
        self.last_modified = arrow.utcnow()
        self.vlobs[intent.id] = {'id': intent.id, 'read_trust_seed': '42', 'write_trust_seed': intent.trust_seed, 'version': intent.version, 'blob': intent.blob}

    @do
    def perform_vlob_delete(self, intent):
        """
        Drop the local vlob ``intent.id`` when it matches ``intent.version``
        (or no version is given); otherwise drop
        ``(intent.id, intent.version)`` from the cache.

        :raise VlobNotFound: if that key is not in the cache either
        """
        self.last_modified = arrow.utcnow()
        if (intent.id in self.vlobs and
                (not intent.version or intent.version == self.vlobs[intent.id]['version'])):
            del self.vlobs[intent.id]
        else:
            try:
                del self.vlob_cache[(intent.id, intent.version)]
            except KeyError:
                raise VlobNotFound('Vlob not found.')

    @do
    def perform_vlob_list(self, intent):
        """Return the sorted ids of the local (unsynchronized) vlobs."""
        return sorted(self.vlobs.keys())

    @do
    def perform_vlob_synchronize(self, intent):
        """
        Push the local vlob ``intent.id`` to the backend and remove it from
        the local vlobs.

        A version-1 vlob is created on the backend; any other version is
        sent as an update.

        :return: for a created vlob, a dict with the backend's ``id`` and
                 trust seeds; True for an update; False if there is no local
                 vlob with that id
        """
        if intent.id in self.vlobs:
            vlob = self.vlobs[intent.id]
            try:
                self.vlob_cache[(intent.id, vlob['version'])] = vlob
            except ValueError:
                pass  # Value too large if cache is disabled
            new_vlob = None
            if vlob['version'] == 1:
                new_vlob = yield Effect(EBackendVlobCreate(vlob['blob'].encode()))
                new_trust_seed = new_vlob.read_trust_seed
                try:
                    # replace the seed of the cached entry with the one
                    # returned by the backend
                    self.vlob_cache[(intent.id, vlob['version'])]['read_trust_seed'] = new_trust_seed
                except KeyError:
                    pass
            else:
                yield Effect(EBackendVlobUpdate(
                    intent.id,
                    self.vlobs[intent.id]['write_trust_seed'],
                    self.vlobs[intent.id]['version'],
                    vlob['blob'].encode()))  # TODO encode is correct?
            del self.vlobs[intent.id]
            if new_vlob:
                return {'id': new_vlob.id, 'read_trust_seed': new_vlob.read_trust_seed, 'write_trust_seed': new_vlob.write_trust_seed}
            else:
                return True
        return False

    @do
    def perform_synchronize(self, intent):
        """
        Synchronize every local block, every local vlob and the user vlob.

        :return: True if at least one item was synchronized
        """
        # TODO dangerous method: new vlobs are not updated in manifest. Remove it?
        synchronization = False
        block_list = yield self.perform_block_list(EBlockList())
        for block_id in block_list:
            synchronization |= yield self.perform_block_synchronize(EBlockSynchronize(block_id))
        vlob_list = yield self.perform_vlob_list(EVlobList())
        for vlob_id in vlob_list:
            new_vlob = yield self.perform_vlob_synchronize(EVlobSynchronize(vlob_id))
            # perform_vlob_synchronize returns a dict or True on success
            if new_vlob:
                synchronization |= True
        synchronization |= yield self.perform_user_vlob_synchronize(EUserVlobSynchronize())
        return synchronization

    @do
    def perform_cache_clean(self, intent):
        """Empty the block, user vlob and vlob caches."""
        # keys are snapshotted with list() because entries are deleted
        # while looping
        for item in list(self.block_cache.keys()):
            del self.block_cache[item]
        for item in list(self.user_vlob_cache.keys()):
            del self.user_vlob_cache[item]
        for item in list(self.vlob_cache.keys()):
            del self.vlob_cache[item]

    async def periodic_synchronization(self, app):
        """
        Loop forever: sleep ``synchronization_idle_interval``, then trigger
        a full synchronization if nothing was modified for longer than that
        interval.
        """
        # TODO: find a better way to do this than using asyncio_perform...
        while True:
            await asyncio.sleep(self.synchronization_idle_interval)
            # NOTE(review): ``.timestamp`` is read as an attribute here; in
            # arrow >= 1.0 it is a method, which would make this
            # subtraction fail -- confirm the pinned arrow version
            if (arrow.utcnow().timestamp - self.last_modified.timestamp >
                    self.synchronization_idle_interval):
                await asyncio_perform(
                    app.components.get_dispatcher(),
                    Effect(fs.ESynchronize()))

    def get_dispatcher(self):
        """Return the dispatcher mapping each intent type to its handler."""
        return TypeDispatcher({
            EBlockCreate: self.perform_block_create,
            EBlockRead: self.perform_block_read,
            EBlockDelete: self.perform_block_delete,
            EBlockList: self.perform_block_list,
            EBlockSynchronize: self.perform_block_synchronize,
            EUserVlobRead: self.perform_user_vlob_read,
            EUserVlobUpdate: self.perform_user_vlob_update,
            EUserVlobDelete: self.perform_user_vlob_delete,
            EUserVlobExist: self.perform_user_vlob_exist,
            EUserVlobSynchronize: self.perform_user_vlob_synchronize,
            EVlobCreate: self.perform_vlob_create,
            EVlobRead: self.perform_vlob_read,
            EVlobUpdate: self.perform_vlob_update,
            EVlobDelete: self.perform_vlob_delete,
            EVlobList: self.perform_vlob_list,
            EVlobSynchronize: self.perform_vlob_synchronize,
            ESynchronize: self.perform_synchronize
        })