def buildSearchIndices(project_number, skip_localizations=False):
    """ Builds search index for all data.

    :param project_number: Primary key of the project whose data is indexed.
    :param skip_localizations: When True, localization documents are not built.
    """
    # Create indices
    logger.info("Building index...")
    TatorSearch().create_index(project_number)

    # Create mappings
    logger.info("Building mappings...")
    for attribute_type in progressbar(list(AttributeTypeBase.objects.filter(project=project_number))):
        TatorSearch().create_mapping(attribute_type)

    # Build documents for each entity family, in the same order as before:
    # media, localizations (optional), states, tree leaves.
    for label, model in (
        ("media", EntityMediaBase),
        ("localization", EntityLocalizationBase),
        ("state", EntityState),
        ("tree leaf", TreeLeaf),
    ):
        if label == "localization" and skip_localizations:
            logger.info("Skipping localization documents...")
            continue
        logger.info(f"Building {label} documents...")
        for entity in progressbar(list(model.objects.filter(project=project_number))):
            TatorSearch().create_document(entity)
def buildSearchIndices(project_number, section, mode='index'):
    """ Builds search index for a project.

    :param project_number: Primary key of the project to index.
    :param section: Which part of the index to build. Must be one of:
        'index' - create the index for the project if it does not exist
        'mappings' - create mappings for the project if they do not exist
        'media' - create documents for media
        'states' - create documents for states
        'localizations' - create documents for localizations
        'treeleaves' - create documents for treeleaves
    :param mode: Forwarded to TatorSearch().build_document for each entity.
    :raises ValueError: If section is not one of the values listed above.
        (Previously an unknown section fell through to a NameError on qs.)
    """
    project_name = Project.objects.get(pk=project_number).name
    logger.info(
        f"Building search indices for project {project_number}: {project_name}"
    )

    if section == 'index':
        # Create indices
        logger.info("Building index...")
        TatorSearch().create_index(project_number)
        logger.info("Build index complete!")
        return

    if section == 'mappings':
        # Create mappings, one per attribute-bearing type in the project.
        logger.info("Building mappings for media types...")
        for type_ in progressbar(
                list(MediaType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Building mappings for localization types...")
        for type_ in progressbar(
                list(LocalizationType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Building mappings for state types...")
        for type_ in progressbar(
                list(StateType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Building mappings for leaf types...")
        for type_ in progressbar(
                list(LeafType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Build mappings complete!")
        return

    # Select the queryset for the requested document section.
    if section == 'media':
        # Create media documents
        logger.info("Building media documents...")
        qs = Media.objects.filter(project=project_number)
    elif section == 'localizations':
        # Create localization documents
        logger.info("Building localization documents")
        qs = Localization.objects.filter(project=project_number)
    elif section == 'states':
        # Create state documents
        logger.info("Building state documents...")
        qs = State.objects.filter(project=project_number)
    elif section == 'treeleaves':
        # Create treeleaf documents
        logger.info("Building tree leaf documents...")
        qs = Leaf.objects.filter(project=project_number)
    else:
        raise ValueError(f"Invalid section '{section}'!")

    class DeferredCall:
        """Lazily yields bulk-index actions so the queryset streams via iterator()."""
        def __init__(self, qs):
            self._qs = qs

        def __call__(self):
            for entity in self._qs.iterator():
                for doc in TatorSearch().build_document(entity, mode):
                    yield doc

    batch_size = 500
    count = 0
    bar = ProgressBar(redirect_stderr=True, redirect_stdout=True)
    dc = DeferredCall(qs)
    total = qs.count()
    bar.start(max_value=total)
    for ok, result in streaming_bulk(TatorSearch().es, dc(),
                                     chunk_size=batch_size,
                                     raise_on_error=False):
        action, result = result.popitem()
        if not ok:
            print(f"Failed to {action} document! {result}")
        # Increment before updating so the bar reflects completed documents
        # (previously it was updated with the stale count, lagging by one).
        count += 1
        bar.update(min(count, total))
    if count > total:
        # BUG FIX: the overshoot was printed as total - count (negative).
        print(f"Count exceeds list size by {count - total}")
    bar.finish()
def __call__(self):
    """Yield ES documents built from each entity in the wrapped queryset.

    Uses iterator() so the queryset streams rather than being fully
    materialized; `mode` is a free variable captured from the enclosing scope.
    """
    for item in self._qs.iterator():
        yield from TatorSearch().build_document(item, mode)
def buildSearchIndices(project_number, section, mode='index', chunk=None, max_age_days=None):
    """ Builds search index for a project.

    :param project_number: Primary key of the project to index.
    :param section: Which part of the index to build. Must be one of:
        'index' - create the index for the project if it does not exist
        'mappings' - create mappings for the project if they do not exist
        'media' - create documents for media
        'states' - create documents for states
        'localizations' - create documents for localizations
        'treeleaves' - create documents for treeleaves
        'files' - create documents for files
        Document sections are resolved through CLASS_MAPPING; an unknown
        section raises KeyError (behavior preserved from the original).
    :param mode: Forwarded to TatorSearch().build_document for each entity.
    :param chunk: When given, only index the chunk-th slice of INDEX_CHUNK_SIZE
        objects (ordered by id).
    :param max_age_days: When given, only index objects modified within the
        last max_age_days days.
    """
    project_name = Project.objects.get(pk=project_number).name
    logger.info(
        f"Building search indices for project {project_number}: {project_name}"
    )

    if section == 'index':
        # Create indices
        logger.info("Building index...")
        TatorSearch().create_index(project_number)
        logger.info("Build index complete!")
        return

    if section == 'mappings':
        # Create mappings, one per attribute-bearing type in the project.
        logger.info("Building mappings for media types...")
        for type_ in progressbar(
                list(MediaType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Building mappings for localization types...")
        for type_ in progressbar(
                list(LocalizationType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Building mappings for state types...")
        for type_ in progressbar(
                list(StateType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Building mappings for leaf types...")
        for type_ in progressbar(
                list(LeafType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Building mappings for file types...")
        for type_ in progressbar(
                list(FileType.objects.filter(project=project_number))):
            TatorSearch().create_mapping(type_)
        logger.info("Build mappings complete!")
        return

    class DeferredCall:
        """Lazily yields bulk-index actions so the queryset streams via iterator()."""
        def __init__(self, qs):
            self._qs = qs

        def __call__(self):
            for entity in self._qs.iterator():
                # Soft-deleted objects are skipped rather than indexed.
                if not entity.deleted:
                    for doc in TatorSearch().build_document(entity, mode):
                        yield doc

    # Get queryset based on selected section.
    logger.info(f"Building documents for {section}...")
    qs = CLASS_MAPPING[section].objects.filter(project=project_number, meta__isnull=False)

    # Apply max age filter.
    if max_age_days:
        # NOTE(review): naive local time — assumes modified_datetime is stored
        # naive/local as well; confirm against the model definition.
        min_modified = datetime.datetime.now() - datetime.timedelta(
            days=max_age_days)
        qs = qs.filter(modified_datetime__gte=min_modified)

    # Apply limit/offset if chunk parameter given.
    if chunk is not None:
        offset = INDEX_CHUNK_SIZE * chunk
        qs = qs.order_by('id')[offset:offset + INDEX_CHUNK_SIZE]

    batch_size = 500
    count = 0
    bar = ProgressBar(redirect_stderr=True, redirect_stdout=True)
    dc = DeferredCall(qs)
    total = qs.count()
    bar.start(max_value=total)
    for ok, result in streaming_bulk(TatorSearch().es, dc(),
                                     chunk_size=batch_size,
                                     raise_on_error=False):
        action, result = result.popitem()
        if not ok:
            print(f"Failed to {action} document! {result}")
        # Increment before updating so the bar reflects completed documents
        # (previously it was updated with the stale count, lagging by one).
        count += 1
        bar.update(min(count, total))
    if count > total:
        # BUG FIX: the overshoot was printed as total - count (negative).
        print(f"Count exceeds list size by {count - total}")
    bar.finish()