def elasticsearch_status(request):
    client = get_es_client()

    # get index snapshots
    response = requests.get("http://{0}:{1}/_snapshot/{2}/_all".format(
        settings.ELASTICSEARCH_SERVICE_HOSTNAME, settings.ELASTICSEARCH_PORT, "callsets"))
    snapshots = json.loads(response.content)

    index_snapshot_states = defaultdict(list)
    for snapshot in snapshots["snapshots"]:
        for index_name in snapshot["indices"]:
            index_snapshot_states[index_name].append(snapshot["state"])

    # get indices
    indices = []
    for index in client.cat.indices(format="json", h="*"):
        index_name = index['index']

        # skip special indices
        if index_name in ['.kibana', 'index_operations_log']:
            continue

        index_json = {k.replace('.', '_'): v for k, v in index.items()}

        index_name = re.sub("_[0-9]{1,2}$", "", index_name)
        sample = Sample.objects.filter(elasticsearch_index=index_name).select_related('individual__family__project').first()
        if sample:
            project = sample.individual.family.project
            index_json['project_guid'] = project.guid
            index_json['project_id'] = project.deprecated_project_id
            index_json['dataset_type'] = sample.sample_type
            index_json['genome_version'] = project.genome_version
            index_json['dataset_file_path'] = sample.dataset_file_path

        if index_name in index_snapshot_states:
            index_json['snapshots'] = ", ".join(set(index_snapshot_states[index_name]))

        indices.append(index_json)

    # get operations log
    s = elasticsearch_dsl.Search(using=client, index=OPERATIONS_LOG)
    s = s.params(size=5000)
    operations = [doc.to_dict() for doc in s.execute().hits]

    # build a new list since the dots in the es client keys confuse the template
    disk_status = []
    for disk in client.cat.allocation(format="json"):
        disk_json = {k.replace('.', '_'): v for k, v in disk.items()}
        disk_status.append({
            'node_name': disk_json['node'],
            'disk_available': disk_json['disk_avail'],
            'disk_used': disk_json['disk_used'],
            'disk_percent_used': disk_json['disk_percent'],
        })

    return render(request, "staff/elasticsearch_status.html", {
        'indices': indices,
        'operations': operations,
        'disk_stats': disk_status,
        'elasticsearch_host': settings.ELASTICSEARCH_SERVER,
    })

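# For reference, a trimmed illustration of the _snapshot/<repo>/_all response
# consumed above (field names follow the Elasticsearch snapshot API; the
# values are made up):
#
#   {
#       "snapshots": [
#           {"snapshot": "snap_2018_01_01", "indices": ["callsets_1"], "state": "SUCCESS"},
#           {"snapshot": "snap_2018_02_01", "indices": ["callsets_1", "callsets_2"], "state": "IN_PROGRESS"}
#       ]
#   }
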
def get_elasticsearch_index_samples(elasticsearch_index):
    es_client = get_es_client()

    index_metadata = get_index_metadata(elasticsearch_index, es_client).get(elasticsearch_index)

    s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index)
    s = s.params(size=0)
    s.aggs.bucket('sample_ids', elasticsearch_dsl.A('terms', field='samples_num_alt_1', size=10000))
    response = s.execute()
    return [agg['key'] for agg in response.aggregations.sample_ids.buckets], index_metadata

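# A minimal sketch of the request body the elasticsearch_dsl calls above
# translate to, assuming the standard terms-aggregation syntax; handy for
# reproducing the query with curl or the low-level client:
SAMPLE_ID_AGG_BODY = {
    'size': 0,  # skip the hits, only the aggregation buckets are needed
    'aggs': {
        'sample_ids': {
            'terms': {'field': 'samples_num_alt_1', 'size': 10000},
        },
    },
}
# e.g. es_client.search(index=elasticsearch_index, body=SAMPLE_ID_AGG_BODY)
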
def handle(self, *args, **options):
    # handle() of a Django management command: flags projects whose latest
    # loaded samples live in new-search (samples_num_alt_1) indices
    client = get_es_client()
    indices = [index['index'] for index in client.cat.indices(format="json", h='index')
               if index['index'] not in ['.kibana', 'index_operations_log']]
    mappings = Index('_all', using=client).get_mapping(doc_type='variant')

    new_search_indices = {index_name for index_name in indices
                          if 'samples_num_alt_1' in mappings[index_name]['mappings']['variant']['properties']}

    latest_loaded_samples = get_latest_loaded_samples()
    project_ids_with_new_search = set()
    for sample in latest_loaded_samples:
        for index_name in sample.elasticsearch_index.split(','):
            if index_name in new_search_indices:
                project_ids_with_new_search.add(sample.individual.family.project_id)

    Project.objects.filter(id__in=project_ids_with_new_search).update(has_new_search=True)
    logger.info('Set new search enabled for {} projects'.format(len(project_ids_with_new_search)))

def _get_elasticsearch_index_samples(elasticsearch_index):
    sample_field_suffix = '_num_alt'

    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=get_es_client())
    try:
        field_mapping = index.get_field_mapping(fields=['*{}'.format(sample_field_suffix)], doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index in field_mapping.values():
        samples.update([key.split(sample_field_suffix)[0] for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))
    return samples

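# The loop above assumes the pre-7.x field-mapping response shape, keyed by
# index, then "mappings", then doc type, then field name; a trimmed
# illustration (index and sample names are made up):
#
#   {
#       "my_index": {
#           "mappings": {
#               "variant": {                    # VARIANT_DOC_TYPE
#                   "sample_1_num_alt": {...},  # one field per sample
#                   "sample_2_num_alt": {...}
#               }
#           }
#       }
#   }
#
# so key.split('_num_alt')[0] recovers the sample id from each field name.
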
def _get_elasticsearch_index_samples(elasticsearch_index, project):
    sample_field_suffix = '_num_alt'

    es_client = get_es_client(timeout=30)
    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix), 'join_field'], doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    # Nested genotypes
    if field_mapping.get(elasticsearch_index, {}).get('mappings', {}).get(VARIANT_DOC_TYPE, {}).get('join_field'):
        max_samples = Individual.objects.filter(family__project=project).count()
        s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket('sample_ids', elasticsearch_dsl.A('terms', field='sample_id', size=max_samples))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    samples = set()
    for index in field_mapping.values():
        samples.update([key.split(sample_field_suffix)[0] for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))
    return samples

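# The join_field check above detects indices that store genotypes as child
# documents via Elasticsearch's parent/child join datatype; a trimmed,
# hypothetical illustration of such a mapping (relation names assumed):
#
#   {"mappings": {"variant": {"properties": {
#       "join_field": {"type": "join", "relations": {"variant": "genotype"}},
#       "sample_id": {"type": "keyword"}
#   }}}}
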
def _get_elasticsearch_index_samples(elasticsearch_index):
    es_client = get_es_client()

    # Nested genotypes
    if is_nested_genotype_index(elasticsearch_index):
        s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket('sample_ids', elasticsearch_dsl.A('terms', field='samples_num_alt_1', size=10000))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    sample_field_suffix = '_num_alt'

    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=es_client)
    try:
        field_mapping = index.get_field_mapping(fields=['*{}'.format(sample_field_suffix)], doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index in field_mapping.values():
        samples.update([key.split(sample_field_suffix)[0] for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))
    return samples

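# is_nested_genotype_index is defined elsewhere in seqr; judging by the
# equivalent checks in the surrounding versions ('samples_num_alt_1' in the
# variant mapping properties), a minimal sketch of the idea might be:
def _is_nested_genotype_index_sketch(elasticsearch_index):
    # hypothetical stand-in, not the actual seqr implementation
    mapping = elasticsearch_dsl.Index(elasticsearch_index, using=get_es_client()).get_mapping(doc_type='variant')
    properties = mapping.get(elasticsearch_index, {}).get('mappings', {}).get('variant', {}).get('properties', {})
    return 'samples_num_alt_1' in properties
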
def elasticsearch_status(request):
    client = get_es_client()

    disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent']
    disk_status = [{
        _to_camel_case(field.replace('.', '_')): disk[field] for field in disk_fields
    } for disk in client.cat.allocation(format="json", h=','.join(disk_fields))]

    index_fields = ['index', 'docs.count', 'store.size', 'creation.date.string']
    indices = [{
        _to_camel_case(field.replace('.', '_')): index[field] for field in index_fields
    } for index in client.cat.indices(format="json", h=','.join(index_fields))
        if index['index'] not in ['.kibana', 'index_operations_log']]

    aliases = defaultdict(list)
    for alias in client.cat.aliases(format="json", h='alias,index'):
        aliases[alias['alias']].append(alias['index'])

    mappings = Index('_all', using=client).get_mapping(doc_type='variant')

    latest_loaded_samples = get_latest_loaded_samples()
    prefetch_related_objects(latest_loaded_samples, 'individual__family__project')
    seqr_index_projects = defaultdict(lambda: defaultdict(set))
    es_projects = set()
    for sample in latest_loaded_samples:
        for index_name in sample.elasticsearch_index.split(','):
            project = sample.individual.family.project
            es_projects.add(project)
            if index_name in aliases:
                for aliased_index_name in aliases[index_name]:
                    seqr_index_projects[aliased_index_name][project].add(sample.individual.guid)
            else:
                seqr_index_projects[index_name.rstrip('*')][project].add(sample.individual.guid)

    for index in indices:
        index_name = index['index']
        index_mapping = mappings[index_name]['mappings']['variant']
        index.update(index_mapping.get('_meta', {}))
        index['hasNestedGenotypes'] = 'samples_num_alt_1' in index_mapping['properties']

        projects_for_index = []
        for index_prefix in list(seqr_index_projects.keys()):  # copy the keys since entries are popped while iterating
            if index_name.startswith(index_prefix):
                projects_for_index += seqr_index_projects.pop(index_prefix).keys()
        index['projects'] = [{'projectGuid': project.guid, 'projectName': project.name} for project in projects_for_index]

    errors = ['{} does not exist and is used by project(s) {}'.format(
        index, ', '.join(['{} ({} samples)'.format(p.name, len(indivs)) for p, indivs in project_individuals.items()])
    ) for index, project_individuals in seqr_index_projects.items() if project_individuals]

    # TODO remove once all projects are switched off of mongo
    all_mongo_samples = Sample.objects.filter(
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        sample_status=Sample.SAMPLE_STATUS_LOADED,
        elasticsearch_index__isnull=True,
    ).exclude(individual__family__project__in=es_projects).prefetch_related('individual', 'individual__family__project')
    mongo_sample_individual_max_loaded_date = {
        agg['individual__guid']: agg['max_loaded_date']
        for agg in all_mongo_samples.values('individual__guid').annotate(max_loaded_date=Max('loaded_date'))
    }
    mongo_project_samples = defaultdict(set)
    for s in all_mongo_samples:
        if s.loaded_date == mongo_sample_individual_max_loaded_date[s.individual.guid]:
            mongo_project_samples[s.individual.family.project].add(s.dataset_file_path)
    mongo_projects = [{'projectGuid': project.guid, 'projectName': project.name, 'sourceFilePaths': sample_file_paths}
                      for project, sample_file_paths in mongo_project_samples.items()]

    return create_json_response({
        'indices': indices,
        'diskStats': disk_status,
        'elasticsearchHost': ELASTICSEARCH_SERVER,
        'mongoProjects': mongo_projects,
        'errors': errors,
    })

def elasticsearch_status(request):
    client = get_es_client()

    disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent']
    disk_status = [{
        _to_camel_case(field.replace('.', '_')): disk[field] for field in disk_fields
    } for disk in client.cat.allocation(format="json", h=','.join(disk_fields))]

    index_fields = ['index', 'docs.count', 'store.size', 'creation.date.string']
    indices = [{
        _to_camel_case(field.replace('.', '_')): index[field] for field in index_fields
    } for index in client.cat.indices(format="json", h=','.join(index_fields))
        if all(not index['index'].startswith(omit_prefix) for omit_prefix in ['.', 'index_operations_log'])]

    aliases = defaultdict(list)
    for alias in client.cat.aliases(format="json", h='alias,index'):
        aliases[alias['alias']].append(alias['index'])

    mappings = Index('_all', using=client).get_mapping(doc_type='variant')

    active_samples = Sample.objects.filter(
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        is_active=True,
        elasticsearch_index__isnull=False,
    ).prefetch_related('individual', 'individual__family')
    prefetch_related_objects(active_samples, 'individual__family__project')

    seqr_index_projects = defaultdict(lambda: defaultdict(set))
    es_projects = set()
    for sample in active_samples:
        for index_name in sample.elasticsearch_index.split(','):
            project = sample.individual.family.project
            es_projects.add(project)
            if index_name in aliases:
                for aliased_index_name in aliases[index_name]:
                    seqr_index_projects[aliased_index_name][project].add(sample.individual.guid)
            else:
                seqr_index_projects[index_name.rstrip('*')][project].add(sample.individual.guid)

    for index in indices:
        index_name = index['index']
        index_mapping = mappings[index_name]['mappings']['variant']
        index.update(index_mapping.get('_meta', {}))

        projects_for_index = []
        for index_prefix in list(seqr_index_projects.keys()):  # copy the keys since entries are popped while iterating
            if index_name.startswith(index_prefix):
                projects_for_index += seqr_index_projects.pop(index_prefix).keys()
        index['projects'] = [{'projectGuid': project.guid, 'projectName': project.name} for project in projects_for_index]

    errors = ['{} does not exist and is used by project(s) {}'.format(
        index, ', '.join(['{} ({} samples)'.format(p.name, len(indivs)) for p, indivs in project_individuals.items()])
    ) for index, project_individuals in seqr_index_projects.items() if project_individuals]

    return create_json_response({
        'indices': indices,
        'diskStats': disk_status,
        'elasticsearchHost': ELASTICSEARCH_SERVER,
        'errors': errors,
    })

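# _to_camel_case is imported from seqr's string utilities; a minimal sketch
# of the expected behavior, assuming a plain snake_case -> camelCase
# conversion (e.g. 'disk_avail' -> 'diskAvail', 'creation_date_string' -> 'creationDateString'):
def _to_camel_case_sketch(snake_case_str):
    # hypothetical stand-in, not the actual seqr helper
    parts = snake_case_str.split('_')
    return parts[0] + ''.join(part.title() for part in parts[1:])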