def _get_elasticsearch_index_samples(elasticsearch_index, project):
    sample_field_suffix = '_num_alt'

    es_client = get_es_client(timeout=30)
    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix), 'join_field'], doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    # Nested genotypes
    if field_mapping.get(elasticsearch_index, {}).get('mappings', {}).get(VARIANT_DOC_TYPE, {}).get('join_field'):
        max_samples = Individual.objects.filter(family__project=project).count()
        s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket('sample_ids', elasticsearch_dsl.A('terms', field='sample_id', size=max_samples))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    samples = set()
    for index in field_mapping.values():
        samples.update([
            key.split(sample_field_suffix)[0]
            for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()
        ])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))
    return samples
def check_data(request):
    if model.objects.all().count() < 3000:
        return HttpResponse("Too few monumenten data in the database",
                            content_type="text/plain", status=500)

    # check elastic
    try:
        client = elasticsearch.Elasticsearch(settings.ELASTIC_SEARCH_HOSTS)
        x = elasticsearch_dsl.Search().using(client).index(
            settings.ELASTIC_INDICES['MONUMENTEN']).query("match_all").execute()
        assert x.hits.total > 3000
    except (elasticsearch.TransportError, AssertionError):
        # also catch the failed assert so the check returns a 500 instead of raising
        log.exception("Too few monumenten data in ES database")
        return HttpResponse("Autocomplete failed", content_type="text/plain", status=500)

    return HttpResponse("Data OK", content_type='text/plain', status=200)
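# Hedged note, not from the original project: with elasticsearch / elasticsearch-dsl 7.x+
# `hits.total` becomes an object and totals above 10k are not tracked by default, so an
# equivalent check would read the `.value` attribute and request exact totals. A minimal
# sketch under that assumption (names are illustrative):
import elasticsearch
import elasticsearch_dsl


def check_monument_count_es7(hosts, index_name, minimum=3000):
    client = elasticsearch.Elasticsearch(hosts)
    resp = (elasticsearch_dsl.Search(using=client)
            .index(index_name)
            .query("match_all")
            .extra(track_total_hits=True)  # ES 7+ otherwise caps the reported total at 10000
            .execute())
    return resp.hits.total.value >= minimum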
def _run(self, thing):
    github_client = self.bot.clients.github_client
    elastic_client = self.bot.clients.elastic_client
    replier = functools.partial(self.message.reply_text,
                                threaded=True, prefixed=False)
    replier("Initiating scan for `%s`." % thing)
    to_send = {}
    for index, query_tpl in self.index_and_query:
        query = query_tpl % {'thing': thing}
        replier("Scanning index `%s` using query `%s`." % (index, query))
        s = (e_dsl.Search(using=elastic_client)
             .query("query_string", query=query)
             .sort("-@timestamp")
             .index(index))
        s_buf = six.StringIO()
        for i, h in enumerate(s.scan()):
            h_header = "Hit %s" % (i + 1)
            h_header_delim = "-" * len(h_header)
            h_header += "\n"
            h_header += h_header_delim
            h_header += "\n"
            s_buf.write(h_header)
            s_buf.write(_format_hit(h))
            s_buf.write("\n")
        # Github has upper limit on postings to 1MB
        s_buf = self._chop(s_buf, units.Mi)
        if s_buf:
            # Because github...
            s_buf_name = re.sub(r"\.|\-|\*|_", "", index)
            s_buf_name = s_buf_name + ".txt"
            to_send[s_buf_name] = ghe.InputFileContent(s_buf)
    if not to_send:
        replier("No scan results found.")
    else:
        replier("Uploading %s scan results to gist." % len(to_send))
        me = github_client.get_user()
        gist = me.create_gist(True, to_send)
        replier("Gist url at: %s" % gist.html_url)
def reindex_test_dataset(query: dict = None, from_index: Optional[str] = None, hex_size=20) -> str:
    """
    Reindexes the master test dataset into isolated pieces.

    :param from_index: Index from which to reindex.
    :param query: Query you want to limit the reindex to.
    :param hex_size: How many random characters should there be in the new index's name.
    :return: Name of the newly generated index.
    """
    from texta_elastic.core import ElasticCore
    from toolkit.test_settings import TEST_INDEX

    from_index = from_index if from_index else TEST_INDEX

    ec = ElasticCore()
    new_test_index_name = f"ttk_test_{uuid.uuid4().hex[:hex_size]}"
    ec.create_index(index=new_test_index_name)
    ec.add_texta_facts_mapping(new_test_index_name)

    from_scan = elasticsearch_dsl.Search() if query is None else elasticsearch_dsl.Search.from_dict(query)
    from_scan = from_scan.index(from_index).using(ec.es)
    from_scan = from_scan.scan()

    def doc_actions(generator):
        for document in generator:
            yield {
                "_index": new_test_index_name,
                "_type": "_doc",
                "_source": document.to_dict(),
                "retry_on_conflict": 3
            }

    actions = doc_actions(from_scan)
    from elasticsearch.helpers import bulk
    bulk(actions=actions, client=ec.es, refresh="wait_for")
    return new_test_index_name
def _get_elasticsearch_index_samples(elasticsearch_index):
    es_client = get_es_client()

    # Nested genotypes
    if is_nested_genotype_index(elasticsearch_index):
        s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket('sample_ids', elasticsearch_dsl.A('terms', field='samples_num_alt_1', size=10000))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    sample_field_suffix = '_num_alt'

    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index), using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix)], doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index in field_mapping.values():
        samples.update([
            key.split(sample_field_suffix)[0]
            for key in index.get('mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()
        ])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(elasticsearch_index))
    return samples
def post(self, request, project_pk: int):
    try:
        serializer = ExportSearcherResultsSerializer(data=request.data)
        model = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, model)
        serializer.is_valid(raise_exception=True)

        # Use the query as a hash to avoid creating duplicate files.
        query = serializer.validated_data["query"]
        query_str = json.dumps(query, sort_keys=True, ensure_ascii=False)

        indices = model.get_available_or_all_project_indices(serializer.validated_data["indices"])
        indices = ",".join(indices)
        fields = serializer.validated_data["fields"]

        original_query = elasticsearch_dsl.Search().from_dict(query).source(fields)
        with_es = original_query.using(ElasticCore().es)
        index_limitation = with_es.index(indices)
        limit_by_n_docs = index_limitation.extra(size=10000)

        path = pathlib.Path(RELATIVE_PROJECT_DATA_PATH) / str(project_pk) / SEARCHER_FOLDER_KEY
        path.mkdir(parents=True, exist_ok=True)
        file_name = f"{hash_string(query_str)}.jl"

        with open(path / file_name, "w+", encoding="utf8") as fp:
            for item in limit_by_n_docs.scan():
                item = item.to_dict()
                json_string = json.dumps(item, ensure_ascii=False)
                fp.write(f"{json_string}\n")

        url = reverse("protected_serve", kwargs={"project_id": int(project_pk),
                                                 "application": SEARCHER_FOLDER_KEY,
                                                 "file_name": file_name})
        return Response(request.build_absolute_uri(url))

    except elasticsearch.exceptions.RequestError:
        return Response({"detail": "Could not parse the query you sent!"}, status=status.HTTP_400_BAD_REQUEST)
def main():
    db = {
        'es_hosts': ["es_host"],
        'dbname': 'logstash_index_name'
    }
    client = es.Elasticsearch(db['es_hosts'])
    rts = 0
    ts = 0
    with open('missing_ids.txt', 'r') as f:
        for line in f:
            s = dsl.Search(using=client, index=db['dbname'])
            s = s.query('match', id_str=line)
            for hit in s.execute():
                if 'retweeted_status' in hit:
                    rts += 1
                else:
                    ts += 1
            print('TWEETS: ' + str(ts) + ' RTs: ' + str(rts), end='\r')
    print('TWEETS   : ', ts)
    print('RETWEETS : ', rts)
import os
import sys

if '__file__' in vars():
    project_path = os.path.abspath(
        os.path.join(__file__, os.path.pardir, os.path.pardir, os.path.pardir))
    print('\n Adding path: ', project_path)
    sys.path.append(project_path)

# Own code
from config import *

# Connection to Elasticsearch
con = es.Elasticsearch('localhost')
search_content = es_dsl.Search(using=con, index='netgear')
max_count = search_content.count()
res_content = search_content[0:max_count].execute()

# Content of netgear
res_filtered = [x['_source'].to_dict() for x in res_content['hits']['hits']]
A = pd.DataFrame.from_dict(res_filtered)

# Connection to MySQL
connection = pymysql.connect(host='localhost',
                             user=MYSQL_USER,
                             password=os.environ['seb_mysql_key'],
                             db=DB_NAME_NETGEAR)
cursor = connection.cursor()
cursor.execute('describe netgear')
from elasticsearch.helpers import bulk
from datetime import date, timedelta

batchFileName = 'UpdateWeather.json'
es_host = 'elastic00:9200'
srchDate = date(2018, 2, 5)
oneDay = timedelta(days=1)
srchDate1 = srchDate + oneDay
serial = 0
numDay = srchDate.year * 10000 + srchDate.month * 100 + srchDate.day
numDay = numDay * 1000000

esconn = dsl.connections.create_connection(hosts=es_host, timeout=5)
batchFile = open(batchFileName, 'w')

s = dsl.Search(index='weather-*').query("range", time={"gte": srchDate, "lt": srchDate1})\
    .sort('time')[0:25000]

batchList = []
for h in s.execute().hits:
    # print(h.meta.id)
    serial += 1
    newTsa = numDay + serial
    item = {
        '_index': h.meta.index,
        '_op_type': 'update',
        '_type': 'doc',
        '_id': h.meta.id,
        'doc': {
            'tsa': newTsa
        }
    }
    batchList.append(item)
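# The snippet above stops after building the per-hit update actions; this is a hedged
# sketch (not part of the original script) of how such a batch is typically flushed with
# the `bulk` helper imported at the top, and optionally dumped to the batch file:
import json

if batchList:
    success, errors = bulk(esconn, batchList, raise_on_error=False)
    print('updated:', success, 'errors:', errors)

for action in batchList:
    batchFile.write(json.dumps(action, default=str) + '\n')  # keep a replayable JSON copy
batchFile.close()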
def search_with_query(self, query: EsQuery): logger.info("search_with_query called with query={}".format(query)) if query.split_results: ms = es_dsl.MultiSearch(using=self.es) for resource in query.resources: s = es_dsl.Search(index=resource) if query.query is not None: s = s.query(query.query) s = s[query.from_ : query.from_ + query.size] if query.sort: s = s.sort(*self.translate_sort_fields([resource], query.sort)) elif resource in query.sort_dict: s = s.sort( *self.translate_sort_fields( [resource], query.sort_dict[resource] ) ) ms = ms.add(s) responses = ms.execute() result = {"total": 0, "hits": {}} for i, response in enumerate(responses): result["hits"][query.resources[i]] = self._format_result( query.resources, response ).get("hits", []) result["total"] += response.hits.total if query.lexicon_stats: if "distribution" not in result: result["distribution"] = {} result["distribution"][query.resources[i]] = response.hits.total return result else: s = es_dsl.Search(using=self.es, index=query.resource_str) if query.query is not None: s = s.query(query.query) # s = s[query.from_ : query.from_ + query.size] if query.lexicon_stats: s.aggs.bucket( "distribution", "terms", field="_index", size=len(query.resources) ) if query.sort: s = s.sort(*self.translate_sort_fields(query.resources, query.sort)) elif query.sort_dict: sort_fields = [] for resource, sort in query.sort_dict.items(): sort_fields.extend(self.translate_sort_fields([resource], sort)) s = s.sort(*sort_fields) logger.debug("s = {}".format(s.to_dict())) response = self.execute_query(s, from_=query.from_, size=query.size) # TODO format response in a better way, because the whole response takes up too much space in the logs # logger.debug('response = {}'.format(response.to_dict())) # print(f"{response=}") result = self._format_result_dict(query.resources, response) if query.lexicon_stats: if "aggregations" not in response: response = self.execute_query(s, from_=0, size=0) result["distribution"] = {} for bucket in response["aggregations"]["distribution"]["buckets"]: key = bucket["key"] value = bucket["doc_count"] result["distribution"][key.rsplit("_", 1)[0]] = value return result
def elastic_search(request, resourcetype='base'):
    parameters = request.GET
    es = Elasticsearch(settings.ES_URL)

    # exclude the profile and group indexes.
    # They aren't being used, and cause issues with faceting
    # (list() so .remove() works on Python 3, where get_alias() returns a dict view)
    indices = list(es.indices.get_alias("*").keys())
    exclude_indexes = ['profile-index', 'group-index']
    [indices.remove(i) for i in exclude_indexes if i in indices]

    search = elasticsearch_dsl.Search(using=es, index=indices)
    search = get_base_query(search)
    search = apply_base_filter(request, search)

    # Add facets to search
    for fn in get_facet_fields():
        search.aggs.bucket(
            fn,
            'terms',
            field=fn,
            order={"_count": "desc"},
            size=parameters.get("nfacets", 15)
        )

    # run search only filtered by what a particular user is able to see
    # this makes sure to get every item that is possible in the facets
    # in order for a UI to build the choices
    overall_results = search[0:0].execute()
    facet_results = get_facet_results(overall_results.aggregations, parameters)

    search = filter_by_resource_type(search, resourcetype)
    search = get_main_query(search, parameters.get('q', None))

    # Add the facet queries to the main search
    for fq in get_facet_filter(parameters):
        search = search.query(fq)

    # Add in has_time filter if set
    if parameters.get("has_time", False):
        search = search.query(Q({'match': {'has_time': True}}))

    search = add_bbox_search(search, parameters.get("extent", None))
    search = add_temporal_search(search, parameters)
    search = apply_sort(search, parameters.get("order_by", "relevance"))

    limit = int(parameters.get('limit', settings.API_LIMIT_PER_PAGE))
    offset = int(parameters.get('offset', '0'))

    # Run the search using the offset and limit
    search = search[offset:offset + limit]
    results = search.execute()

    logger.debug('search: {}, results: {}'.format(search, results))

    filtered_facet_results = filter_results_by_facets(
        results.aggregations,
        facet_results
    )

    # Get results
    objects = get_unified_search_result_objects(results.hits.hits)

    object_list = {
        "meta": {
            "limit": limit,
            "next": None,
            "offset": offset,
            "previous": None,
            "total_count": results.hits.total,
            "facets": filtered_facet_results,
        },
        "objects": objects,
    }

    return JsonResponse(object_list)
"""Simple script to dump some database raw data into a file for use in tests.""" import json from typing import Any, Dict, List from elasticsearch import Elasticsearch import elasticsearch_dsl as es from ingress.utils import setup_mappings setup_mappings('tweets-brexit-remain-leave', 'localhost:9200') client = Elasticsearch() search = es.Search(using=client) geotagged_records = search.query('match', geotagged=True)[0:10].execute() untagged_records = search.query('match', geotagged=False)[0:10].execute() combined_records: List[Dict[str, Any]] = [] combined_records.extend(geotagged_records.hits) combined_records.extend(untagged_records.hits) geotagged_data = [] for hit in combined_records: record = {} for attr in dir(hit): attr_data = getattr(hit, attr) if isinstance(attr_data, es.AttrDict): record[attr] = attr_data.to_dict()
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
##
## Authors:
##   Jesus M. Gonzalez-Barahona <*****@*****.**>
##
## Modified by:
##   David Arroyo Menéndez <*****@*****.**>

import elasticsearch
import elasticsearch_dsl
from pprint import pprint

# ElasticSearch instance (url)
es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Build a DSL Search object on the 'dam-index' index, 'item' document type
request = elasticsearch_dsl.Search(using=es, index='dam-index', doc_type='item')

# Run the Search, using the scan interface to get all results
response = request.scan()

for r in response:
    pprint(r)
def search(es_client, request):
    return elasticsearch_dsl.Search(using=es_client.conn, index=es_client.index)
def cdr_images_after(es_instance, index, image_types, crawled_after=None, inserted_after=None, agg_img_types=False, domain='weapons'): """ Return query and return an iterator over ES entries. Results yielded in ascending CDR insertion order (i.e. FIFO). This should cause slicing to be stable. :param es_instance: elasticsearch.Elasticsearch instance. :type es_instance: :param index: ElasticSearch index to draw from. :type index: str :param image_types: List of image content type suffixes (e.g. ['png', 'jpeg']) :type image_types: :param crawled_after: Optional timestamp to constrain query elements to only those collected/crawled after this time. :type crawled_after: datetime.datetime :param inserted_after: Optional timestamp to constrain query elements to only those inserted into the ES instance/index after this time. :param inserted_after: datetime.datetime :param agg_img_types: If we should add an aggregation on image types to the query (prevents scanning). :param domain: The _type to filter by. "weapons" by default. :return: :rtype: """ log = logging.getLogger(__name__) log.info("Forming ES CDR image query for types: %s", image_types) base_search = elasticsearch_dsl.Search()\ .using(es_instance)\ .index(index)\ .fields(['_id', '_timestamp', '_type', 'content_type', 'obj_original_url', 'obj_stored_url', 'timestamp', 'version', ]) if domain: base_search = base_search.doc_type(domain) # I think `_type` filter is redundant with `doc_type` specification above if elasticsearch_dsl.VERSION[0] == 2: # ES 2.x version f = Q('term', version='2.0') \ & Q('term', content_type='image') if image_types: f &= Q('terms', content_type=image_types) if domain: log.debug("Constraining _type: %s", domain) f &= Q('term', _type=domain) if crawled_after: log.debug("Constraining to entries crawled after: %s", crawled_after) f &= Q('range', timestamp={'gt': crawled_after}) if inserted_after: log.debug("Constraining to entries inserted after: %s", inserted_after) f &= Q('range', _timestamp={'gt': inserted_after}) else: # ES 1.x version from elasticsearch_dsl.filter import F f = F('term', version='2.0') \ & F('term', content_type='image') if image_types: f &= F('terms', content_type=image_types) if domain: log.debug("Constraining _type: %s", domain) f &= F('term', _type=domain) if crawled_after: log.debug("Constraining to entries crawled after: %s", crawled_after) f &= F('range', timestamp={'gt': crawled_after}) if inserted_after: log.debug("Constraining to entries inserted after: %s", inserted_after) f &= F('range', _timestamp={'gt': inserted_after}) q = base_search\ .filter(f)\ .sort({'_timestamp': {"order": "asc"}}) if agg_img_types: log.debug("Aggregating image content type information") q.aggs.bucket('per_type', 'terms', field='content_type') return q
    '--timeout',
    type=int,
    help='data validity timeout (in minutes)',
    default=5)
parser.add_argument('-E',
                    '--elastichost',
                    type=str,
                    help='connection URL of ElasticSearch server',
                    default="localhost:9200")
args = parser.parse_args()

escnx = elasticsearch_dsl.connections.create_connection(
    hosts=[args.elastichost], timeout=20)

# construct an Elasticsearch DSL Search() object, using Q() shortcuts to build the query
request = elasticsearch_dsl.Search(using=escnx, index="metricbeat-*", doc_type='_doc')
request = request.query(
    'bool',
    must=[
        'match_all',
        # as the Q shortcut doesn't support nested keywords (e.g. 'agent.type'),
        # we must provide keywords as a kwargs dict
        Q('match', **{'agent.type': 'metricbeat'}),
        Q('match', **{'host.name': args.hostname}),
        Q('exists', **{'field': 'windows.perfmon.system.processor_queue_length'})
    ])

# we'll output 'windows.perfmon.system.processor_queue_length' and '@timestamp' fields
request = request.source(
    ['@timestamp', 'windows.perfmon.system.processor_queue_length'])
def evaluate_score(student, client, num_resp: int = 25): """Takes a student, represented as a dictionary and an elasticsearch-py client and returns an elastic response See above student class for schema """ # Adjust weights here: base_score = 1.0 company_score = 1.0 rural_score = 2.0 tags_score = 3.0 underrep_score = 1.0 # Timezone weights are found in the timezone script query s = elasticsearch_dsl.Search(using=client, index="mentors_index").extra(explain=True) # Start by filtering the search by track s = s.filter("term", track=student["track"]) # And also by requireExtended if student["requireExtended"]: s = s.filter("term", okExtended="true") if not student["underrepresented"]: s = s.exclude("term", preferStudentUnderRep=2) # Adds one to all remaining entries in order to be sure that, in the worst case, # there are enough responses, even if they aren't a good fit base_value = Q("constant_score", filter=MatchAll(), boost=base_score) # Uses a fuzzy query to determine if a student is interested in the mentor's company, # then if so adds `weight` to the score company_q = None for company in student["interestCompanies"]: if company_q is None: company_q = Q( "function_score", query=Q("fuzzy", company=company), weight=company_score, boost_mode="replace", ) else: company_q = company_q | Q( "function_score", query=Q("fuzzy", company=company), weight=company_score, boost_mode="replace", ) if student["rural"]: # If background_rural matches on mentor and student, then add one to the score background_rural = Q("constant_score", filter=Q("term", backgroundRural=student["rural"]), boost=rural_score) else: background_rural = Q("constant_score", filter=MatchNone()) # Adds `weight` * the number of matching tags to score tags_matching = None num_interests = len(student["interestTags"]) for interest in student["interestTags"]: if tags_matching is None: tags_matching = Q( "function_score", query=Q("term", proj_tags=interest), weight=tags_score / num_interests, boost_mode="replace", ) else: tags_matching = tags_matching | Q( "function_score", query=Q("term", proj_tags=interest), weight=tags_score / num_interests, boost_mode="replace", ) combined_query = ( base_value | tags_matching | company_q | background_rural # | prefer_student_underrep ) # Decay the combined score based on the number of students who already voted for that combined_query = Q("function_score", query=combined_query, functions=SF("gauss", numStudentsSelected={ "origin": 0, "scale": 3, "offset": 3, "decay": 0.50 })) # Timezone - this one's a bit more complex. See comments in script for more details. # Multiplies it's value by the previous scores, allowing it to reduce, set to zero, and increase scores. # See below string for python implementation """ if mentor['okTimezoneDifference']: if 16 < student['timezone'] < 22: return True return false else: if abs(student['timezone'] - mentor['timezone']) < 3: return True return False """ s = s.query(combined_query)[0:num_resp] resp = s.execute() return resp
def update_histogram_bins(self, log, log_type): logger.debug("updating " + log_type + " histogram bins") try: search = es_dsl.Search( using=self.client, index=self.prefix + '_monitor_data') \ .filter('match', _type='fields') \ .filter('match', _id='intervals') \ .extra(size=1) intervals = search.execute()[0].to_dict() fields = [ '.'.join(path.split('.')[:-1]) for path in nested_paths(intervals[log_type]) if path.endswith('interval') ] for field in fields: cur_val = nested_get(log, field) if cur_val is None: break field_path = log_type + '.' + field intervals_field = nested_get(intervals, field_path) changed = False if intervals_field['interval'] is None: intervals_field['min'] = cur_val intervals_field['max'] = cur_val changed = True else: if cur_val < intervals_field['min']: intervals_field['min'] = cur_val changed = True elif cur_val > intervals_field['max']: intervals_field['max'] = cur_val changed = True if changed: if intervals_field['min'] == intervals_field['max']: intervals_field['interval'] = 1 else: intervals_field['interval'] = \ math.ceil((intervals_field['max'] - intervals_field['min']) / 20.0) for vis_id in intervals_field['vis_ids']: search_vis = es_dsl.Search( using=self.client, index='.kibana') \ .filter('match', _id=vis_id) \ .filter('match', _type='visualization') \ .extra(size=1) vis = search_vis.execute()[0] vis_state = json.loads(vis.visState) for agg in vis_state['aggs']: if agg['type'] == 'histogram' and \ agg['params']['field'] == field_path: agg['params']['interval'] = \ intervals_field['interval'] vis.visState = json.dumps(vis_state, sort_keys=True) vis_source = json.loads( vis.kibanaSavedObjectMeta.searchSourceJSON) filter_words = vis_source['query']['query_string'][ 'query'].split(' ') for i, word in enumerate(filter_words): if word.startswith(field_path + ':>='): filter_words[i] = field_path + ':>=' + \ str(intervals_field['min']) elif word.startswith(field_path + ':<='): filter_words[i] = field_path + ':<=' + \ str(intervals_field['max']) vis_source['query']['query_string']['query'] =\ ' '.join(filter_words) vis.kibanaSavedObjectMeta.searchSourceJSON = \ json.dumps(vis_source, sort_keys=True) self.client.index(index='.kibana', doc_type='visualization', id=vis_id, body=vis.to_dict()) nested_set(intervals, log_type + '.' + field, intervals_field) self.client.index(index=self.prefix + '_monitor_data', doc_type='fields', id='intervals', body=intervals) except Exception as e: logger.error(e)
import elasticsearch
import elasticsearch_dsl

client = elasticsearch.Elasticsearch(['localhost'])

s = elasticsearch_dsl.Search(using=client, index="salt-status_diskusage-v1") \
    .query('match', minion='minion1') \
    .source(['@timestamp', 'data./etc/hosts.available', 'data./etc/hosts.total']) \
    .sort("-@timestamp") \
    .extra(size=1)

response = s.execute().to_dict()
source = response['hits']['hits'][0]['_source']
data = source['data']
print('Raw /etc/hosts Disk data from returner:', data)

#############################
# Get Disk Usage
s = elasticsearch_dsl.Search(using=client, index="salt-disk_percent-v1") \
    .query('match', minion='minion2') \
    .source(['@timestamp', 'data./']) \
    .sort("-@timestamp") \
    .extra(size=1)

response = s.execute().to_dict()
source = response['hits']['hits'][0]['_source']
data = source['data']
print('/ Disk data from returner: ', data)
def get_all_paths() -> Set[str]:
    search = (elasticsearch_dsl.Search(
        using=get_session(),
        index=config.config['elasticsearch']['index'],
        doc_type=ES_DOC_TYPE).source(['path']))
    return set(h.path for h in search.scan())
def count_by_city_order_by_country(vk_elastic_db: es_client.VkDataDatabaseClient,
                                   size=10,
                                   is_need_other=True,
                                   is_need_print=False,
                                   is_need_plot=True,
                                   is_need_active=False,
                                   days_delta=20):
    country_aggs_name = "country_count"
    city_aggs_name = "city_count"
    title = "count by city"
    if is_need_active:
        title += " active"
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    s = s.filter("bool", must=[elasticsearch_dsl.Q("exists", field="country.title.keyword")])
    s = s.filter("bool", must=[elasticsearch_dsl.Q("exists", field="city.title.keyword")])
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("match", country__title__keyword="")])
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("match", city__title__keyword="")])
    a = elasticsearch_dsl.A('terms', field="country.title.keyword", size=size, collect_mode="breadth_first")
    a1 = elasticsearch_dsl.A('terms', field="city.title.keyword", size=size)
    s.aggs.bucket(country_aggs_name, a).bucket(city_aggs_name, a1)
    response = s.execute()

    data_dict = {}
    for country_hit in response.aggregations[country_aggs_name].buckets:
        x_axis = [hit.key for hit in country_hit[city_aggs_name].buckets]
        y_axis = [hit.doc_count for hit in country_hit[city_aggs_name].buckets]
        if is_need_other:
            x_axis.append("other")
            y_axis.append(country_hit[city_aggs_name].sum_other_doc_count)
        data_dict[country_hit.key] = {}
        data_dict[country_hit.key]["x_axis"] = x_axis
        data_dict[country_hit.key]["y_axis"] = y_axis

    for country in data_dict:
        x_axis = data_dict[country]["x_axis"]
        y_axis = data_dict[country]["y_axis"]
        cur_title = f"{title}\n{country}"
        figname = f"{title.replace(' ', '_')}_{country}"
        if is_need_print:
            print(cur_title)
            for i in range(len(x_axis)):
                print(f"{i + 1}\t{x_axis[i]} {y_axis[i]}")
        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(x_axis, y_axis)
            # plt.show()
            fig.savefig(f"{save_path}/{figname}.png", dpi=300, format='png', bbox_inches='tight')
            plt.close(fig)
def es_dsl_queryset():
    return elasticsearch_dsl.Search()
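# Usage sketch for the fixture above (assumed, for illustration only): the returned
# Search is unbound, so the caller attaches a client and index before executing.
import elasticsearch


def example_usage():
    client = elasticsearch.Elasticsearch(['localhost:9200'])
    qs = es_dsl_queryset()  # unbound Search() from the fixture above
    qs = qs.using(client).index('my-index').query('match', title='test')
    return qs.execute()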
def generator(self): """ main generator function for IDFile and IDFileConsume searching with an set of IDs can take quite long time better would be to reduce the set of documents to a pure idlist, this is quite fast over mget often, its needed to do it with a search, therefore both ways work """ missing = [] # an iterable containing missing ids while len(self.ids) > 0: if self.body: ms = elasticsearch_dsl.MultiSearch( using=self.es, index=self.index, doc_type=self.type_) # setting up MultiSearch this_iter_ids = self.ids[:self. chunksize] # an ID List per iteration, so we can check if all the IDs of this chunksize are found at the end. for _id in this_iter_ids: # add a search per ID ms = ms.add(elasticsearch_dsl.Search().source( excludes=self.source_excludes, includes=self.source_includes).from_dict( self.body).query("match", _id=_id)) responses = ms.execute() for response in responses: for hit in response: _id = hit.meta.to_dict()["id"] yield self.return_doc(hit) del self.ids[self.ids.index(_id)] del this_iter_ids[this_iter_ids.index(_id)] for _id in this_iter_ids: """ unfortunately MultiSearch doesn't throw an exception for non-Found-IDs, so we have manually check for missing ids so we again iterate over the helper_list with the IDs per chunk size (simply doing self.dis[:self.chunksize] would give us a new set) and we put all the IDs who are still in there in our missing list and delete them from self.ids and this_iter_ids """ missing.append(_id) del self.ids[self.ids.index(_id)] del this_iter_ids[this_iter_ids.index(_id)] else: try: s = elasticsearch_dsl.Document.mget( docs=self.ids[:self.chunksize], using=self.es, index=self.index, _source_excludes=self.source_excludes, _source_includes=self.source_includes, _source=self.source, missing='raise') except elasticsearch.exceptions.NotFoundError as e: for doc in e.info[ 'docs']: # we got some missing ids and harvest the missing ids from the Elasticsearch NotFoundError Exception missing.append(doc['_id']) del self.ids[self.ids.index(doc['_id'])] else: # only gets called if we don't run into an exception for hit in s: _id = hit.meta.to_dict()["id"] yield self.return_doc(hit) del self.ids[self.ids.index(_id)] if not self.ids: """ if we delete the last item from ids, ids turns to None and then the while(len(list())) would throw an exception, since None isn't an iterable """ self.ids = [] for item in self.write_file(missing): yield item
def download_templates(self): logger.info("getting Kibana objects with prefix " + self.prefix) logger.info("getting index patterns") index_dir = os.path.join(self.template_dir, 'index') try: try: os.mkdir(self.template_dir) os.mkdir(os.path.join(self.template_dir, 'index')) except OSError: pass search_index = es_dsl.Search(using=self.client, index='.kibana') \ .filter('prefix', _id=self.prefix) \ .filter('match', _type='index-pattern') \ .extra(size=10000) response_index = search_index.execute() for index in response_index: index.meta.id = index.meta.id \ .replace(self.prefix, '[template]') index.title = index.title.replace(self.prefix, '[template]') with open( os.path.join(index_dir, index.meta.id) + '.json', 'w') as f: f.write( json.dumps(index.to_dict(), indent=4, sort_keys=True)) f.write('\n') except Exception as e: logger.error(e) dash_dir = os.path.join(self.template_dir, 'dash') intervals = {} for name in self.dashboards: logger.info("getting " + name + " dashboard") vis_ids = [] try: try: os.mkdir(os.path.join(dash_dir)) except OSError: pass search_dash = es_dsl.Search( using=self.client, index='.kibana') \ .filter('match', _id=self.prefix + '-' + name) \ .filter('match', _type='dashboard') \ .extra(size=1) dash = search_dash.execute()[0] dash.meta.id = dash.meta.id \ .replace(self.prefix, '[template]') dash.title = dash.title.replace(self.prefix, '[template]') dash_panels = json.loads(dash.panelsJSON) for panel in dash_panels: vis_ids.append(panel['id']) panel['id'] = panel['id'].replace(self.prefix, '[template]') dash.panelsJSON = json.dumps(dash_panels, sort_keys=True) with open(os.path.join(dash_dir, dash.meta.id) + '.json', 'w') as f: f.write( json.dumps(dash.to_dict(), indent=4, sort_keys=True)) f.write('\n') except Exception as e: logger.error(e) logger.info("getting " + name + " visualizations") vis_dir = os.path.join(self.template_dir, 'vis') try: os.mkdir(vis_dir) except OSError: pass for vis_id in vis_ids: try: search_vis = es_dsl.Search( using=self.client, index='.kibana') \ .filter('match', _id=vis_id) \ .filter('match', _type='visualization') \ .extra(size=1) vis = search_vis.execute()[0] vis.meta.id = vis.meta.id \ .replace(self.prefix, '[template]') vis.title = vis.title \ .replace(self.prefix, '[template]') vis_state = json.loads(vis.visState) vis_state['title'] = vis['title'] if vis_state['type'] == 'markdown': vis_state['params']['markdown'] = "text goes here" else: vis_source = json.loads( vis.kibanaSavedObjectMeta.searchSourceJSON) vis_source['index'] = vis_source['index'].replace( self.prefix, '[template]') if vis_state['type'] == 'histogram': hist_aggs = [ agg for agg in vis_state['aggs'] if agg['type'] == 'histogram' ] for agg in hist_aggs: agg['params']['interval'] = 1e10 field_path = agg['params']['field'] filter_words = vis_source['query'][ 'query_string']['query'].split(' ') filter_found = False for i, word in enumerate(filter_words): if word.startswith(field_path + ':>='): filter_words[i] = \ field_path + ':>=0' filter_found = True elif word.startswith(field_path + ':<='): filter_words[i] = \ field_path + ':<=0' filter_found = True if not filter_found: if len(filter_words) > 0: filter_words += \ ['AND', field_path + ':>=0', 'AND', field_path + ':<=0'] else: filter_words = \ [field_path + ':>=0', 'AND', field_path + ':<=0'] vis_source['query']['query_string']['query'] =\ ' '.join(filter_words) vis_ids = nested_get(intervals, field_path + '.vis_ids') if vis_ids and vis.meta.id not in vis_ids: vis_ids.append(vis.meta.id) else: vis_ids = [vis.meta.id] 
hist_data = { 'interval': None, 'min': None, 'max': None, 'vis_ids': vis_ids } nested_set(intervals, agg['params']['field'], hist_data) elif vis_state['type'] == 'table': if vis.meta.id == '[template]-Category-summary': vis_state['params']['perPage'] = 0 aggs = [ agg for agg in vis_state['aggs'] if 'params' in agg and 'size' in agg['params'] ] for agg in aggs: agg['params']['size'] = 0 elif vis.meta.id == '[template]-Workflow-summary': vis_state['params']['perPage'] = 0 aggs = [ agg for agg in vis_state['aggs'] if 'params' in agg and 'size' in agg['params'] ] for agg in aggs: agg['params']['size'] = 0 vis.kibanaSavedObjectMeta.searchSourceJSON = \ json.dumps(vis_source, sort_keys=True) vis.visState = json.dumps(vis_state, sort_keys=True) with open( os.path.join(vis_dir, vis.meta.id) + '.json', 'w') as f: f.write( json.dumps(vis.to_dict(), indent=4, sort_keys=True)) f.write('\n') except Exception as e: logger.error(e) try: with open( os.path.join(self.template_dir, 'intervals') + '.json', 'w') as f: f.write(json.dumps(intervals, indent=4, sort_keys=True)) except Exception as e: logger.error(e)
def es_dsl_search(pyramid_request):
    return elasticsearch_dsl.Search(using=pyramid_request.es.conn,
                                    index=pyramid_request.es.index)
def elasticsearch_status(request): client = get_es_client() # get index snapshots response = requests.get("http://{0}:{1}/_snapshot/{2}/_all".format( settings.ELASTICSEARCH_SERVICE_HOSTNAME, settings.ELASTICSEARCH_PORT, "callsets")) snapshots = json.loads(response.content) index_snapshot_states = defaultdict(list) for snapshot in snapshots.get("snapshots", []): for index_name in snapshot.get("indices", []): index_snapshot_states[index_name].append(snapshot["state"]) # get indices indices = [] for index in client.cat.indices(format="json", h="*"): index_name = index['index'] # skip special indices if index_name in ['.kibana', 'index_operations_log']: continue index_json = {k.replace('.', '_'): v for k, v in index.items()} index_name = re.sub("_[0-9]{1,2}$", "", index_name) sample = Sample.objects.filter( elasticsearch_index=index_name).select_related( 'individual__family__project').first() if sample: project = sample.individual.family.project index_json['project_guid'] = project.guid index_json['project_id'] = project.deprecated_project_id index_json['dataset_type'] = sample.sample_type index_json['genome_version'] = project.genome_version index_json['dataset_file_path'] = sample.dataset_file_path if index_name in index_snapshot_states: index_json['snapshots'] = ", ".join( set(index_snapshot_states[index_name])) indices.append(index_json) # get operations log s = elasticsearch_dsl.Search(using=client, index=OPERATIONS_LOG) s = s.params(size=5000) operations = [doc.to_dict() for doc in s.execute().hits] #making a new list since dots in es client keys are confusing template disk_status = [] for disk in client.cat.allocation(format="json"): disk_json = {k.replace('.', '_'): v for k, v in disk.items()} disk_status.append({ 'node_name': disk_json['node'], 'disk_available': disk_json['disk_avail'], 'disk_used': disk_json['disk_used'], 'disk_percent_used': disk_json['disk_percent'], }) return render( request, "staff/elasticsearch_status.html", { 'indices': indices, 'operations': operations, 'disk_stats': disk_status, 'elasticsearch_host': settings.ELASTICSEARCH_SERVER, })
def get_elasticsearch_variants( self, project_id, family_id=None, variant_filter=None, genotype_filter=None, variant_id_filter=None, quality_filter=None, indivs_to_consider=None, include_all_consequences=False, user=None, max_results_limit=settings.VARIANT_QUERY_RESULTS_LIMIT, ): from xbrowse_server.base.models import Individual from xbrowse_server.mall import get_reference cache_key = "Variants___%s___%s___%s" % (project_id, family_id, json.dumps([ variant_filter.toJSON() if variant_filter else None, genotype_filter, quality_filter, variant_id_filter, indivs_to_consider, include_all_consequences, ])) cached_results = self._redis_client and self._redis_client.get( cache_key) if cached_results is not None: variant_results = json.loads(cached_results) return [ Variant.fromJSON(variant_json) for variant_json in variant_results ] if indivs_to_consider is None: if genotype_filter: indivs_to_consider = genotype_filter.keys() else: indivs_to_consider = [] if family_id is not None: family_individual_ids = [ i.indiv_id for i in Individual.objects.filter( family__family_id=family_id).only("indiv_id") ] else: family_individual_ids = [ i.indiv_id for i in Individual.objects.filter( family__project__project_id=project_id).only("indiv_id") ] from xbrowse_server.base.models import Project, Family from pyliftover.liftover import LiftOver query_json = self._make_db_query(genotype_filter, variant_filter) try: if self.liftover_grch38_to_grch37 is None: self.liftover_grch38_to_grch37 = LiftOver('hg38', 'hg19') if self.liftover_grch37_to_grch38 is None: self.liftover_grch37_to_grch38 = None # LiftOver('hg19', 'hg38') except Exception as e: logger.info( "WARNING: Unable to set up liftover. Is there a working internet connection? " + str(e)) if family_id is None: project = Project.objects.get(project_id=project_id) elasticsearch_index = project.get_elasticsearch_index() logger.info("Searching in project elasticsearch index: " + str(elasticsearch_index)) else: family = Family.objects.get(project__project_id=project_id, family_id=family_id) elasticsearch_index = family.get_elasticsearch_index() project = family.project logger.info("Searching in family elasticsearch index: " + str(elasticsearch_index)) if family_id is not None and len(family_individual_ids) > 0: # figure out which index to use # TODO add caching matching_indices = [] mapping = self._es_client.indices.get_mapping( str(elasticsearch_index) + "*") if family_individual_ids: indiv_id = _encode_name(family_individual_ids[0]) for index_name, index_mapping in mapping.items(): if indiv_id + "_num_alt" in index_mapping["mappings"][ "variant"]["properties"]: matching_indices.append(index_name) if not matching_indices: if not family_individual_ids: logger.error("no individuals found for family %s" % (family_id)) elif not mapping: logger.error( "no es mapping found for found with prefix %s" % (elasticsearch_index)) else: logger.error("%s not found in %s:\n%s" % (indiv_id, elasticsearch_index, pformat(index_mapping["mappings"]["variant"] ["properties"]))) else: logger.info("matching indices: " + str(elasticsearch_index)) elasticsearch_index = ",".join(matching_indices) s = elasticsearch_dsl.Search(using=self._es_client, index=str(elasticsearch_index) + "*") #",".join(indices)) if variant_id_filter is not None: variant_id_filter_term = None for variant_id in variant_id_filter: q_obj = Q('term', **{"variantId": variant_id}) if variant_id_filter_term is None: variant_id_filter_term = q_obj else: variant_id_filter_term |= q_obj s = s.filter(variant_id_filter_term) 
if indivs_to_consider: atleast_one_nonref_genotype_filter = None for sample_id in indivs_to_consider: encoded_sample_id = _encode_name(sample_id) q = Q('range', **{encoded_sample_id + "_num_alt": {'gte': 1}}) if atleast_one_nonref_genotype_filter is None: atleast_one_nonref_genotype_filter = q else: atleast_one_nonref_genotype_filter |= q s = s.filter(atleast_one_nonref_genotype_filter) if quality_filter is not None and indivs_to_consider: #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46 min_ab = quality_filter.get('min_ab') if min_ab is not None: min_ab /= 100.0 # convert to fraction min_gq = quality_filter.get('min_gq') vcf_filter = quality_filter.get('vcf_filter') for sample_id in indivs_to_consider: encoded_sample_id = _encode_name(sample_id) #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46 if min_ab: s = s.filter( ~Q('term', **{encoded_sample_id + "_num_alt": 1}) | Q('range', ** {encoded_sample_id + "_ab": { 'gte': min_ab }})) #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_ab": {'gte': min_ab}})) if min_gq: s = s.filter( 'range', **{encoded_sample_id + "_gq": { 'gte': min_gq }}) #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_gq": {'gte': min_gq}})) if vcf_filter is not None: s = s.filter(~Q('exists', field='filters')) #logger.info("### ADDED FILTER: " + str(~Q('exists', field='filters'))) # parse variant query annotation_groups_map = ANNOTATION_GROUPS_MAP_INTERNAL if user and user.is_staff else ANNOTATION_GROUPS_MAP for key, value in query_json.items(): if key == 'db_tags': so_annotations = query_json.get('db_tags', {}).get('$in', []) # handle clinvar filters selected_so_annotations_set = set(so_annotations) all_clinvar_filters_set = set( annotation_groups_map.get("clinvar", {}).get("children", [])) selected_clinvar_filters_set = all_clinvar_filters_set & selected_so_annotations_set all_hgmd_filters_set = set( annotation_groups_map.get("hgmd", {}).get("children", [])) selected_hgmd_filters_set = all_hgmd_filters_set & selected_so_annotations_set vep_consequences = list(selected_so_annotations_set - selected_clinvar_filters_set - selected_hgmd_filters_set) consequences_filter = Q( "terms", transcriptConsequenceTerms=vep_consequences) if selected_clinvar_filters_set: clinvar_clinical_significance_terms = set() for clinvar_filter in selected_clinvar_filters_set: # translate selected filters to the corresponding clinvar clinical consequence terms if clinvar_filter == "pathogenic": clinvar_clinical_significance_terms.update( ["Pathogenic", "Pathogenic/Likely_pathogenic"]) elif clinvar_filter == "likely_pathogenic": clinvar_clinical_significance_terms.update([ "Likely_pathogenic", "Pathogenic/Likely_pathogenic" ]) elif clinvar_filter == "benign": clinvar_clinical_significance_terms.update( ["Benign", "Benign/Likely_benign"]) elif clinvar_filter == "likely_benign": clinvar_clinical_significance_terms.update( ["Likely_benign", "Benign/Likely_benign"]) elif clinvar_filter == "vus_or_conflicting": clinvar_clinical_significance_terms.update([ "Conflicting_interpretations_of_pathogenicity", "Uncertain_significance", "not_provided", "other" ]) else: raise ValueError("Unexpected clinvar filter: " + str(clinvar_filter)) consequences_filter = consequences_filter | Q( "terms", clinvar_clinical_significance=list( clinvar_clinical_significance_terms)) if selected_hgmd_filters_set: hgmd_class = set() for hgmd_filter in selected_hgmd_filters_set: # translate selected filters to the corresponding hgmd clinical consequence terms if hgmd_filter == "disease_causing": 
hgmd_class.update(["DM"]) elif hgmd_filter == "likely_disease_causing": hgmd_class.update(["DM?"]) elif hgmd_filter == "hgmd_other": hgmd_class.update(["DP", "DFP", "FP", "FTV"]) else: raise ValueError("Unexpected hgmd filter: " + str(hgmd_filter)) consequences_filter = consequences_filter | Q( "terms", hgmd_class=list(hgmd_class)) if 'intergenic_variant' in vep_consequences: # for many intergenic variants VEP doesn't add any annotations, so if user selected 'intergenic_variant', also match variants where transcriptConsequenceTerms is emtpy consequences_filter = consequences_filter | ~Q( 'exists', field='transcriptConsequenceTerms') s = s.filter(consequences_filter) #logger.info("==> transcriptConsequenceTerms: %s" % str(vep_consequences)) if key.startswith("genotypes"): sample_id = ".".join(key.split(".")[1:-1]) encoded_sample_id = _encode_name(sample_id) genotype_filter = value #logger.info("==> genotype filter: " + str(genotype_filter)) if type(genotype_filter) == int or type( genotype_filter) == basestring: #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter})) s = s.filter( 'term', **{encoded_sample_id + "_num_alt": genotype_filter}) elif '$gte' in genotype_filter: genotype_filter = { k.replace("$", ""): v for k, v in genotype_filter.items() } s = s.filter( 'range', **{encoded_sample_id + "_num_alt": genotype_filter}) #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter})) elif "$in" in genotype_filter: num_alt_values = genotype_filter['$in'] q = Q( 'term', **{encoded_sample_id + "_num_alt": num_alt_values[0]}) #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_values[0]})) for num_alt_value in num_alt_values[1:]: q = q | Q( 'term', ** {encoded_sample_id + "_num_alt": num_alt_value}) #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_value})) s = s.filter(q) if key == "db_gene_ids": db_gene_ids = query_json.get('db_gene_ids', {}) exclude_genes = db_gene_ids.get('$nin', []) gene_ids = exclude_genes or db_gene_ids.get('$in', []) if exclude_genes: s = s.exclude("terms", geneIds=gene_ids) else: s = s.filter("terms", geneIds=gene_ids) #logger.info("==> %s %s" % ("exclude" if exclude_genes else "include", "geneIds: " + str(gene_ids))) if key == "$or" and type(value) == list: q_terms = None for region_filter in value: xpos_filters = region_filter.get("$and", {}) # for example: $or : [{'$and': [{'xpos': {'$gte': 12345}}, {'xpos': {'$lte': 54321}}]}] xpos_filters_dict = {} for xpos_filter in xpos_filters: xpos_filter_setting = xpos_filter[ "xpos"] # for example {'$gte': 12345} or {'$lte': 54321} xpos_filters_dict.update(xpos_filter_setting) xpos_filter_setting = { k.replace("$", ""): v for k, v in xpos_filters_dict.items() } q = Q('range', **{"xpos": xpos_filter_setting}) if q_terms is None: q_terms = q else: q_terms |= q if q_terms is not None: s = s.filter(q_terms) #logger.info("==> xpos range: " + str({"xpos": xpos_filter_setting})) af_key_map = { "db_freqs.AF": "AF", "db_freqs.1kg_wgs_phase3": "g1k_POPMAX_AF", "db_freqs.exac_v3": "exac_AF_POPMAX", "db_freqs.topmed": "topmed_AF", "db_freqs.gnomad_exomes": "gnomad_exomes_AF_POPMAX", "db_freqs.gnomad_genomes": "gnomad_genomes_AF_POPMAX", "db_freqs.gnomad-exomes2": "gnomad_exomes_AF_POPMAX", "db_freqs.gnomad-genomes2": "gnomad_genomes_AF_POPMAX", } if key in af_key_map: filter_key = af_key_map[key] af_filter_setting = { k.replace("$", ""): v for k, v in value.items() } s = s.filter( Q('range', **{filter_key: 
af_filter_setting}) | ~Q('exists', field=filter_key)) #logger.info("==> %s: %s" % (filter_key, af_filter_setting)) ac_key_map = { "db_acs.AF": "AC", "db_acs.1kg_wgs_phase3": "g1k_AC", "db_acs.exac_v3": "exac_AC", "db_acs.topmed": "topmed_AC", "db_acs.gnomad_exomes": "gnomad_exomes_AC", "db_acs.gnomad_genomes": "gnomad_genomes_AC", "db_acs.gnomad-exomes2": "gnomad_exomes_AC", "db_acs.gnomad-genomes2": "gnomad_genomes_AC", } if key in ac_key_map: filter_key = ac_key_map[key] ac_filter_setting = { k.replace("$", ""): v for k, v in value.items() } s = s.filter( Q('range', **{filter_key: ac_filter_setting}) | ~Q('exists', field=filter_key)) hemi_key_map = { "db_hemi.exac_v3": "exac_AC_Hemi", "db_hemi.gnomad_exomes": "gnomad_exomes_Hemi", "db_hemi.gnomad_genomes": "gnomad_genomes_Hemi", "db_hemi.gnomad-exomes2": "gnomad_exomes_Hemi", "db_hemi.gnomad-genomes2": "gnomad_genomes_Hemi", } if key in hemi_key_map: filter_key = hemi_key_map[key] hemi_filter_setting = { k.replace("$", ""): v for k, v in value.items() } s = s.filter( Q('range', **{filter_key: hemi_filter_setting}) | ~Q('exists', field=filter_key)) hom_key_map = { "db_hom.exac_v3": "exac_AC_Hom", "db_hom.gnomad_exomes": "gnomad_exomes_Hom", "db_hom.gnomad_genomes": "gnomad_genomes_Hom", "db_hom.gnomad-exomes2": "gnomad_exomes_Hom", "db_hom.gnomad-genomes2": "gnomad_genomes_Hom", } if key in hom_key_map: filter_key = hom_key_map[key] hom_filter_setting = { k.replace("$", ""): v for k, v in value.items() } s = s.filter( Q('range', **{filter_key: hom_filter_setting}) | ~Q('exists', field=filter_key)) #s = s.sort("xpos") #logger.info("=====") #logger.info("FULL QUERY OBJ: " + pformat(s.__dict__)) #logger.info("FILTERS: " + pformat(s.to_dict())) # https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan start = time.time() s = s.params(size=max_results_limit + 1) #if not include_all_consequences: # s = s.source(exclude=["sortedTranscriptConsequences"]) response = s.execute() logger.info("=====") logger.info("TOTAL: %s. Query took %s seconds" % (response.hits.total, time.time() - start)) if response.hits.total > max_results_limit + 1: raise Exception( "This search matched too many variants. Please set additional filters and try again." 
) #print(pformat(response.to_dict())) project = Project.objects.get(project_id=project_id) #gene_list_map = project.get_gene_list_map() reference = get_reference() #for i, hit in enumerate(response.hits): variant_results = [] for i, hit in enumerate(s.scan()): # preserve_order=True #logger.info("HIT %s: %s %s %s" % (i, hit["variantId"], hit["geneIds"], pformat(hit.__dict__))) #print("HIT %s: %s" % (i, pformat(hit.to_dict()))) filters = ",".join(hit["filters"] or []) if "filters" in hit else "" genotypes = {} all_num_alt = [] for individual_id in family_individual_ids: encoded_individual_id = _encode_name(individual_id) num_alt = int(hit["%s_num_alt" % encoded_individual_id]) if ( "%s_num_alt" % encoded_individual_id) in hit else -1 if num_alt is not None: all_num_alt.append(num_alt) alleles = [] if num_alt == 0: alleles = [hit["ref"], hit["ref"]] elif num_alt == 1: alleles = [hit["ref"], hit["alt"]] elif num_alt == 2: alleles = [hit["alt"], hit["alt"]] elif num_alt == -1 or num_alt == None: alleles = [] else: raise ValueError("Invalid num_alt: " + str(num_alt)) genotypes[individual_id] = { 'ab': hit["%s_ab" % encoded_individual_id] if ("%s_ab" % encoded_individual_id) in hit else None, 'alleles': map(str, alleles), 'extras': { 'ad': hit["%s_ab" % encoded_individual_id] if ("%s_ad" % encoded_individual_id) in hit else None, 'dp': hit["%s_dp" % encoded_individual_id] if ("%s_dp" % encoded_individual_id) in hit else None, #'pl': '', }, 'filter': filters or "pass", 'gq': hit["%s_gq" % encoded_individual_id] if ("%s_gq" % encoded_individual_id in hit and hit["%s_gq" % encoded_individual_id] is not None) else '', 'num_alt': num_alt, } if all([num_alt <= 0 for num_alt in all_num_alt]): #logger.info("Filtered out due to genotype: " + str(genotypes)) #print("Filtered all_num_alt <= 0 - Result %s: GRCh38: %s:%s, cadd: %s %s - %s" % (i, hit["contig"], hit["start"], hit["cadd_PHRED"] if "cadd_PHRED" in hit else "", hit["transcriptConsequenceTerms"], all_num_alt)) continue vep_annotation = json.loads( str(hit['sortedTranscriptConsequences']) ) if 'sortedTranscriptConsequences' in hit else None if project.genome_version == GENOME_VERSION_GRCh37: grch38_coord = None if self.liftover_grch37_to_grch38: grch38_coord = self.liftover_grch37_to_grch38.convert_coordinate( "chr%s" % hit["contig"].replace("chr", ""), int(hit["start"])) if grch38_coord and grch38_coord[0]: grch38_coord = "%s-%s-%s-%s " % ( grch38_coord[0][0], grch38_coord[0][1], hit["ref"], hit["alt"]) else: grch38_coord = None else: grch38_coord = hit["variantId"] if project.genome_version == GENOME_VERSION_GRCh38: grch37_coord = None if self.liftover_grch38_to_grch37: grch37_coord = self.liftover_grch38_to_grch37.convert_coordinate( "chr%s" % hit["contig"].replace("chr", ""), int(hit["start"])) if grch37_coord and grch37_coord[0]: grch37_coord = "%s-%s-%s-%s " % ( grch37_coord[0][0], grch37_coord[0][1], hit["ref"], hit["alt"]) else: grch37_coord = None else: grch37_coord = hit["variantId"] result = { #u'_id': ObjectId('596d2207ff66f729285ca588'), 'alt': str(hit["alt"]) if "alt" in hit else None, 'annotation': { 'fathmm': fathmm_map.get(hit["dbnsfp_FATHMM_pred"].split(';')[0]) if "dbnsfp_FATHMM_pred" in hit and hit["dbnsfp_FATHMM_pred"] else None, 'muttaster': muttaster_map.get( hit["dbnsfp_MutationTaster_pred"].split(';')[0]) if "dbnsfp_MutationTaster_pred" in hit and hit["dbnsfp_MutationTaster_pred"] else None, 'polyphen': polyphen_map.get( hit["dbnsfp_Polyphen2_HVAR_pred"].split(';')[0]) if "dbnsfp_Polyphen2_HVAR_pred" in hit and 
hit["dbnsfp_Polyphen2_HVAR_pred"] else None, 'sift': sift_map.get(hit["dbnsfp_SIFT_pred"].split(';')[0]) if "dbnsfp_SIFT_pred" in hit and hit["dbnsfp_SIFT_pred"] else None, 'GERP_RS': hit["dbnsfp_GERP_RS"] if "dbnsfp_GERP_RS" in hit else None, 'phastCons100way_vertebrate': hit["dbnsfp_phastCons100way_vertebrate"] if "dbnsfp_phastCons100way_vertebrate" in hit else None, 'cadd_phred': hit["cadd_PHRED"] if "cadd_PHRED" in hit else None, 'dann_score': hit["dbnsfp_DANN_score"] if "dbnsfp_DANN_score" in hit else None, 'revel_score': hit["dbnsfp_REVEL_score"] if "dbnsfp_REVEL_score" in hit else None, 'eigen_phred': hit["eigen_Eigen_phred"] if "eigen_Eigen_phred" in hit else (hit["dbnsfp_Eigen_phred"] if "dbnsfp_Eigen_phred" in hit else None), 'mpc_score': hit["mpc_MPC"] if "mpc_MPC" in hit else None, 'annotation_tags': list(hit["transcriptConsequenceTerms"] or []) if "transcriptConsequenceTerms" in hit else None, 'coding_gene_ids': list(hit['codingGeneIds'] or []), 'gene_ids': list(hit['geneIds'] or []), 'vep_annotation': vep_annotation, 'vep_group': str(hit['mainTranscript_major_consequence'] or ""), 'vep_consequence': str(hit['mainTranscript_major_consequence'] or ""), 'main_transcript': { k.replace('mainTranscript_', ''): hit[k] for k in dir(hit) if k.startswith('mainTranscript_') }, 'worst_vep_annotation_index': 0, 'worst_vep_index_per_gene': { str(hit['mainTranscript_gene_id']): 0 }, }, 'chr': hit["contig"], 'coding_gene_ids': list(hit['codingGeneIds'] or []), 'gene_ids': list(hit['geneIds'] or []), 'coverage': { 'gnomad_exome_coverage': float(hit["gnomad_exome_coverage"] or -1) if "gnomad_exome_coverage" in hit else -1, 'gnomad_genome_coverage': float(hit["gnomad_genome_coverage"] or -1) if "gnomad_genome_coverage" in hit else -1, }, 'pop_counts': { 'AC': int(hit['AC'] or 0) if 'AC' in hit else None, 'AN': int(hit['AN'] or 0) if 'AN' in hit else None, '1kg_AC': int(hit['g1k_AC'] or 0) if 'g1k_AC' in hit else None, '1kg_AN': int(hit['g1k_AN'] or 0) if 'g1k_AN' in hit else None, 'exac_v3_AC': int(hit["exac_AC_Adj"] or 0) if "exac_Adj_AC" in hit else None, 'exac_v3_Het': int(hit["exac_AC_Het"] or 0) if "exac_AC_Het" in hit else None, 'exac_v3_Hom': int(hit["exac_AC_Hom"] or 0) if "exac_AC_Hom" in hit else None, 'exac_v3_Hemi': int(hit["exac_AC_Hemi"] or 0) if "exac_AC_Hemi" in hit else None, 'gnomad_exomes_AC': int(hit["gnomad_exomes_AC"] or 0) if "gnomad_exomes_AC" in hit else None, 'gnomad_exomes_Hom': int(hit["gnomad_exomes_Hom"] or 0) if "gnomad_exomes_Hom" in hit else None, 'gnomad_exomes_Hemi': int(hit["gnomad_exomes_Hemi"] or 0) if "gnomad_exomes_Hemi" in hit else None, 'gnomad_exomes_AN': int(hit["gnomad_exomes_AN"] or 0) if "gnomad_exomes_AN" in hit else None, 'gnomad_genomes_AC': int(hit["gnomad_genomes_AC"] or 0) if "gnomad_genomes_AC" in hit else None, 'gnomad_genomes_Hom': int(hit["gnomad_genomes_Hom"] or 0) if "gnomad_genomes_Hom" in hit else None, 'gnomad_genomes_Hemi': int(hit["gnomad_genomes_Hemi"] or 0) if "gnomad_genomes_Hemi" in hit else None, 'gnomad_genomes_AN': int(hit["gnomad_genomes_AN"] or 0) if "gnomad_genomes_AN" in hit else None, 'topmed_AC': float(hit["topmed_AC"] or 0) if "topmed_AC" in hit else None, 'topmed_Het': float(hit["topmed_Het"] or 0) if "topmed_Het" in hit else None, 'topmed_Hom': float(hit["topmed_Hom"] or 0) if "topmed_Hom" in hit else None, 'topmed_AN': float(hit["topmed_AN"] or 0) if "topmed_AN" in hit else None, }, 'db_freqs': { 'AF': float(hit["AF"] or 0.0) if "AF" in hit else None, '1kg_wgs_AF': float(hit["g1k_AF"] or 0.0) if "g1k_AF" in hit 
else None, '1kg_wgs_popmax_AF': float(hit["g1k_POPMAX_AF"] or 0.0) if "g1k_POPMAX_AF" in hit else None, 'exac_v3_AF': float(hit["exac_AF"] or 0.0) if "exac_AF" in hit else (hit["exac_AC_Adj"] / float(hit["exac_AN_Adj"]) if "exac_AC_Adj" in hit and "exac_AN_Adj" in hit and int(hit["exac_AN_Adj"] or 0) > 0 else None), 'exac_v3_popmax_AF': float(hit["exac_AF_POPMAX"] or 0.0) if "exac_AF_POPMAX" in hit else None, 'gnomad_exomes_AF': float(hit["gnomad_exomes_AF"] or 0.0) if "gnomad_exomes_AF" in hit else None, 'gnomad_exomes_popmax_AF': float(hit["gnomad_exomes_AF_POPMAX"] or 0.0) if "gnomad_exomes_AF_POPMAX" in hit else None, 'gnomad_genomes_AF': float(hit["gnomad_genomes_AF"] or 0.0) if "gnomad_genomes_AF" in hit else None, 'gnomad_genomes_popmax_AF': float(hit["gnomad_genomes_AF_POPMAX"] or 0.0) if "gnomad_genomes_AF_POPMAX" in hit else None, 'topmed_AF': float(hit["topmed_AF"] or 0.0) if "topmed_AF" in hit else None, }, #'popmax_populations': { # 'exac_popmax': hit["exac_POPMAX"] or None, # 'gnomad_exomes_popmax': hit["gnomad_exomes_POPMAX"] or None, # 'gnomad_genomes_popmax': hit["gnomad_genomes_POPMAX"] or None, #}, 'db_gene_ids': list((hit["geneIds"] or []) if "geneIds" in hit else []), 'db_tags': str(hit["transcriptConsequenceTerms"] or "") if "transcriptConsequenceTerms" in hit else None, 'extras': { 'clinvar_variant_id': hit['clinvar_variation_id'] if 'clinvar_variation_id' in hit and hit['clinvar_variation_id'] else None, 'clinvar_allele_id': hit['clinvar_allele_id'] if 'clinvar_allele_id' in hit and hit['clinvar_allele_id'] else None, 'clinvar_clinsig': hit['clinvar_clinical_significance'].lower() if ('clinvar_clinical_significance' in hit) and hit['clinvar_clinical_significance'] else None, 'hgmd_class': hit['hgmd_class'] if 'hgmd_class' in hit and user and user.is_staff else None, 'hgmd_accession': hit['hgmd_accession'] if 'hgmd_accession' in hit else None, 'genome_version': project.genome_version, 'grch37_coords': grch37_coord, 'grch38_coords': grch38_coord, 'alt_allele_pos': 0, 'orig_alt_alleles': map(str, [a.split("-")[-1] for a in hit["originalAltAlleles"]]) if "originalAltAlleles" in hit else None }, 'genotypes': genotypes, 'pos': long(hit['start']), 'pos_end': str(hit['end']), 'ref': str(hit['ref']), 'vartype': 'snp' if len(hit['ref']) == len(hit['alt']) else "indel", 'vcf_id': None, 'xpos': long(hit["xpos"]), 'xposx': long(hit["xpos"]), } result["annotation"]["freqs"] = result["db_freqs"] result["annotation"]["pop_counts"] = result["pop_counts"] result["annotation"]["db"] = "elasticsearch" result["extras"][ "svlen"] = hit["SVLEN"] if "SVLEN" in hit else None result["extras"][ "svtype"] = hit["SVTYPE"] if "SVTYPE" in hit else None logger.info( "Result %s: GRCh37: %s GRCh38: %s:, cadd: %s %s - gene ids: %s, coding gene_ids: %s" % (i, grch37_coord, grch38_coord, hit["cadd_PHRED"] if "cadd_PHRED" in hit else "", hit["transcriptConsequenceTerms"], result["gene_ids"], result["coding_gene_ids"])) result["extras"]["project_id"] = project_id result["extras"]["family_id"] = family_id # add gene info gene_names = {} if vep_annotation is not None: gene_names = { vep_anno["gene_id"]: vep_anno.get("gene_symbol") for vep_anno in vep_annotation if vep_anno.get("gene_symbol") } result["extras"]["gene_names"] = gene_names try: genes = {} for gene_id in result["coding_gene_ids"]: if gene_id: genes[gene_id] = reference.get_gene_summary( gene_id) or {} if not genes: for gene_id in result["gene_ids"]: if gene_id: genes[gene_id] = reference.get_gene_summary( gene_id) or {} #if not genes: # genes 
= {vep_anno["gene_id"]: {"symbol": vep_anno["gene_symbol"]} for vep_anno in vep_annotation} result["extras"]["genes"] = genes except Exception as e: exc_type, exc_obj, exc_tb = sys.exc_info() logger.warn( "WARNING: got unexpected error in add_gene_names_to_variants: %s : line %s" % (e, exc_tb.tb_lineno)) variant_results.append(result) logger.info("Finished returning the %s variants: %s seconds" % (response.hits.total, time.time() - start)) if self._redis_client: self._redis_client.set(cache_key, json.dumps(variant_results)) return [ Variant.fromJSON(variant_json) for variant_json in variant_results ]
# es.indices.delete(raw_index_name)
# subprocess.run(['p2o.py', '--enrich', '--index', raw_index_name,
#                 '--index-enrich', enrich_index_name, '-e', 'http://localhost:9200/',
#                 '--no_inc', '--debug', 'github', 'grimoirelab', 'perceval',
#                 '-t', github_token, '--sleep-for-rate'])

response = es.search(index=enrich_index_name)
Number_of_commits = response['hits']['total']
print("Total Number of commits :- %s" % (Number_of_commits))

response = es.search(index=enrich_index_name, body={"size": Number_of_commits})
# for i in response['hits']['hits']:
#     pprint(i['_source'])

request = elasticsearch_dsl.Search(using=es, index=enrich_index_name)
request = request.source([
    'created_at', 'closed_at', 'time_open_days', 'time_to_close_days',
    'item_type', 'id_in_repo'
])
request = request.filter("terms", item_type=['issue'])
request = request.filter('range', created_at={'gte': 'now-6M'})
request = request.sort({'created_at': {'order': 'asc'}})
request = request[0:10000]
result = request.execute()
# pprint(request.to_dict())
# pprint(result.to_dict())
result = result.to_dict()

data = []
import elasticsearch
import elasticsearch_dsl

# get the last commits
es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Build a DSL search object on the `commits` index, `summary` document type
request = elasticsearch_dsl.Search(using=es, index='commits', doc_type='summary')
request = request.sort('-commit_date')
request = request.source(['hash', 'author_date', 'author'])
request = request[0:20]

# Run the search. Instead of `scan()` we use `execute()`, which allows for
# slicing and preserves the sort order.
response = request.execute()

for commit in response:
    print(commit.hash, commit.author_date, commit.author)
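# For contrast with execute() above, a hedged sketch of the scan() interface: it streams
# every matching document via the scroll API, but ignores slicing and does not preserve
# sort order. The same `commits`/`summary` index layout as above is assumed.
request_all = elasticsearch_dsl.Search(using=es, index='commits', doc_type='summary')
request_all = request_all.source(['hash', 'author_date', 'author'])

for commit in request_all.scan():  # generator over all hits, unsorted
    print(commit.hash, commit.author_date, commit.author)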
# Import config
with open('./config/config.yml', 'r') as ymlconfig:
    theconfig = yaml.load(ymlconfig)

# Setup timer for loop
starttime = time.time()

# Elastic, parameters should be passed in from config
eshttpauth = (theconfig['esusername'] + ':' + theconfig['espassword'])
es = elasticsearch.Elasticsearch(hosts=[theconfig['esinstance']],
                                 http_auth=eshttpauth,
                                 timeout=10,
                                 max_retries=3,
                                 retry_on_timeout=True)
s = elasticsearch_dsl.Search(using=es, index=theconfig['esindexsearch'])

# q = Q('bool', must=[Q('range', **{'@timestamp': {'gte': "now-" + theconfig['essearchwindow']}}) & ('match', clientid=theconfig['esclientid'])])
if theconfig['esclientid'] == '':
    s = s.filter(
        'range',
        **{'@timestamp': {
            'gte': "now-" + theconfig['essearchwindow']
        }})
else:
    s = s.filter(
        'range',
        **{'@timestamp': {
            'gte': "now-" + theconfig['essearchwindow']
        }}).filter('match', clientid=theconfig['esclientid'])
s = s[0:0]