Example #1
def _get_elasticsearch_index_samples(elasticsearch_index, project):
    sample_field_suffix = '_num_alt'

    es_client = get_es_client(timeout=30)
    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index),
                                    using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix), 'join_field'],
            doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    #  Nested genotypes
    if field_mapping.get(elasticsearch_index,
                         {}).get('mappings', {}).get(VARIANT_DOC_TYPE,
                                                     {}).get('join_field'):
        max_samples = Individual.objects.filter(
            family__project=project).count()
        s = elasticsearch_dsl.Search(using=es_client,
                                     index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket(
            'sample_ids',
            elasticsearch_dsl.A('terms', field='sample_id', size=max_samples))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    samples = set()
    for index in field_mapping.values():
        samples.update([
            key.split(sample_field_suffix)[0] for key in index.get(
                'mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()
        ])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(
            elasticsearch_index))
    return samples
Example #2
def check_data(request):

    if model.objects.all().count() < 3000:
        return HttpResponse("Too few monumenten data in the database",
                            content_type="text/plain",
                            status=500)

    # check elastic
    try:
        client = elasticsearch.Elasticsearch(settings.ELASTIC_SEARCH_HOSTS)
        x = elasticsearch_dsl.Search().using(client).index(
            settings.ELASTIC_INDICES['MONUMENTEN']).query(
                "match_all").execute()
        assert x.hits.total > 3000

    except elasticsearch.TransportError:
        log.exception("Too few monumenten data in ES database")
        return HttpResponse("Autocomplete failed",
                            content_type="text/plain",
                            status=500)

    return HttpResponse("Data OK", content_type='text/plain', status=200)
Example #3
 def _run(self, thing):
     github_client = self.bot.clients.github_client
     elastic_client = self.bot.clients.elastic_client
     replier = functools.partial(self.message.reply_text,
                                 threaded=True,
                                 prefixed=False)
     replier("Initiating scan for `%s`." % thing)
     to_send = {}
     for index, query_tpl in self.index_and_query:
         query = query_tpl % {'thing': thing}
         replier("Scanning index `%s` using query `%s`." % (index, query))
         s = (e_dsl.Search(using=elastic_client).query(
             "query_string", query=query).sort("-@timestamp").index(index))
         s_buf = six.StringIO()
         for i, h in enumerate(s.scan()):
             h_header = "Hit %s" % (i + 1)
             h_header_delim = "-" * len(h_header)
             h_header += "\n"
             h_header += h_header_delim
             h_header += "\n"
             s_buf.write(h_header)
             s_buf.write(_format_hit(h))
             s_buf.write("\n")
         # Github has an upper limit of 1MB on postings
         s_buf = self._chop(s_buf, units.Mi)
         if s_buf:
             # Because github...
             s_buf_name = re.sub(r"\.|\-|\*|_", "", index)
             s_buf_name = s_buf_name + ".txt"
             to_send[s_buf_name] = ghe.InputFileContent(s_buf)
     if not to_send:
         replier("No scan results found.")
     else:
         replier("Uploading %s scan results to gist." % len(to_send))
         me = github_client.get_user()
         gist = me.create_gist(True, to_send)
         replier("Gist url at: %s" % gist.html_url)
Example #4
def reindex_test_dataset(query: dict = None, from_index: Optional[str] = None, hex_size=20) -> str:
    """
    Reindexes the master test dataset into isolated pieces.
    :param from_index: Index from which to reindex.
    :param query: Query you want to limit the reindex to.
    :param hex_size: How many random characters should there be in the new indexes name.
    :return: Name of the newly generated index.
    """
    from texta_elastic.core import ElasticCore
    from toolkit.test_settings import TEST_INDEX

    from_index = from_index if from_index else TEST_INDEX

    ec = ElasticCore()
    new_test_index_name = f"ttk_test_{uuid.uuid4().hex[:hex_size]}"
    ec.create_index(index=new_test_index_name)
    ec.add_texta_facts_mapping(new_test_index_name)

    from_scan = elasticsearch_dsl.Search() if query is None else elasticsearch_dsl.Search.from_dict(query)
    from_scan = from_scan.index(from_index).using(ec.es)
    from_scan = from_scan.scan()


    def doc_actions(generator):
        for document in generator:
            yield {
                "_index": new_test_index_name,
                "_type": "_doc",
                "_source": document.to_dict(),
                "retry_on_conflict": 3
            }


    actions = doc_actions(from_scan)
    from elasticsearch.helpers import bulk
    bulk(actions=actions, client=ec.es, refresh="wait_for")
    return new_test_index_name
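A sketch of a possible call site; the query body, the run_tests_against placeholder, and the cleanup through the raw elasticsearch-py client are illustrative assumptions, not part of the helper.

from texta_elastic.core import ElasticCore

query = {"query": {"term": {"lang": "en"}}}   # illustrative filter
new_index = reindex_test_dataset(query=query, hex_size=12)
try:
    run_tests_against(new_index)              # hypothetical test body
finally:
    # Clean up the throwaway index through the underlying elasticsearch-py client.
    ElasticCore().es.indices.delete(index=new_index, ignore=[404])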
Example #5
def _get_elasticsearch_index_samples(elasticsearch_index):
    es_client = get_es_client()

    #  Nested genotypes
    if is_nested_genotype_index(elasticsearch_index):
        s = elasticsearch_dsl.Search(using=es_client,
                                     index=elasticsearch_index)
        s = s.params(size=0)
        s.aggs.bucket(
            'sample_ids',
            elasticsearch_dsl.A('terms', field='samples_num_alt_1',
                                size=10000))
        response = s.execute()
        return [agg['key'] for agg in response.aggregations.sample_ids.buckets]

    sample_field_suffix = '_num_alt'
    index = elasticsearch_dsl.Index('{}*'.format(elasticsearch_index),
                                    using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix)],
            doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index in field_mapping.values():
        samples.update([
            key.split(sample_field_suffix)[0] for key in index.get(
                'mappings', {}).get(VARIANT_DOC_TYPE, {}).keys()
        ])
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(
            elasticsearch_index))
    return samples
Example #6
    def post(self, request, project_pk: int):
        try:
            serializer = ExportSearcherResultsSerializer(data=request.data)
            model = get_object_or_404(Project, pk=project_pk)
            self.check_object_permissions(request, model)

            serializer.is_valid(raise_exception=True)

            # Use the query as a hash to avoid creating duplicate files.
            query = serializer.validated_data["query"]
            query_str = json.dumps(query, sort_keys=True, ensure_ascii=False)

            indices = model.get_available_or_all_project_indices(serializer.validated_data["indices"])
            indices = ",".join(indices)

            fields = serializer.validated_data["fields"]

            original_query = elasticsearch_dsl.Search().from_dict(query).source(fields)
            with_es = original_query.using(ElasticCore().es)
            index_limitation = with_es.index(indices)
            limit_by_n_docs = index_limitation.extra(size=10000)

            path = pathlib.Path(RELATIVE_PROJECT_DATA_PATH) / str(project_pk) / SEARCHER_FOLDER_KEY
            path.mkdir(parents=True, exist_ok=True)
            file_name = f"{hash_string(query_str)}.jl"

            with open(path / file_name, "w+", encoding="utf8") as fp:
                for item in limit_by_n_docs.scan():
                    item = item.to_dict()
                    json_string = json.dumps(item, ensure_ascii=False)
                    fp.write(f"{json_string}\n")

            url = reverse("protected_serve", kwargs={"project_id": int(project_pk), "application": SEARCHER_FOLDER_KEY, "file_name": file_name})
            return Response(request.build_absolute_uri(url))

        except elasticsearch.exceptions.RequestError:
            return Response({"detail": "Could not parse the query you sent!"}, status=status.HTTP_400_BAD_REQUEST)
Example #7
import elasticsearch as es
import elasticsearch_dsl as dsl


def main():

    db = {
        'es_hosts' : ["es_host"],
        'dbname' : 'logstash_index_name'
    }
    client = es.Elasticsearch(db['es_hosts'])

    rts = 0
    ts = 0
    with open('missing_ids.txt', 'r') as f:
        for line in f:
            s = dsl.Search(using=client, index=db['dbname'])
            s = s.query('match', id_str=line.strip())  # strip the trailing newline from each id
            for hit in s.execute():
                if 'retweeted_status' in hit:
                    rts += 1
                else:
                    ts += 1

                print('TWEETS: ' + str(ts) + ' RTs: ' + str(rts), end='\r')

    print('TWEETS   : ', ts )
    print('RETWEETS : ', rts)
Example #8
import os
import sys

import elasticsearch as es
import elasticsearch_dsl as es_dsl
import pandas as pd
import pymysql

if '__file__' in vars():
    project_path = os.path.abspath(
        os.path.join(__file__, os.path.pardir, os.path.pardir, os.path.pardir))
    print('\n Adding path: ', project_path)
    sys.path.append(project_path)

# Own code
from config import *

# Connection to Elasticsearch
con = es.Elasticsearch('localhost')
search_content = es_dsl.Search(using=con, index='netgear')
max_count = search_content.count()
res_content = search_content[0:max_count].execute()

# Content of netgear
res_filtered = [x['_source'].to_dict() for x in res_content['hits']['hits']]
A = pd.DataFrame.from_dict(res_filtered)

# Connection to MySQL
connection = pymysql.connect(host='localhost',
                             user=MYSQL_USER,
                             password=os.environ['seb_mysql_key'],
                             db=DB_NAME_NETGEAR)

cursor = connection.cursor()
cursor.execute('describe netgear')
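The content fetch above slices [0:max_count] in a single request, which runs into the default 10,000-document index.max_result_window on larger indices; a sketch of the same dump using scan(), which streams the hits instead (it assumes the same con connection and pandas import):

res_scan = [hit.to_dict() for hit in es_dsl.Search(using=con, index='netgear').scan()]
A_scan = pd.DataFrame.from_dict(res_scan)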
Example #9
import elasticsearch_dsl as dsl
from elasticsearch.helpers import bulk
from datetime import date, timedelta

batchFileName = 'UpdateWeather.json'
es_host = 'elastic00:9200'
srchDate = date(2018, 2, 5)
oneDay = timedelta(days=1)
srchDate1 = srchDate + oneDay
serial = 0
numDay = srchDate.year * 10000 + srchDate.month * 100 + srchDate.day
numDay = numDay * 1000000
esconn = dsl.connections.create_connection(hosts=es_host, timeout=5)

batchFile = open(batchFileName, 'w')

s = dsl.Search(index='weather-*').query("range", time={"gte": srchDate, "lt": srchDate1})\
       .sort('time')[0:25000]

batchList = []

for h in s.execute().hits:
    #    print(h.meta.id)
    serial += 1
    newTsa = numDay + serial
    item = {
        '_index': h.meta.index,
        '_op_type': 'update',
        '_type': 'doc',
        '_id': h.meta.id,
        'doc': {
            'tsa': newTsa
        }
    }
    batchList.append(item)
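The original snippet ends without flushing the collected actions; a sketch of the likely continuation, assuming the actions are also written to batchFile for inspection before being sent with the bulk helper imported at the top (the json import below is added only for the sketch):

import json

# Persist the update actions for later inspection, then flush them to ES.
for action in batchList:
    batchFile.write(json.dumps(action) + '\n')
batchFile.close()

bulk(esconn, batchList)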
Example #10
    def search_with_query(self, query: EsQuery):
        logger.info("search_with_query called with query={}".format(query))
        if query.split_results:
            ms = es_dsl.MultiSearch(using=self.es)

            for resource in query.resources:
                s = es_dsl.Search(index=resource)

                if query.query is not None:
                    s = s.query(query.query)
                s = s[query.from_ : query.from_ + query.size]
                if query.sort:
                    s = s.sort(*self.translate_sort_fields([resource], query.sort))
                elif resource in query.sort_dict:
                    s = s.sort(
                        *self.translate_sort_fields(
                            [resource], query.sort_dict[resource]
                        )
                    )
                ms = ms.add(s)

            responses = ms.execute()
            result = {"total": 0, "hits": {}}
            for i, response in enumerate(responses):
                result["hits"][query.resources[i]] = self._format_result(
                    query.resources, response
                ).get("hits", [])
                result["total"] += response.hits.total
                if query.lexicon_stats:
                    if "distribution" not in result:
                        result["distribution"] = {}
                    result["distribution"][query.resources[i]] = response.hits.total
            return result
        else:
            s = es_dsl.Search(using=self.es, index=query.resource_str)
            if query.query is not None:
                s = s.query(query.query)

            # s = s[query.from_ : query.from_ + query.size]

            if query.lexicon_stats:
                s.aggs.bucket(
                    "distribution", "terms", field="_index", size=len(query.resources)
                )
            if query.sort:
                s = s.sort(*self.translate_sort_fields(query.resources, query.sort))
            elif query.sort_dict:
                sort_fields = []
                for resource, sort in query.sort_dict.items():
                    sort_fields.extend(self.translate_sort_fields([resource], sort))
                s = s.sort(*sort_fields)
            logger.debug("s = {}".format(s.to_dict()))
            response = self.execute_query(s, from_=query.from_, size=query.size)

            # TODO format response in a better way, because the whole response takes up too much space in the logs
            # logger.debug('response = {}'.format(response.to_dict()))

            # print(f"{response=}")
            result = self._format_result_dict(query.resources, response)
            if query.lexicon_stats:
                if "aggregations" not in response:
                    response = self.execute_query(s, from_=0, size=0)
                result["distribution"] = {}
                for bucket in response["aggregations"]["distribution"]["buckets"]:
                    key = bucket["key"]
                    value = bucket["doc_count"]
                    result["distribution"][key.rsplit("_", 1)[0]] = value

            return result
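Stripped of the surrounding bookkeeping, the MultiSearch pattern the split-results branch relies on looks roughly like this (the client, index names, and query are illustrative):

import elasticsearch_dsl as es_dsl

ms = es_dsl.MultiSearch(using=es_client)          # es_client: an Elasticsearch instance
for index in ("resource_a", "resource_b"):        # illustrative resource names
    ms = ms.add(es_dsl.Search(index=index).query("match", title="example")[0:10])

# One round-trip to _msearch; responses come back in the order the searches were added.
for index, response in zip(("resource_a", "resource_b"), ms.execute()):
    print(index, response.hits.total)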
Example #11
def elastic_search(request, resourcetype='base'):
    parameters = request.GET
    es = Elasticsearch(settings.ES_URL)

    # exclude the profile and group indexes.
    # They aren't being used, and cause issues with faceting
    indices = es.indices.get_alias("*").keys()
    exclude_indexes = ['profile-index', 'group-index']
    [indices.remove(i) for i in exclude_indexes if i in indices]

    search = elasticsearch_dsl.Search(using=es, index=indices)
    search = get_base_query(search)
    search = apply_base_filter(request, search)

    # Add facets to search
    for fn in get_facet_fields():
        search.aggs.bucket(
            fn,
            'terms',
            field=fn,
            order={"_count": "desc"},
            size=parameters.get("nfacets", 15)
        )

    # run search only filtered by what a particular user is able to see
    # this makes sure to get every item that is possible in the facets
    # in order for a UI to build the choices
    overall_results = search[0:0].execute()
    facet_results = get_facet_results(overall_results.aggregations, parameters)

    search = filter_by_resource_type(search, resourcetype)
    search = get_main_query(search, parameters.get('q', None))

    # Add the facet queries to the main search
    for fq in get_facet_filter(parameters):
        search = search.query(fq)

    # Add in has_time filter if set
    if parameters.get("has_time", False):
        search = search.query(Q({'match': {'has_time': True}}))

    search = add_bbox_search(search, parameters.get("extent", None))
    search = add_temporal_search(search, parameters)
    search = apply_sort(search, parameters.get("order_by", "relevance"))

    limit = int(parameters.get('limit', settings.API_LIMIT_PER_PAGE))
    offset = int(parameters.get('offset', '0'))

    # Run the search using the offset and limit
    search = search[offset:offset + limit]
    results = search.execute()

    logger.debug('search: {}, results: {}'.format(search, results))

    filtered_facet_results = filter_results_by_facets(
        results.aggregations,
        facet_results
    )
    # Get results
    objects = get_unified_search_result_objects(results.hits.hits)

    object_list = {
        "meta": {
            "limit": limit,
            "next": None,
            "offset": offset,
            "previous": None,
            "total_count": results.hits.total,
            "facets": filtered_facet_results,
        },
        "objects": objects,
    }

    return JsonResponse(object_list)
Example #12
"""Simple script to dump some database raw data into a file for use in tests."""

import json
from typing import Any, Dict, List

from elasticsearch import Elasticsearch
import elasticsearch_dsl as es

from ingress.utils import setup_mappings

setup_mappings('tweets-brexit-remain-leave', 'localhost:9200')

client = Elasticsearch()

search = es.Search(using=client)

geotagged_records = search.query('match', geotagged=True)[0:10].execute()
untagged_records = search.query('match', geotagged=False)[0:10].execute()

combined_records: List[Dict[str, Any]] = []
combined_records.extend(geotagged_records.hits)
combined_records.extend(untagged_records.hits)

geotagged_data = []

for hit in combined_records:
    record = {}
    for attr in dir(hit):
        attr_data = getattr(hit, attr)
        if isinstance(attr_data, es.AttrDict):
            record[attr] = attr_data.to_dict()
    geotagged_data.append(record)
Example #13
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
##
## Authors:
##   Jesus M. Gonzalez-Barahona <*****@*****.**>
##
## Modified by:
##   David Arroyo Menéndez <*****@*****.**>

import elasticsearch
import elasticsearch_dsl
from pprint import pprint
# ElasticSearch instance (url)
es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Build a DSL Search object on the 'dam-index' index, 'item' document type
request = elasticsearch_dsl.Search(using=es,
                                   index='dam-index',
                                   doc_type='item')

# Run the Search, using the scan interface to get all results
response = request.scan()
for r in response:
    pprint(r)
Example #14
def search(es_client, request):
    return elasticsearch_dsl.Search(using=es_client.conn,
                                    index=es_client.index)
Example #15
def cdr_images_after(es_instance,
                     index,
                     image_types,
                     crawled_after=None,
                     inserted_after=None,
                     agg_img_types=False,
                     domain='weapons'):
    """
    Build and return an ES query object whose results iterate over CDR image entries.

    Results yielded in ascending CDR insertion order (i.e. FIFO). This should
    cause slicing to be stable.

    :param es_instance: elasticsearch.Elasticsearch instance.
    :type es_instance:

    :param index: ElasticSearch index to draw from.
    :type index: str

    :param image_types: List of image content type suffixes
        (e.g. ['png', 'jpeg'])
    :type image_types:

    :param crawled_after: Optional timestamp to constrain query elements to
        only those collected/crawled after this time.
    :type crawled_after: datetime.datetime

    :param inserted_after: Optional timestamp to constrain query elements to
        only those inserted into the ES instance/index after this time.
    :type inserted_after: datetime.datetime

    :param agg_img_types: If we should add an aggregation on image types to the
        query (prevents scanning).

    :param domain: The _type to filter by. "weapons" by default.

    :return: The constructed Search object (not yet executed).
    :rtype: elasticsearch_dsl.Search

    """
    log = logging.getLogger(__name__)
    log.info("Forming ES CDR image query for types: %s", image_types)

    base_search = elasticsearch_dsl.Search()\
        .using(es_instance)\
        .index(index)\
        .fields(['_id', '_timestamp', '_type',
                 'content_type', 'obj_original_url', 'obj_stored_url',
                 'timestamp', 'version',
                 ])

    if domain:
        base_search = base_search.doc_type(domain)

    # I think `_type` filter is redundant with `doc_type` specification above
    if elasticsearch_dsl.VERSION[0] == 2:
        # ES 2.x version
        f = Q('term', version='2.0') \
            & Q('term', content_type='image')
        if image_types:
            f &= Q('terms', content_type=image_types)
        if domain:
            log.debug("Constraining _type: %s", domain)
            f &= Q('term', _type=domain)
        if crawled_after:
            log.debug("Constraining to entries crawled after: %s",
                      crawled_after)
            f &= Q('range', timestamp={'gt': crawled_after})
        if inserted_after:
            log.debug("Constraining to entries inserted after: %s",
                      inserted_after)
            f &= Q('range', _timestamp={'gt': inserted_after})
    else:
        # ES 1.x version
        from elasticsearch_dsl.filter import F
        f = F('term', version='2.0') \
            & F('term', content_type='image')
        if image_types:
            f &= F('terms', content_type=image_types)
        if domain:
            log.debug("Constraining _type: %s", domain)
            f &= F('term', _type=domain)
        if crawled_after:
            log.debug("Constraining to entries crawled after: %s",
                      crawled_after)
            f &= F('range', timestamp={'gt': crawled_after})
        if inserted_after:
            log.debug("Constraining to entries inserted after: %s",
                      inserted_after)
            f &= F('range', _timestamp={'gt': inserted_after})

    q = base_search\
        .filter(f)\
        .sort({'_timestamp': {"order": "asc"}})

    if agg_img_types:
        log.debug("Aggregating image content type information")
        q.aggs.bucket('per_type', 'terms', field='content_type')

    return q
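A sketch of how the returned query object might be consumed; the host, index name, and image types are illustrative:

import elasticsearch

es = elasticsearch.Elasticsearch(['localhost:9200'])
q = cdr_images_after(es, index='cdr-example-index', image_types=['png', 'jpeg'])
# Slicing limits the request size; iterating the Search executes it,
# and the ascending _timestamp sort keeps slices stable.
for hit in q[:500]:
    print(hit.meta.id)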
Example #16
                        '--timeout',
                        type=int,
                        help='data validity timeout (in minutes)',
                        default=5)
    parser.add_argument('-E',
                        '--elastichost',
                        type=str,
                        help='connection URL of ElasticSearch server',
                        default="localhost:9200")
    args = parser.parse_args()

    escnx = elasticsearch_dsl.connections.create_connection(
        hosts=[args.elastichost], timeout=20)
    # construct an Elasticsearch DSL Search() object, using Q() shortcuts to build the query
    request = elasticsearch_dsl.Search(using=escnx,
                                       index="metricbeat-*",
                                       doc_type='_doc')
    request = request.query(
        'bool',
        must=[
            'match_all',
            # as Q shortcut doesn't support nested keywords (eg. 'agent.type' for instance)
            # we must provide keywords as kwargs dict type
            Q('match', **{'agent.type': 'metricbeat'}),
            Q('match', **{'host.name': args.hostname}),
            Q('exists',
              **{'field': 'windows.perfmon.system.processor_queue_length'})
        ])
    # we'll output 'windows.perfmon.system.processor_queue_length' and '@timestamp' fields
    request = request.source(
        ['@timestamp', 'windows.perfmon.system.processor_queue_length'])
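The comment about the Q() shortcut is worth isolating: dotted field names cannot be written as Python keyword arguments, so the dict-unpacking form is the usual workaround. A minimal illustration using the same fields as the query above:

from elasticsearch_dsl import Q

# Q('match', host.name='example-host')  -> SyntaxError: keywords cannot contain dots
q = Q('match', **{'host.name': 'example-host'})
q &= Q('exists', **{'field': 'windows.perfmon.system.processor_queue_length'})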
Example #17
def evaluate_score(student, client, num_resp: int = 25):
    """Takes a student, represented as a dictionary and an elasticsearch-py client and returns an elastic response

    See above student class for schema
    """

    # Adjust weights here:
    base_score = 1.0
    company_score = 1.0
    rural_score = 2.0
    tags_score = 3.0
    underrep_score = 1.0
    # Timezone weights are found in the timezone script query

    s = elasticsearch_dsl.Search(using=client,
                                 index="mentors_index").extra(explain=True)

    # Start by filtering the search by track
    s = s.filter("term", track=student["track"])

    # And also by requireExtended
    if student["requireExtended"]:
        s = s.filter("term", okExtended="true")

    if not student["underrepresented"]:
        s = s.exclude("term", preferStudentUnderRep=2)

    # Adds one to all remaining entries in order to be sure that, in the worst case,
    # there are enough responses, even if they aren't a good fit
    base_value = Q("constant_score", filter=MatchAll(), boost=base_score)

    # Uses a fuzzy query to determine if a student is interested in the mentor's company,
    # then if so adds `weight` to the score
    company_q = None
    for company in student["interestCompanies"]:
        if company_q is None:
            company_q = Q(
                "function_score",
                query=Q("fuzzy", company=company),
                weight=company_score,
                boost_mode="replace",
            )
        else:
            company_q = company_q | Q(
                "function_score",
                query=Q("fuzzy", company=company),
                weight=company_score,
                boost_mode="replace",
            )

    if student["rural"]:
        # If background_rural matches on mentor and student, then add one to the score
        background_rural = Q("constant_score",
                             filter=Q("term",
                                      backgroundRural=student["rural"]),
                             boost=rural_score)
    else:
        background_rural = Q("constant_score", filter=MatchNone())

    # Adds `weight` * the number of matching tags to score
    tags_matching = None
    num_interests = len(student["interestTags"])
    for interest in student["interestTags"]:
        if tags_matching is None:
            tags_matching = Q(
                "function_score",
                query=Q("term", proj_tags=interest),
                weight=tags_score / num_interests,
                boost_mode="replace",
            )
        else:
            tags_matching = tags_matching | Q(
                "function_score",
                query=Q("term", proj_tags=interest),
                weight=tags_score / num_interests,
                boost_mode="replace",
            )

    combined_query = (
        base_value
        | tags_matching
        | company_q
        | background_rural
        # | prefer_student_underrep
    )

    # Decay the combined score based on the number of students who have already selected that mentor
    combined_query = Q("function_score",
                       query=combined_query,
                       functions=SF("gauss",
                                    numStudentsSelected={
                                        "origin": 0,
                                        "scale": 3,
                                        "offset": 3,
                                        "decay": 0.50
                                    }))

    # Timezone - this one's a bit more complex. See comments in script for more details.
    # Multiplies its value by the previous scores, allowing it to reduce, zero out, or increase the score.
    # See below string for python implementation
    """
    if mentor['okTimezoneDifference']:
        if 16 < student['timezone'] < 22:
            return True
        return False
    else:
        if abs(student['timezone'] - mentor['timezone']) < 3:
            return True
        return False
    """

    s = s.query(combined_query)[0:num_resp]
    resp = s.execute()
    return resp
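A sketch of a call site, with a hypothetical student dictionary built from the fields the function actually reads (track, requireExtended, underrepresented, interestCompanies, rural, interestTags); the values and the client host are illustrative:

import elasticsearch

student = {
    "track": "web",                      # illustrative values only
    "requireExtended": False,
    "underrepresented": True,
    "interestCompanies": ["ExampleCorp"],
    "rural": False,
    "interestTags": ["python", "elasticsearch"],
}
client = elasticsearch.Elasticsearch(["localhost:9200"])
response = evaluate_score(student, client, num_resp=10)
for hit in response:
    print(hit.meta.score, hit.meta.id)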
Example #18
    def update_histogram_bins(self, log, log_type):
        logger.debug("updating " + log_type + " histogram bins")

        try:
            search = es_dsl.Search(
                using=self.client, index=self.prefix + '_monitor_data') \
                .filter('match', _type='fields') \
                .filter('match', _id='intervals') \
                .extra(size=1)

            intervals = search.execute()[0].to_dict()

            fields = [
                '.'.join(path.split('.')[:-1])
                for path in nested_paths(intervals[log_type])
                if path.endswith('interval')
            ]

            for field in fields:
                cur_val = nested_get(log, field)
                if cur_val is None:
                    break

                field_path = log_type + '.' + field
                intervals_field = nested_get(intervals, field_path)

                changed = False
                if intervals_field['interval'] is None:
                    intervals_field['min'] = cur_val
                    intervals_field['max'] = cur_val
                    changed = True
                else:
                    if cur_val < intervals_field['min']:
                        intervals_field['min'] = cur_val
                        changed = True
                    elif cur_val > intervals_field['max']:
                        intervals_field['max'] = cur_val
                        changed = True

                if changed:
                    if intervals_field['min'] == intervals_field['max']:
                        intervals_field['interval'] = 1
                    else:
                        intervals_field['interval'] = \
                            math.ceil((intervals_field['max'] -
                                       intervals_field['min']) / 20.0)

                    for vis_id in intervals_field['vis_ids']:
                        search_vis = es_dsl.Search(
                            using=self.client, index='.kibana') \
                            .filter('match', _id=vis_id) \
                            .filter('match', _type='visualization') \
                            .extra(size=1)

                        vis = search_vis.execute()[0]
                        vis_state = json.loads(vis.visState)

                        for agg in vis_state['aggs']:
                            if agg['type'] == 'histogram' and \
                                    agg['params']['field'] == field_path:
                                agg['params']['interval'] = \
                                    intervals_field['interval']

                        vis.visState = json.dumps(vis_state, sort_keys=True)

                        vis_source = json.loads(
                            vis.kibanaSavedObjectMeta.searchSourceJSON)

                        filter_words = vis_source['query']['query_string'][
                            'query'].split(' ')

                        for i, word in enumerate(filter_words):
                            if word.startswith(field_path + ':>='):
                                filter_words[i] = field_path + ':>=' + \
                                    str(intervals_field['min'])
                            elif word.startswith(field_path + ':<='):
                                filter_words[i] = field_path + ':<=' + \
                                    str(intervals_field['max'])

                        vis_source['query']['query_string']['query'] =\
                            ' '.join(filter_words)

                        vis.kibanaSavedObjectMeta.searchSourceJSON = \
                            json.dumps(vis_source, sort_keys=True)

                        self.client.index(index='.kibana',
                                          doc_type='visualization',
                                          id=vis_id,
                                          body=vis.to_dict())

                nested_set(intervals, log_type + '.' + field, intervals_field)

            self.client.index(index=self.prefix + '_monitor_data',
                              doc_type='fields',
                              id='intervals',
                              body=intervals)
        except Exception as e:
            logger.error(e)
Example #19
import elasticsearch
import elasticsearch_dsl

client = elasticsearch.Elasticsearch(['localhost'])

s = elasticsearch_dsl.Search(using=client, index="salt-status_diskusage-v1") \
    .query('match', minion='minion1') \
    .source(['@timestamp',
             'data./etc/hosts.available',
             'data./etc/hosts.total']) \
    .sort("-@timestamp") \
    .extra(size=1)

response = s.execute().to_dict()
source = response['hits']['hits'][0]['_source']
data = source['data']
print('Raw /etc/hosts Disk data from returner:', data)

#############################
# Get Disk Usage
s = elasticsearch_dsl.Search(using=client, index="salt-disk_percent-v1") \
    .query('match', minion='minion2') \
    .source(['@timestamp', 'data./']) \
    .sort("-@timestamp") \
    .extra(size=1)

response = s.execute().to_dict()
source = response['hits']['hits'][0]['_source']
data = source['data']
print('/ Disk data from returner: ', data)
Example #20
def get_all_paths() -> Set[str]:
    search = (elasticsearch_dsl.Search(
        using=get_session(),
        index=config.config['elasticsearch']['index'],
        doc_type=ES_DOC_TYPE).source(['path']))
    return set(h.path for h in search.scan())
Example #21
def count_by_city_order_by_country(
        vk_elastic_db: es_client.VkDataDatabaseClient,
        size=10,
        is_need_other=True,
        is_need_print=False,
        is_need_plot=True,
        is_need_active=False,
        days_delta=20):
    country_aggs_name = "country_count"
    city_aggs_name = "city_count"
    title = "count by city"
    if is_need_active:
        title += " active"
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    s = s.filter(
        "bool",
        must=[elasticsearch_dsl.Q("exists", field="country.title.keyword")])
    s = s.filter(
        "bool",
        must=[elasticsearch_dsl.Q("exists", field="city.title.keyword")])
    s = s.filter(
        "bool",
        must_not=[elasticsearch_dsl.Q("match", country__title__keywordd="")])
    s = s.filter(
        "bool",
        must_not=[elasticsearch_dsl.Q("match", city__title__keyword="")])
    a = elasticsearch_dsl.A('terms',
                            field="country.title.keyword",
                            size=size,
                            collect_mode="breadth_first")
    a1 = elasticsearch_dsl.A('terms', field="city.title.keyword", size=size)
    s.aggs.bucket(country_aggs_name, a).bucket(city_aggs_name, a1)
    response = s.execute()

    data_dict = {}
    for country_hit in response.aggregations[country_aggs_name].buckets:

        x_axis = [hit.key for hit in country_hit[city_aggs_name].buckets]
        y_axis = [hit.doc_count for hit in country_hit[city_aggs_name].buckets]
        if is_need_other:
            x_axis.append("other")
            y_axis.append(country_hit[city_aggs_name].sum_other_doc_count)
        data_dict[country_hit.key] = {}
        data_dict[country_hit.key]["x_axis"] = x_axis
        data_dict[country_hit.key]["y_axis"] = y_axis

    for country in data_dict:
        x_axis = data_dict[country]["x_axis"]
        y_axis = data_dict[country]["y_axis"]
        cur_title = f"{title}\n{country}"
        figname = f"{title.replace(' ', '_')}_{country}"
        if is_need_print:
            print(cur_title)
            for i in range(len(x_axis)):
                print(f"{i + 1}\t{x_axis[i]} {y_axis[i]}")

        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(x_axis, y_axis)
            # plt.show()
            fig.savefig(f"{save_path}/{figname}.png",
                        dpi=300,
                        format='png',
                        bbox_inches='tight')
            plt.close(fig)
Example #22
def es_dsl_queryset():
    return elasticsearch_dsl.Search()
Example #23
 def generator(self):
     """
     main generator function for IDFile and IDFileConsume
     searching with an set of IDs can take quite long time
     better would be to reduce the set of documents to a pure idlist, this is quite fast over mget
     often, its needed to do it with a search, therefore both ways work
     """
     missing = []  # an iterable containing missing ids
     while len(self.ids) > 0:
         if self.body:
             ms = elasticsearch_dsl.MultiSearch(
                 using=self.es, index=self.index,
                 doc_type=self.type_)  # setting up MultiSearch
              # an ID list per iteration, so we can check at the end whether all IDs of this chunk were found
              this_iter_ids = self.ids[:self.chunksize]
             for _id in this_iter_ids:  # add a search per ID
                 ms = ms.add(elasticsearch_dsl.Search().source(
                     excludes=self.source_excludes,
                     includes=self.source_includes).from_dict(
                         self.body).query("match", _id=_id))
             responses = ms.execute()
             for response in responses:
                 for hit in response:
                     _id = hit.meta.to_dict()["id"]
                     yield self.return_doc(hit)
                     del self.ids[self.ids.index(_id)]
                     del this_iter_ids[this_iter_ids.index(_id)]
             for _id in this_iter_ids:
                 """
                 unfortunately MultiSearch doesn't throw an exception for non-Found-IDs, so we have manually check for missing ids
                 so we again iterate over the helper_list with the IDs per chunk size (simply doing self.dis[:self.chunksize] would give us a new set)
                 and we put all the IDs who are still in there in our missing list and delete them from self.ids and this_iter_ids
                 """
                 missing.append(_id)
                 del self.ids[self.ids.index(_id)]
                 del this_iter_ids[this_iter_ids.index(_id)]
         else:
             try:
                 s = elasticsearch_dsl.Document.mget(
                     docs=self.ids[:self.chunksize],
                     using=self.es,
                     index=self.index,
                     _source_excludes=self.source_excludes,
                     _source_includes=self.source_includes,
                     _source=self.source,
                     missing='raise')
             except elasticsearch.exceptions.NotFoundError as e:
                 for doc in e.info[
                         'docs']:  # we got some missing ids and harvest the missing ids from the Elasticsearch NotFoundError Exception
                     missing.append(doc['_id'])
                     del self.ids[self.ids.index(doc['_id'])]
             else:  # only gets called if we don't run into an exception
                 for hit in s:
                     _id = hit.meta.to_dict()["id"]
                     yield self.return_doc(hit)
                     del self.ids[self.ids.index(_id)]
         if not self.ids:
             """
             if we delete the last item from ids,
             ids turns to None and then the while(len(list()))
             would throw an exception, since None isn't an iterable
             """
             self.ids = []
     for item in self.write_file(missing):
         yield item
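When no extra query body is involved, the mget branch above reduces to the pattern sketched below; the index name and ids are illustrative, and missing='skip' silently drops absent ids instead of raising:

import elasticsearch
import elasticsearch_dsl

es = elasticsearch.Elasticsearch(['localhost:9200'])
docs = elasticsearch_dsl.Document.mget(['id-1', 'id-2', 'id-3'],
                                       using=es,
                                       index='my-index',
                                       missing='skip')
for doc in docs:
    print(doc.meta.id, doc.to_dict())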
Example #24
    def download_templates(self):
        logger.info("getting Kibana objects with prefix " + self.prefix)

        logger.info("getting index patterns")
        index_dir = os.path.join(self.template_dir, 'index')
        try:
            try:
                os.mkdir(self.template_dir)
                os.mkdir(os.path.join(self.template_dir, 'index'))
            except OSError:
                pass

            search_index = es_dsl.Search(using=self.client, index='.kibana') \
                .filter('prefix', _id=self.prefix) \
                .filter('match', _type='index-pattern') \
                .extra(size=10000)
            response_index = search_index.execute()

            for index in response_index:
                index.meta.id = index.meta.id \
                    .replace(self.prefix, '[template]')
                index.title = index.title.replace(self.prefix, '[template]')

                with open(
                        os.path.join(index_dir, index.meta.id) + '.json',
                        'w') as f:
                    f.write(
                        json.dumps(index.to_dict(), indent=4, sort_keys=True))
                    f.write('\n')
        except Exception as e:
            logger.error(e)

        dash_dir = os.path.join(self.template_dir, 'dash')
        intervals = {}
        for name in self.dashboards:
            logger.info("getting " + name + " dashboard")
            vis_ids = []
            try:
                try:
                    os.mkdir(os.path.join(dash_dir))
                except OSError:
                    pass

                search_dash = es_dsl.Search(
                    using=self.client, index='.kibana') \
                    .filter('match', _id=self.prefix + '-' + name) \
                    .filter('match', _type='dashboard') \
                    .extra(size=1)
                dash = search_dash.execute()[0]

                dash.meta.id = dash.meta.id \
                    .replace(self.prefix, '[template]')
                dash.title = dash.title.replace(self.prefix, '[template]')

                dash_panels = json.loads(dash.panelsJSON)
                for panel in dash_panels:
                    vis_ids.append(panel['id'])
                    panel['id'] = panel['id'].replace(self.prefix,
                                                      '[template]')
                dash.panelsJSON = json.dumps(dash_panels, sort_keys=True)

                with open(os.path.join(dash_dir, dash.meta.id) + '.json',
                          'w') as f:
                    f.write(
                        json.dumps(dash.to_dict(), indent=4, sort_keys=True))
                    f.write('\n')
            except Exception as e:
                logger.error(e)

            logger.info("getting " + name + " visualizations")
            vis_dir = os.path.join(self.template_dir, 'vis')
            try:
                os.mkdir(vis_dir)
            except OSError:
                pass
            for vis_id in vis_ids:
                try:
                    search_vis = es_dsl.Search(
                        using=self.client, index='.kibana') \
                        .filter('match', _id=vis_id) \
                        .filter('match', _type='visualization') \
                        .extra(size=1)

                    vis = search_vis.execute()[0]
                    vis.meta.id = vis.meta.id \
                        .replace(self.prefix, '[template]')
                    vis.title = vis.title \
                        .replace(self.prefix, '[template]')

                    vis_state = json.loads(vis.visState)
                    vis_state['title'] = vis['title']

                    if vis_state['type'] == 'markdown':
                        vis_state['params']['markdown'] = "text goes here"
                    else:
                        vis_source = json.loads(
                            vis.kibanaSavedObjectMeta.searchSourceJSON)
                        vis_source['index'] = vis_source['index'].replace(
                            self.prefix, '[template]')

                        if vis_state['type'] == 'histogram':
                            hist_aggs = [
                                agg for agg in vis_state['aggs']
                                if agg['type'] == 'histogram'
                            ]
                            for agg in hist_aggs:
                                agg['params']['interval'] = 1e10
                                field_path = agg['params']['field']

                                filter_words = vis_source['query'][
                                    'query_string']['query'].split(' ')
                                filter_found = False

                                for i, word in enumerate(filter_words):
                                    if word.startswith(field_path + ':>='):
                                        filter_words[i] = \
                                            field_path + ':>=0'
                                        filter_found = True
                                    elif word.startswith(field_path + ':<='):
                                        filter_words[i] = \
                                            field_path + ':<=0'
                                        filter_found = True

                                if not filter_found:
                                    if len(filter_words) > 0:
                                        filter_words += \
                                            ['AND', field_path + ':>=0',
                                             'AND', field_path + ':<=0']
                                    else:
                                        filter_words = \
                                            [field_path + ':>=0',
                                             'AND', field_path + ':<=0']

                                vis_source['query']['query_string']['query'] =\
                                    ' '.join(filter_words)

                                vis_ids = nested_get(intervals,
                                                     field_path + '.vis_ids')

                                if vis_ids and vis.meta.id not in vis_ids:
                                    vis_ids.append(vis.meta.id)
                                else:
                                    vis_ids = [vis.meta.id]

                                hist_data = {
                                    'interval': None,
                                    'min': None,
                                    'max': None,
                                    'vis_ids': vis_ids
                                }

                                nested_set(intervals, agg['params']['field'],
                                           hist_data)
                        elif vis_state['type'] == 'table':
                            if vis.meta.id == '[template]-Category-summary':
                                vis_state['params']['perPage'] = 0
                                aggs = [
                                    agg for agg in vis_state['aggs'] if
                                    'params' in agg and 'size' in agg['params']
                                ]
                                for agg in aggs:
                                    agg['params']['size'] = 0
                            elif vis.meta.id == '[template]-Workflow-summary':
                                vis_state['params']['perPage'] = 0
                                aggs = [
                                    agg for agg in vis_state['aggs'] if
                                    'params' in agg and 'size' in agg['params']
                                ]
                                for agg in aggs:
                                    agg['params']['size'] = 0

                        vis.kibanaSavedObjectMeta.searchSourceJSON = \
                            json.dumps(vis_source, sort_keys=True)

                    vis.visState = json.dumps(vis_state, sort_keys=True)

                    with open(
                            os.path.join(vis_dir, vis.meta.id) + '.json',
                            'w') as f:
                        f.write(
                            json.dumps(vis.to_dict(), indent=4,
                                       sort_keys=True))
                        f.write('\n')
                except Exception as e:
                    logger.error(e)

        try:
            with open(
                    os.path.join(self.template_dir, 'intervals') + '.json',
                    'w') as f:
                f.write(json.dumps(intervals, indent=4, sort_keys=True))
        except Exception as e:
            logger.error(e)
Example #25
def es_dsl_search(pyramid_request):
    return elasticsearch_dsl.Search(using=pyramid_request.es.conn,
                                    index=pyramid_request.es.index)
Example #26
def elasticsearch_status(request):
    client = get_es_client()

    # get index snapshots
    response = requests.get("http://{0}:{1}/_snapshot/{2}/_all".format(
        settings.ELASTICSEARCH_SERVICE_HOSTNAME, settings.ELASTICSEARCH_PORT,
        "callsets"))
    snapshots = json.loads(response.content)

    index_snapshot_states = defaultdict(list)
    for snapshot in snapshots.get("snapshots", []):
        for index_name in snapshot.get("indices", []):
            index_snapshot_states[index_name].append(snapshot["state"])

    # get indices
    indices = []
    for index in client.cat.indices(format="json", h="*"):
        index_name = index['index']

        # skip special indices
        if index_name in ['.kibana', 'index_operations_log']:
            continue

        index_json = {k.replace('.', '_'): v for k, v in index.items()}

        index_name = re.sub("_[0-9]{1,2}$", "", index_name)
        sample = Sample.objects.filter(
            elasticsearch_index=index_name).select_related(
                'individual__family__project').first()
        if sample:
            project = sample.individual.family.project
            index_json['project_guid'] = project.guid
            index_json['project_id'] = project.deprecated_project_id
            index_json['dataset_type'] = sample.sample_type
            index_json['genome_version'] = project.genome_version
            index_json['dataset_file_path'] = sample.dataset_file_path

        if index_name in index_snapshot_states:
            index_json['snapshots'] = ", ".join(
                set(index_snapshot_states[index_name]))
        indices.append(index_json)

    # get operations log
    s = elasticsearch_dsl.Search(using=client, index=OPERATIONS_LOG)
    s = s.params(size=5000)
    operations = [doc.to_dict() for doc in s.execute().hits]

    # making a new list since dots in the ES client keys would confuse the template
    disk_status = []
    for disk in client.cat.allocation(format="json"):
        disk_json = {k.replace('.', '_'): v for k, v in disk.items()}
        disk_status.append({
            'node_name': disk_json['node'],
            'disk_available': disk_json['disk_avail'],
            'disk_used': disk_json['disk_used'],
            'disk_percent_used': disk_json['disk_percent'],
        })

    return render(
        request, "staff/elasticsearch_status.html", {
            'indices': indices,
            'operations': operations,
            'disk_stats': disk_status,
            'elasticsearch_host': settings.ELASTICSEARCH_SERVER,
        })
Example #27
    def get_elasticsearch_variants(
        self,
        project_id,
        family_id=None,
        variant_filter=None,
        genotype_filter=None,
        variant_id_filter=None,
        quality_filter=None,
        indivs_to_consider=None,
        include_all_consequences=False,
        user=None,
        max_results_limit=settings.VARIANT_QUERY_RESULTS_LIMIT,
    ):
        from xbrowse_server.base.models import Individual
        from xbrowse_server.mall import get_reference

        cache_key = "Variants___%s___%s___%s" % (project_id, family_id,
                                                 json.dumps([
                                                     variant_filter.toJSON() if
                                                     variant_filter else None,
                                                     genotype_filter,
                                                     quality_filter,
                                                     variant_id_filter,
                                                     indivs_to_consider,
                                                     include_all_consequences,
                                                 ]))

        cached_results = self._redis_client and self._redis_client.get(
            cache_key)
        if cached_results is not None:
            variant_results = json.loads(cached_results)
            return [
                Variant.fromJSON(variant_json)
                for variant_json in variant_results
            ]

        if indivs_to_consider is None:
            if genotype_filter:
                indivs_to_consider = genotype_filter.keys()
            else:
                indivs_to_consider = []

        if family_id is not None:
            family_individual_ids = [
                i.indiv_id for i in Individual.objects.filter(
                    family__family_id=family_id).only("indiv_id")
            ]
        else:
            family_individual_ids = [
                i.indiv_id for i in Individual.objects.filter(
                    family__project__project_id=project_id).only("indiv_id")
            ]

        from xbrowse_server.base.models import Project, Family
        from pyliftover.liftover import LiftOver

        query_json = self._make_db_query(genotype_filter, variant_filter)

        try:
            if self.liftover_grch38_to_grch37 is None:
                self.liftover_grch38_to_grch37 = LiftOver('hg38', 'hg19')

            if self.liftover_grch37_to_grch38 is None:
                self.liftover_grch37_to_grch38 = None  # LiftOver('hg19', 'hg38')
        except Exception as e:
            logger.info(
                "WARNING: Unable to set up liftover. Is there a working internet connection? "
                + str(e))

        if family_id is None:
            project = Project.objects.get(project_id=project_id)
            elasticsearch_index = project.get_elasticsearch_index()
            logger.info("Searching in project elasticsearch index: " +
                        str(elasticsearch_index))
        else:
            family = Family.objects.get(project__project_id=project_id,
                                        family_id=family_id)
            elasticsearch_index = family.get_elasticsearch_index()
            project = family.project
            logger.info("Searching in family elasticsearch index: " +
                        str(elasticsearch_index))

        if family_id is not None and len(family_individual_ids) > 0:
            # figure out which index to use
            # TODO add caching
            matching_indices = []
            mapping = self._es_client.indices.get_mapping(
                str(elasticsearch_index) + "*")

            if family_individual_ids:
                indiv_id = _encode_name(family_individual_ids[0])
                for index_name, index_mapping in mapping.items():
                    if indiv_id + "_num_alt" in index_mapping["mappings"][
                            "variant"]["properties"]:
                        matching_indices.append(index_name)

            if not matching_indices:
                if not family_individual_ids:
                    logger.error("no individuals found for family %s" %
                                 (family_id))
                elif not mapping:
                    logger.error(
                        "no es mapping found for found with prefix %s" %
                        (elasticsearch_index))
                else:
                    logger.error("%s not found in %s:\n%s" %
                                 (indiv_id, elasticsearch_index,
                                  pformat(index_mapping["mappings"]["variant"]
                                          ["properties"])))
            else:
                logger.info("matching indices: " + str(elasticsearch_index))
                elasticsearch_index = ",".join(matching_indices)

        s = elasticsearch_dsl.Search(using=self._es_client,
                                     index=str(elasticsearch_index) +
                                     "*")  #",".join(indices))

        if variant_id_filter is not None:
            variant_id_filter_term = None
            for variant_id in variant_id_filter:
                q_obj = Q('term', **{"variantId": variant_id})
                if variant_id_filter_term is None:
                    variant_id_filter_term = q_obj
                else:
                    variant_id_filter_term |= q_obj
            s = s.filter(variant_id_filter_term)

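        # Require at least one of the selected individuals to carry a non-reference genotype (num_alt >= 1)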
        if indivs_to_consider:
            atleast_one_nonref_genotype_filter = None
            for sample_id in indivs_to_consider:
                encoded_sample_id = _encode_name(sample_id)
                q = Q('range', **{encoded_sample_id + "_num_alt": {'gte': 1}})
                if atleast_one_nonref_genotype_filter is None:
                    atleast_one_nonref_genotype_filter = q
                else:
                    atleast_one_nonref_genotype_filter |= q

            s = s.filter(atleast_one_nonref_genotype_filter)

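        # Per-sample quality filters: allele balance (min_ab), genotype quality (min_gq), and VCF FILTER status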
        if quality_filter is not None and indivs_to_consider:
            #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46
            min_ab = quality_filter.get('min_ab')
            if min_ab is not None:
                min_ab /= 100.0  # convert to fraction
            min_gq = quality_filter.get('min_gq')
            vcf_filter = quality_filter.get('vcf_filter')
            for sample_id in indivs_to_consider:
                encoded_sample_id = _encode_name(sample_id)

                #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46
                if min_ab:
                    s = s.filter(
                        ~Q('term', **{encoded_sample_id + "_num_alt": 1})
                        | Q('range', **
                            {encoded_sample_id + "_ab": {
                                'gte': min_ab
                            }}))
                    #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_ab": {'gte': min_ab}}))
                if min_gq:
                    s = s.filter(
                        'range',
                        **{encoded_sample_id + "_gq": {
                            'gte': min_gq
                        }})
                    #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_gq": {'gte': min_gq}}))
                if vcf_filter is not None:
                    s = s.filter(~Q('exists', field='filters'))
                    #logger.info("### ADDED FILTER: " + str(~Q('exists', field='filters')))

        # parse variant query
        annotation_groups_map = ANNOTATION_GROUPS_MAP_INTERNAL if user and user.is_staff else ANNOTATION_GROUPS_MAP

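        # Translate each key of the Mongo-style query into an equivalent elasticsearch filter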
        for key, value in query_json.items():
            if key == 'db_tags':
                so_annotations = query_json.get('db_tags', {}).get('$in', [])

                # handle clinvar filters
                selected_so_annotations_set = set(so_annotations)

                all_clinvar_filters_set = set(
                    annotation_groups_map.get("clinvar",
                                              {}).get("children", []))
                selected_clinvar_filters_set = all_clinvar_filters_set & selected_so_annotations_set

                all_hgmd_filters_set = set(
                    annotation_groups_map.get("hgmd", {}).get("children", []))
                selected_hgmd_filters_set = all_hgmd_filters_set & selected_so_annotations_set

                vep_consequences = list(selected_so_annotations_set -
                                        selected_clinvar_filters_set -
                                        selected_hgmd_filters_set)
                consequences_filter = Q(
                    "terms", transcriptConsequenceTerms=vep_consequences)

                if selected_clinvar_filters_set:
                    clinvar_clinical_significance_terms = set()
                    for clinvar_filter in selected_clinvar_filters_set:
                        # translate selected filters to the corresponding clinvar clinical consequence terms
                        if clinvar_filter == "pathogenic":
                            clinvar_clinical_significance_terms.update(
                                ["Pathogenic", "Pathogenic/Likely_pathogenic"])
                        elif clinvar_filter == "likely_pathogenic":
                            clinvar_clinical_significance_terms.update([
                                "Likely_pathogenic",
                                "Pathogenic/Likely_pathogenic"
                            ])
                        elif clinvar_filter == "benign":
                            clinvar_clinical_significance_terms.update(
                                ["Benign", "Benign/Likely_benign"])
                        elif clinvar_filter == "likely_benign":
                            clinvar_clinical_significance_terms.update(
                                ["Likely_benign", "Benign/Likely_benign"])
                        elif clinvar_filter == "vus_or_conflicting":
                            clinvar_clinical_significance_terms.update([
                                "Conflicting_interpretations_of_pathogenicity",
                                "Uncertain_significance", "not_provided",
                                "other"
                            ])
                        else:
                            raise ValueError("Unexpected clinvar filter: " +
                                             str(clinvar_filter))

                    consequences_filter = consequences_filter | Q(
                        "terms",
                        clinvar_clinical_significance=list(
                            clinvar_clinical_significance_terms))

                if selected_hgmd_filters_set:
                    hgmd_class = set()
                    for hgmd_filter in selected_hgmd_filters_set:
                        # translate selected filters to the corresponding hgmd clinical consequence terms
                        if hgmd_filter == "disease_causing":
                            hgmd_class.update(["DM"])
                        elif hgmd_filter == "likely_disease_causing":
                            hgmd_class.update(["DM?"])
                        elif hgmd_filter == "hgmd_other":
                            hgmd_class.update(["DP", "DFP", "FP", "FTV"])
                        else:
                            raise ValueError("Unexpected hgmd filter: " +
                                             str(hgmd_filter))

                    consequences_filter = consequences_filter | Q(
                        "terms", hgmd_class=list(hgmd_class))

                if 'intergenic_variant' in vep_consequences:
                    # for many intergenic variants VEP doesn't add any annotations, so if the user selected 'intergenic_variant', also match variants where transcriptConsequenceTerms is empty
                    consequences_filter = consequences_filter | ~Q(
                        'exists', field='transcriptConsequenceTerms')

                s = s.filter(consequences_filter)
                #logger.info("==> transcriptConsequenceTerms: %s" % str(vep_consequences))

            if key.startswith("genotypes"):
                sample_id = ".".join(key.split(".")[1:-1])
                encoded_sample_id = _encode_name(sample_id)
                genotype_filter = value
                #logger.info("==> genotype filter: " + str(genotype_filter))
                if isinstance(genotype_filter, (int, basestring)):
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter}))
                    s = s.filter(
                        'term',
                        **{encoded_sample_id + "_num_alt": genotype_filter})

                elif '$gte' in genotype_filter:
                    genotype_filter = {
                        k.replace("$", ""): v
                        for k, v in genotype_filter.items()
                    }
                    s = s.filter(
                        'range',
                        **{encoded_sample_id + "_num_alt": genotype_filter})
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter}))
                elif "$in" in genotype_filter:
                    num_alt_values = genotype_filter['$in']
                    q = Q(
                        'term',
                        **{encoded_sample_id + "_num_alt": num_alt_values[0]})
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_values[0]}))
                    for num_alt_value in num_alt_values[1:]:
                        q = q | Q(
                            'term', **
                            {encoded_sample_id + "_num_alt": num_alt_value})
                        #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_value}))
                    s = s.filter(q)

            if key == "db_gene_ids":
                db_gene_ids = query_json.get('db_gene_ids', {})

                exclude_genes = db_gene_ids.get('$nin', [])
                gene_ids = exclude_genes or db_gene_ids.get('$in', [])

                if exclude_genes:
                    s = s.exclude("terms", geneIds=gene_ids)
                else:
                    s = s.filter("terms", geneIds=gene_ids)
                #logger.info("==> %s %s" % ("exclude" if exclude_genes else "include", "geneIds: " + str(gene_ids)))

            if key == "$or" and type(value) == list:
                q_terms = None
                for region_filter in value:
                    xpos_filters = region_filter.get("$and", {})

                    # for example: $or : [{'$and': [{'xpos': {'$gte': 12345}}, {'xpos': {'$lte': 54321}}]}]
                    xpos_filters_dict = {}
                    for xpos_filter in xpos_filters:
                        xpos_filter_setting = xpos_filter[
                            "xpos"]  # for example {'$gte': 12345} or {'$lte': 54321}
                        xpos_filters_dict.update(xpos_filter_setting)

                    xpos_filter_setting = {
                        k.replace("$", ""): v
                        for k, v in xpos_filters_dict.items()
                    }
                    q = Q('range', **{"xpos": xpos_filter_setting})
                    if q_terms is None:
                        q_terms = q
                    else:
                        q_terms |= q
                if q_terms is not None:
                    s = s.filter(q_terms)

                #logger.info("==> xpos range: " + str({"xpos": xpos_filter_setting}))

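            # Allele-frequency thresholds: map Mongo-style db_freqs keys to elasticsearch fields;
            # documents missing the field are kept (treated as passing the frequency filter)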
            af_key_map = {
                "db_freqs.AF": "AF",
                "db_freqs.1kg_wgs_phase3": "g1k_POPMAX_AF",
                "db_freqs.exac_v3": "exac_AF_POPMAX",
                "db_freqs.topmed": "topmed_AF",
                "db_freqs.gnomad_exomes": "gnomad_exomes_AF_POPMAX",
                "db_freqs.gnomad_genomes": "gnomad_genomes_AF_POPMAX",
                "db_freqs.gnomad-exomes2": "gnomad_exomes_AF_POPMAX",
                "db_freqs.gnomad-genomes2": "gnomad_genomes_AF_POPMAX",
            }

            if key in af_key_map:
                filter_key = af_key_map[key]
                af_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: af_filter_setting})
                    | ~Q('exists', field=filter_key))
                #logger.info("==> %s: %s" % (filter_key, af_filter_setting))

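            # Allele-count (AC), hemizygote, and homozygote count thresholds follow the same pattern as the AF filters above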
            ac_key_map = {
                "db_acs.AF": "AC",
                "db_acs.1kg_wgs_phase3": "g1k_AC",
                "db_acs.exac_v3": "exac_AC",
                "db_acs.topmed": "topmed_AC",
                "db_acs.gnomad_exomes": "gnomad_exomes_AC",
                "db_acs.gnomad_genomes": "gnomad_genomes_AC",
                "db_acs.gnomad-exomes2": "gnomad_exomes_AC",
                "db_acs.gnomad-genomes2": "gnomad_genomes_AC",
            }

            if key in ac_key_map:
                filter_key = ac_key_map[key]
                ac_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: ac_filter_setting})
                    | ~Q('exists', field=filter_key))

            hemi_key_map = {
                "db_hemi.exac_v3": "exac_AC_Hemi",
                "db_hemi.gnomad_exomes": "gnomad_exomes_Hemi",
                "db_hemi.gnomad_genomes": "gnomad_genomes_Hemi",
                "db_hemi.gnomad-exomes2": "gnomad_exomes_Hemi",
                "db_hemi.gnomad-genomes2": "gnomad_genomes_Hemi",
            }

            if key in hemi_key_map:
                filter_key = hemi_key_map[key]
                hemi_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: hemi_filter_setting})
                    | ~Q('exists', field=filter_key))

            hom_key_map = {
                "db_hom.exac_v3": "exac_AC_Hom",
                "db_hom.gnomad_exomes": "gnomad_exomes_Hom",
                "db_hom.gnomad_genomes": "gnomad_genomes_Hom",
                "db_hom.gnomad-exomes2": "gnomad_exomes_Hom",
                "db_hom.gnomad-genomes2": "gnomad_genomes_Hom",
            }

            if key in hom_key_map:
                filter_key = hom_key_map[key]
                hom_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: hom_filter_setting})
                    | ~Q('exists', field=filter_key))

            #s = s.sort("xpos")

        #logger.info("=====")
        #logger.info("FULL QUERY OBJ: " + pformat(s.__dict__))
        #logger.info("FILTERS: " + pformat(s.to_dict()))

        # https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
        start = time.time()

        s = s.params(size=max_results_limit + 1)
        #if not include_all_consequences:
        #    s = s.source(exclude=["sortedTranscriptConsequences"])
        response = s.execute()
        logger.info("=====")

        logger.info("TOTAL: %s. Query took %s seconds" %
                    (response.hits.total, time.time() - start))

        if response.hits.total > max_results_limit + 1:
            raise Exception(
                "This search matched too many variants. Please set additional filters and try again."
            )

        #print(pformat(response.to_dict()))

        project = Project.objects.get(project_id=project_id)

        #gene_list_map = project.get_gene_list_map()

        reference = get_reference()

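        # Stream all matching documents with scan() and convert each hit into the variant JSON consumed by Variant.fromJSON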
        #for i, hit in enumerate(response.hits):
        variant_results = []
        for i, hit in enumerate(s.scan()):  # preserve_order=True
            #logger.info("HIT %s: %s %s %s" % (i, hit["variantId"], hit["geneIds"], pformat(hit.__dict__)))
            #print("HIT %s: %s" % (i, pformat(hit.to_dict())))
            filters = ",".join(hit["filters"]
                               or []) if "filters" in hit else ""
            genotypes = {}
            all_num_alt = []
            for individual_id in family_individual_ids:
                encoded_individual_id = _encode_name(individual_id)
                num_alt = int(hit["%s_num_alt" % encoded_individual_id]) if (
                    "%s_num_alt" % encoded_individual_id) in hit else -1
                if num_alt is not None:
                    all_num_alt.append(num_alt)

                alleles = []
                if num_alt == 0:
                    alleles = [hit["ref"], hit["ref"]]
                elif num_alt == 1:
                    alleles = [hit["ref"], hit["alt"]]
                elif num_alt == 2:
                    alleles = [hit["alt"], hit["alt"]]
                elif num_alt == -1 or num_alt is None:
                    alleles = []
                else:
                    raise ValueError("Invalid num_alt: " + str(num_alt))

                genotypes[individual_id] = {
                    'ab':
                    hit["%s_ab" % encoded_individual_id] if
                    ("%s_ab" % encoded_individual_id) in hit else None,
                    'alleles':
                    map(str, alleles),
                    'extras': {
                        'ad':
                        hit["%s_ad" % encoded_individual_id] if
                        ("%s_ad" % encoded_individual_id) in hit else None,
                        'dp':
                        hit["%s_dp" % encoded_individual_id] if
                        ("%s_dp" % encoded_individual_id) in hit else None,
                        #'pl': '',
                    },
                    'filter':
                    filters or "pass",
                    'gq':
                    hit["%s_gq" % encoded_individual_id] if
                    ("%s_gq" % encoded_individual_id in hit
                     and hit["%s_gq" % encoded_individual_id] is not None) else
                    '',
                    'num_alt':
                    num_alt,
                }

            if all([num_alt <= 0 for num_alt in all_num_alt]):
                #logger.info("Filtered out due to genotype: " + str(genotypes))
                #print("Filtered all_num_alt <= 0 - Result %s: GRCh38: %s:%s,  cadd: %s  %s - %s" % (i, hit["contig"], hit["start"], hit["cadd_PHRED"] if "cadd_PHRED" in hit else "", hit["transcriptConsequenceTerms"], all_num_alt))
                continue

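            # sortedTranscriptConsequences is stored as a JSON string; parse it to get the per-transcript VEP annotations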
            vep_annotation = json.loads(
                str(hit['sortedTranscriptConsequences'])
            ) if 'sortedTranscriptConsequences' in hit else None

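            # Lift the variant over to the other genome build so both GRCh37 and GRCh38 coordinates can be reported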
            if project.genome_version == GENOME_VERSION_GRCh37:
                grch38_coord = None
                if self.liftover_grch37_to_grch38:
                    grch38_coord = self.liftover_grch37_to_grch38.convert_coordinate(
                        "chr%s" % hit["contig"].replace("chr", ""),
                        int(hit["start"]))
                    if grch38_coord and grch38_coord[0]:
                        grch38_coord = "%s-%s-%s-%s " % (
                            grch38_coord[0][0], grch38_coord[0][1], hit["ref"],
                            hit["alt"])
                    else:
                        grch38_coord = None
            else:
                grch38_coord = hit["variantId"]

            if project.genome_version == GENOME_VERSION_GRCh38:
                grch37_coord = None
                if self.liftover_grch38_to_grch37:
                    grch37_coord = self.liftover_grch38_to_grch37.convert_coordinate(
                        "chr%s" % hit["contig"].replace("chr", ""),
                        int(hit["start"]))
                    if grch37_coord and grch37_coord[0]:
                        grch37_coord = "%s-%s-%s-%s " % (
                            grch37_coord[0][0], grch37_coord[0][1], hit["ref"],
                            hit["alt"])
                    else:
                        grch37_coord = None
            else:
                grch37_coord = hit["variantId"]

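            # Assemble the result dict for this variant: annotations, population counts, frequencies, genotypes, and liftover coordinates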
            result = {
                #u'_id': ObjectId('596d2207ff66f729285ca588'),
                'alt':
                str(hit["alt"]) if "alt" in hit else None,
                'annotation': {
                    'fathmm':
                    fathmm_map.get(hit["dbnsfp_FATHMM_pred"].split(';')[0])
                    if "dbnsfp_FATHMM_pred" in hit
                    and hit["dbnsfp_FATHMM_pred"] else None,
                    'muttaster':
                    muttaster_map.get(
                        hit["dbnsfp_MutationTaster_pred"].split(';')[0])
                    if "dbnsfp_MutationTaster_pred" in hit
                    and hit["dbnsfp_MutationTaster_pred"] else None,
                    'polyphen':
                    polyphen_map.get(
                        hit["dbnsfp_Polyphen2_HVAR_pred"].split(';')[0])
                    if "dbnsfp_Polyphen2_HVAR_pred" in hit
                    and hit["dbnsfp_Polyphen2_HVAR_pred"] else None,
                    'sift':
                    sift_map.get(hit["dbnsfp_SIFT_pred"].split(';')[0])
                    if "dbnsfp_SIFT_pred" in hit and hit["dbnsfp_SIFT_pred"]
                    else None,
                    'GERP_RS':
                    hit["dbnsfp_GERP_RS"] if "dbnsfp_GERP_RS" in hit else None,
                    'phastCons100way_vertebrate':
                    hit["dbnsfp_phastCons100way_vertebrate"]
                    if "dbnsfp_phastCons100way_vertebrate" in hit else None,
                    'cadd_phred':
                    hit["cadd_PHRED"] if "cadd_PHRED" in hit else None,
                    'dann_score':
                    hit["dbnsfp_DANN_score"]
                    if "dbnsfp_DANN_score" in hit else None,
                    'revel_score':
                    hit["dbnsfp_REVEL_score"]
                    if "dbnsfp_REVEL_score" in hit else None,
                    'eigen_phred':
                    hit["eigen_Eigen_phred"] if "eigen_Eigen_phred" in hit else
                    (hit["dbnsfp_Eigen_phred"]
                     if "dbnsfp_Eigen_phred" in hit else None),
                    'mpc_score':
                    hit["mpc_MPC"] if "mpc_MPC" in hit else None,
                    'annotation_tags':
                    list(hit["transcriptConsequenceTerms"] or [])
                    if "transcriptConsequenceTerms" in hit else None,
                    'coding_gene_ids':
                    list(hit['codingGeneIds'] or []),
                    'gene_ids':
                    list(hit['geneIds'] or []),
                    'vep_annotation':
                    vep_annotation,
                    'vep_group':
                    str(hit['mainTranscript_major_consequence'] or ""),
                    'vep_consequence':
                    str(hit['mainTranscript_major_consequence'] or ""),
                    'main_transcript': {
                        k.replace('mainTranscript_', ''): hit[k]
                        for k in dir(hit) if k.startswith('mainTranscript_')
                    },
                    'worst_vep_annotation_index':
                    0,
                    'worst_vep_index_per_gene': {
                        str(hit['mainTranscript_gene_id']): 0
                    },
                },
                'chr':
                hit["contig"],
                'coding_gene_ids':
                list(hit['codingGeneIds'] or []),
                'gene_ids':
                list(hit['geneIds'] or []),
                'coverage': {
                    'gnomad_exome_coverage':
                    float(hit["gnomad_exome_coverage"] or -1)
                    if "gnomad_exome_coverage" in hit else -1,
                    'gnomad_genome_coverage':
                    float(hit["gnomad_genome_coverage"] or -1)
                    if "gnomad_genome_coverage" in hit else -1,
                },
                'pop_counts': {
                    'AC':
                    int(hit['AC'] or 0) if 'AC' in hit else None,
                    'AN':
                    int(hit['AN'] or 0) if 'AN' in hit else None,
                    '1kg_AC':
                    int(hit['g1k_AC'] or 0) if 'g1k_AC' in hit else None,
                    '1kg_AN':
                    int(hit['g1k_AN'] or 0) if 'g1k_AN' in hit else None,
                    'exac_v3_AC':
                    int(hit["exac_AC_Adj"] or 0)
                    if "exac_Adj_AC" in hit else None,
                    'exac_v3_Het':
                    int(hit["exac_AC_Het"] or 0)
                    if "exac_AC_Het" in hit else None,
                    'exac_v3_Hom':
                    int(hit["exac_AC_Hom"] or 0)
                    if "exac_AC_Hom" in hit else None,
                    'exac_v3_Hemi':
                    int(hit["exac_AC_Hemi"] or 0)
                    if "exac_AC_Hemi" in hit else None,
                    'gnomad_exomes_AC':
                    int(hit["gnomad_exomes_AC"] or 0)
                    if "gnomad_exomes_AC" in hit else None,
                    'gnomad_exomes_Hom':
                    int(hit["gnomad_exomes_Hom"] or 0)
                    if "gnomad_exomes_Hom" in hit else None,
                    'gnomad_exomes_Hemi':
                    int(hit["gnomad_exomes_Hemi"] or 0)
                    if "gnomad_exomes_Hemi" in hit else None,
                    'gnomad_exomes_AN':
                    int(hit["gnomad_exomes_AN"] or 0)
                    if "gnomad_exomes_AN" in hit else None,
                    'gnomad_genomes_AC':
                    int(hit["gnomad_genomes_AC"] or 0)
                    if "gnomad_genomes_AC" in hit else None,
                    'gnomad_genomes_Hom':
                    int(hit["gnomad_genomes_Hom"] or 0)
                    if "gnomad_genomes_Hom" in hit else None,
                    'gnomad_genomes_Hemi':
                    int(hit["gnomad_genomes_Hemi"] or 0)
                    if "gnomad_genomes_Hemi" in hit else None,
                    'gnomad_genomes_AN':
                    int(hit["gnomad_genomes_AN"] or 0)
                    if "gnomad_genomes_AN" in hit else None,
                    'topmed_AC':
                    float(hit["topmed_AC"] or 0)
                    if "topmed_AC" in hit else None,
                    'topmed_Het':
                    float(hit["topmed_Het"] or 0)
                    if "topmed_Het" in hit else None,
                    'topmed_Hom':
                    float(hit["topmed_Hom"] or 0)
                    if "topmed_Hom" in hit else None,
                    'topmed_AN':
                    float(hit["topmed_AN"] or 0)
                    if "topmed_AN" in hit else None,
                },
                'db_freqs': {
                    'AF':
                    float(hit["AF"] or 0.0) if "AF" in hit else None,
                    '1kg_wgs_AF':
                    float(hit["g1k_AF"] or 0.0) if "g1k_AF" in hit else None,
                    '1kg_wgs_popmax_AF':
                    float(hit["g1k_POPMAX_AF"] or 0.0)
                    if "g1k_POPMAX_AF" in hit else None,
                    'exac_v3_AF':
                    float(hit["exac_AF"] or 0.0) if "exac_AF" in hit else
                    (hit["exac_AC_Adj"] / float(hit["exac_AN_Adj"])
                     if "exac_AC_Adj" in hit and "exac_AN_Adj" in hit
                     and int(hit["exac_AN_Adj"] or 0) > 0 else None),
                    'exac_v3_popmax_AF':
                    float(hit["exac_AF_POPMAX"] or 0.0)
                    if "exac_AF_POPMAX" in hit else None,
                    'gnomad_exomes_AF':
                    float(hit["gnomad_exomes_AF"] or 0.0)
                    if "gnomad_exomes_AF" in hit else None,
                    'gnomad_exomes_popmax_AF':
                    float(hit["gnomad_exomes_AF_POPMAX"] or 0.0)
                    if "gnomad_exomes_AF_POPMAX" in hit else None,
                    'gnomad_genomes_AF':
                    float(hit["gnomad_genomes_AF"] or 0.0)
                    if "gnomad_genomes_AF" in hit else None,
                    'gnomad_genomes_popmax_AF':
                    float(hit["gnomad_genomes_AF_POPMAX"] or 0.0)
                    if "gnomad_genomes_AF_POPMAX" in hit else None,
                    'topmed_AF':
                    float(hit["topmed_AF"] or 0.0)
                    if "topmed_AF" in hit else None,
                },
                #'popmax_populations': {
                #    'exac_popmax': hit["exac_POPMAX"] or None,
                #    'gnomad_exomes_popmax': hit["gnomad_exomes_POPMAX"] or None,
                #    'gnomad_genomes_popmax': hit["gnomad_genomes_POPMAX"] or None,
                #},
                'db_gene_ids':
                list((hit["geneIds"] or []) if "geneIds" in hit else []),
                'db_tags':
                str(hit["transcriptConsequenceTerms"] or "")
                if "transcriptConsequenceTerms" in hit else None,
                'extras': {
                    'clinvar_variant_id':
                    hit['clinvar_variation_id']
                    if 'clinvar_variation_id' in hit
                    and hit['clinvar_variation_id'] else None,
                    'clinvar_allele_id':
                    hit['clinvar_allele_id'] if 'clinvar_allele_id' in hit
                    and hit['clinvar_allele_id'] else None,
                    'clinvar_clinsig':
                    hit['clinvar_clinical_significance'].lower() if
                    ('clinvar_clinical_significance' in hit)
                    and hit['clinvar_clinical_significance'] else None,
                    'hgmd_class':
                    hit['hgmd_class'] if 'hgmd_class' in hit and user
                    and user.is_staff else None,
                    'hgmd_accession':
                    hit['hgmd_accession'] if 'hgmd_accession' in hit else None,
                    'genome_version':
                    project.genome_version,
                    'grch37_coords':
                    grch37_coord,
                    'grch38_coords':
                    grch38_coord,
                    'alt_allele_pos':
                    0,
                    'orig_alt_alleles':
                    map(str,
                        [a.split("-")[-1] for a in hit["originalAltAlleles"]])
                    if "originalAltAlleles" in hit else None
                },
                'genotypes':
                genotypes,
                'pos':
                long(hit['start']),
                'pos_end':
                str(hit['end']),
                'ref':
                str(hit['ref']),
                'vartype':
                'snp' if len(hit['ref']) == len(hit['alt']) else "indel",
                'vcf_id':
                None,
                'xpos':
                long(hit["xpos"]),
                'xposx':
                long(hit["xpos"]),
            }

            result["annotation"]["freqs"] = result["db_freqs"]
            result["annotation"]["pop_counts"] = result["pop_counts"]
            result["annotation"]["db"] = "elasticsearch"

            result["extras"][
                "svlen"] = hit["SVLEN"] if "SVLEN" in hit else None
            result["extras"][
                "svtype"] = hit["SVTYPE"] if "SVTYPE" in hit else None

            logger.info(
                "Result %s: GRCh37: %s GRCh38: %s:,  cadd: %s  %s - gene ids: %s, coding gene_ids: %s"
                % (i, grch37_coord, grch38_coord,
                   hit["cadd_PHRED"] if "cadd_PHRED" in hit else "",
                   hit["transcriptConsequenceTerms"], result["gene_ids"],
                   result["coding_gene_ids"]))

            result["extras"]["project_id"] = project_id
            result["extras"]["family_id"] = family_id

            # add gene info
            gene_names = {}
            if vep_annotation is not None:
                gene_names = {
                    vep_anno["gene_id"]: vep_anno.get("gene_symbol")
                    for vep_anno in vep_annotation
                    if vep_anno.get("gene_symbol")
                }
            result["extras"]["gene_names"] = gene_names

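            # Look up gene summaries from the reference, preferring coding genes and falling back to all overlapping gene ids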
            try:
                genes = {}
                for gene_id in result["coding_gene_ids"]:
                    if gene_id:
                        genes[gene_id] = reference.get_gene_summary(
                            gene_id) or {}

                if not genes:
                    for gene_id in result["gene_ids"]:
                        if gene_id:
                            genes[gene_id] = reference.get_gene_summary(
                                gene_id) or {}

                #if not genes:
                #    genes =  {vep_anno["gene_id"]: {"symbol": vep_anno["gene_symbol"]} for vep_anno in vep_annotation}

                result["extras"]["genes"] = genes
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                logger.warn(
                    "WARNING: got unexpected error in add_gene_names_to_variants: %s : line %s"
                    % (e, exc_tb.tb_lineno))

            variant_results.append(result)

        logger.info("Finished returning the %s variants: %s seconds" %
                    (response.hits.total, time.time() - start))

        if self._redis_client:
            self._redis_client.set(cache_key, json.dumps(variant_results))

        return [
            Variant.fromJSON(variant_json) for variant_json in variant_results
        ]
Example #28
0
#     es.indices.delete(raw_index_name)

# subprocess.run(['p2o.py', '--enrich', '--index', raw_index_name,
#       '--index-enrich', enrich_index_name, '-e', 'http://localhost:9200/',
#       '--no_inc', '--debug', 'github', 'grimoirelab' , 'perceval',
#       '-t', github_token, '--sleep-for-rate'])

response = es.search(index=enrich_index_name)
number_of_items = response['hits']['total']
print("Total number of items in the enriched index: %s" % number_of_items)
response = es.search(index=enrich_index_name, body={"size": number_of_items})

# for i in response['hits']['hits']:
#     pprint(i['_source'])

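# Build a DSL search over the enriched index for issues created in the last 6 months,
# keeping only the timing fields and identifiers used downstream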
request = elasticsearch_dsl.Search(using=es, index=enrich_index_name)
request = request.source([
    'created_at', 'closed_at', 'time_open_days', 'time_to_close_days',
    'item_type', 'id_in_repo'
])
request = request.filter("terms", item_type=['issue'])
request = request.filter('range', created_at={'gte': 'now-6M'})
request = request.sort({'created_at': {'order': 'asc'}})
request = request[0:10000]
result = request.execute()

# pprint(request.to_dict())
# pprint(result.to_dict())

result = result.to_dict()
data = []
Example #29
0
import elasticsearch
import elasticsearch_dsl
# get the last commits

es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Build a DSL search object on the `commits` index, `summary` document type
request = elasticsearch_dsl.Search(using=es,
                                   index='commits',
                                   doc_type='summary')
request = request.sort('-commit_date')
request = request.source(['hash', 'author_date', 'author'])
request = request[0:20]

# run the search with execute(), which honors the [0:20] slice and preserves the sort order
# (scan() would return all results but ignores slicing and ordering)
response = request.execute()
for commit in response:
    print(commit.hash, commit.author_date, commit.author)
Example #30
0
import time

import elasticsearch
import elasticsearch_dsl
import yaml

# Import config
with open('./config/config.yml', 'r') as ymlconfig:
    theconfig = yaml.safe_load(ymlconfig)

# Setup timer for loop
starttime = time.time()

# Elastic, parameters should be passed in from config
eshttpauth = (theconfig['esusername'] + ':' + theconfig['espassword'])
es = elasticsearch.Elasticsearch(hosts=[theconfig['esinstance']],
                                 http_auth=eshttpauth,
                                 timeout=10,
                                 max_retries=3,
                                 retry_on_timeout=True)
s = elasticsearch_dsl.Search(using=es, index=theconfig['esindexsearch'])
#q = Q('bool', must=[Q('range', **{'@timestamp': {'gte': "now-" + theconfig['essearchwindow']}}) & ('match', clientid=theconfig['esclientid'])])
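# Restrict to the configured search window; when a client id is configured, also filter on it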
if theconfig['esclientid'] == '':
    s = s.filter(
        'range',
        **{'@timestamp': {
            'gte': "now-" + theconfig['essearchwindow']
        }})
else:
    s = s.filter(
        'range', **{
            '@timestamp': {
                'gte': "now-" + theconfig['essearchwindow']
            }
        }).filter('match', clientid=theconfig['esclientid'])
s = s[0:0]