コード例 #1
0
    def filter_queryset(self, request, queryset, view):
        ''' Fetch only the requested page of documents from elastic.

        Query-string parameters whose names appear in the view's
        ``filter_fields`` are passed to ``self._build_filters``; the page
        size and offset are taken from the view's paginator. The total hit
        count is stored on ``view.es_count`` and the hits are returned as a
        list of ElasticObject instances (``queryset`` itself is not used).
        '''
        page_size = view.paginator.get_limit(request)
        page_offset = view.paginator.get_offset(request)

        allowed = getattr(view, 'filter_fields', [])
        requested = {name: value for name, value in request.GET.items() if name in allowed}
        search_filters = self._build_filters(filters=requested)

        # Without any filter fall back to a plain match-all query.
        if search_filters is None:
            es_query = ElasticQuery(Query.match_all())
        else:
            es_query = ElasticQuery.filtered(Query.match_all(), search_filters)

        search = Search(search_query=es_query, idx=view.idx,
                        size=page_size, search_from=page_offset)
        response = search.get_json_response()

        documents = []
        for hit in response['hits']['hits']:
            document = ElasticObject(initial=hit['_source'])
            document.uuid = hit['_id']
            documents.append(document)

        view.es_count = response['hits']['total']
        return documents
コード例 #2
0
ファイル: criteria.py プロジェクト: D-I-L/django-criteria
    def get_elastic_query(cls, section=None, config=None):
        ''' Build the elastic query object for a criteria section.
        @type  section: string
        @keyword section: The section in the criteria.ini file
        @type  config:  mapping
        @keyword config: The config object initialized from criteria.ini.
        @return: L{Query}, or None when the section is not one of the
                 special cases and defines no source_fields.
        '''
        section_config = config[section]

        source_fields = []
        if 'source_fields' in section_config:
            source_fields = section_config['source_fields'].split(',')

        if 'mhc' in section:
            # Defined MHC region as chr6:25,000,000..35,000,000
            seqid, start_range, end_range = '6', 25000000, 35000000

            seqid_param = section_config['seqid_param']
            start_param = section_config['start_param']
            end_param = section_config['end_param']

        if section == 'is_gene_in_mhc':
            # genes are matched with a range-overlap query on the region
            return ElasticUtils.range_overlap_query(seqid, start_range, end_range,
                                                    source_fields,
                                                    seqid_param,
                                                    start_param,
                                                    end_param)

        if section == 'is_marker_in_mhc':
            # markers only have a start position, so bound that on both sides
            query_bool = BoolQuery()
            query_bool.must(RangeQuery("start", lte=end_range)) \
                      .must(RangeQuery("start", gte=start_range)) \
                      .must(Query.term("seqid", seqid))
            return ElasticQuery.filtered_bool(Query.match_all(), query_bool,
                                              sources=["id", "seqid", "start"])

        if section == 'is_region_in_mhc':
            return ElasticQuery(Query.term("region_name", "MHC"))

        if section == 'marker_is_gwas_significant_in_ic':
            # range query on the genome-wide significance threshold
            gw_sig_p = 0.00000005
            return ElasticQuery(RangeQuery("p_value", lte=gw_sig_p))

        if source_fields:
            return ElasticQuery(Query.match_all(), sources=source_fields)

        return None
コード例 #3
0
    def test_region_attributes(self):
        ''' Test that the ids held on a random region doc are all found in their own index. '''
        region_idx = ElasticSettings.idx(RegionDataTest.IDX_KEY, 'REGION')
        (idx, idx_type) = region_idx.split('/')
        docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
        region = utils.Region.pad_region_doc(docs[0])

        # (region attribute, index key and index type key, assertion message)
        checks = (
            ("genes", 'GENE', "All genes on region found in GENE index"),
            ("studies", 'STUDY', "All study ids for region found in STUDY index"),
            ("pmids", 'PUBLICATION', "All PMIDs for region found in PUBLICATION index"),
        )
        for attr_name, idx_key, message in checks:
            feature_ids = getattr(region, attr_name)
            if len(feature_ids) == 0:
                continue
            id_query = ElasticQuery(Query.ids(feature_ids))
            found = Search(id_query, idx=ElasticSettings.idx(idx_key, idx_key),
                           size=len(feature_ids)).search()
            self.assertEqual(len(feature_ids), found.hits_total, message)
コード例 #4
0
 def test_error(self):
     ''' Test that invalid function score arguments raise a QueryError. '''
     score_function = ScoreFunction.create_score_function('field_value_factor', field='start')

     # first argument is not a query
     with self.assertRaises(QueryError):
         FunctionScoreQuery('test_not_query', [score_function])

     # list entry is not a score function
     match_all = Query.match_all()
     with self.assertRaises(QueryError):
         FunctionScoreQuery(match_all, ['test_not_function_score'])

     # bad function name, bad keyword and bad field value
     with self.assertRaises(QueryError):
         ScoreFunction.create_score_function('blah')
     with self.assertRaises(QueryError):
         ScoreFunction.create_score_function('field_value_factor', random_scoress='val')
     with self.assertRaises(QueryError):
         ScoreFunction.create_score_function('field_value_factor', field=10)
コード例 #5
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def _check_gene_history(cls, gene_sets, config):
        ''' Look up the given gene ids in the gene history index.

        Scrolls over every history document whose ``discontinued_geneid`` is
        in ``gene_sets`` and returns a tuple of
        (dict of discontinued id -> replacement id,
         list of discontinued ids that have no replacement),
        with all ids converted to strings.

        TODO: find a way to handle this better.
        '''
        history_section = config['GENE_HISTORY']
        replaced_by = {}
        without_replacement = []

        def process_hits(resp_json):
            # called by scan_and_scroll with each page of the response
            history_docs = [Document(hit) for hit in resp_json['hits']['hits']]
            for history_doc in history_docs:
                current_id = getattr(history_doc, 'geneid')
                old_id = getattr(history_doc, 'discontinued_geneid')
                if current_id is not None:
                    replaced_by[str(old_id)] = str(current_id)
                else:
                    without_replacement.append(str(old_id))

        history_query = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
            sources=['geneid', 'discontinued_geneid'])
        ScanAndScroll.scan_and_scroll(history_section['index'],
                                      idx_type=history_section['index_type'],
                                      call_fun=process_hits, query=history_query)

        return (replaced_by, without_replacement)
コード例 #6
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def gene_mgi_parse(cls, gene_pubs, idx):
        ''' Parse Ensembl and MGI data from JAX.

        Each tab-separated line must carry an ``MGI:`` accession in column 0
        and an ``ENSMUSG`` id in column 5, otherwise a PipelineError is
        raised. Documents whose mouse ortholog ensembl id matches are then
        bulk-updated (in chunks) with the MGI id.
        '''
        mgi_by_ensmusg = {}
        for line in gene_pubs:
            columns = line.split('\t')
            mgi_acc = columns[0]
            if 'MGI:' not in mgi_acc:
                raise PipelineError('MGI not found '+mgi_acc)
            ensmusg = columns[5]
            if 'ENSMUSG' not in ensmusg:
                raise PipelineError('ENSMUSG not found '+ensmusg)
            mgi_by_ensmusg[ensmusg] = mgi_acc.replace('MGI:', '')

        ensmusg_ids = list(mgi_by_ensmusg)
        chunk_size = 450
        for offset in range(0, len(ensmusg_ids), chunk_size):
            id_chunk = ensmusg_ids[offset:offset+chunk_size]
            query = ElasticQuery.filtered(
                Query.match_all(),
                TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", id_chunk))

            # action line + partial document line for every matching doc
            bulk_lines = []
            for doc in Search(query, idx=idx, size=chunk_size).search().docs:
                ens_id = doc.doc_id()
                idx_type = doc.type()
                mm = getattr(doc, 'dbxrefs')['orthologs']['mmusculus']
                mm['MGI'] = mgi_by_ensmusg[mm['ensembl']]
                action = {"update": {"_id": ens_id, "_type": idx_type,
                                     "_index": idx, "_retry_on_conflict": 3}}
                partial_doc = {'doc': {"dbxrefs": {'orthologs': {"mmusculus": mm}}}}
                bulk_lines.append(json.dumps(action) + '\n')
                bulk_lines.append(json.dumps(partial_doc) + '\n')

            # idx_type is that of the last document seen in this chunk
            if bulk_lines:
                Loader().bulk_load(idx, idx_type, ''.join(bulk_lines))
コード例 #7
0
    def test_bulk(self):
        ''' Test the Bulk.load().

        Loads one marker document, first without and then with the trailing
        line return the bulk payload needs, and checks the index count grows
        by one. Then appends delete/update actions for a document id that
        does not exist and checks that the response reports errors.
        '''
        self.set_up()
        idx = IDX['MARKER']['indexName']
        elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
        hits_total1 = elastic.get_count()['count']

        json_data = '{"index": {"_index": "%s", "_type": "%s"}}\n' % \
                    (idx, 'marker')
        json_data += json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                                 "ref": "A", "id": "rsXXXXX", "qual": ".", "info": "RS=XXXXX"})
        resp = Bulk.load(idx, '', json_data)
        # assertNotEqual/assertEqual replace the deprecated assertNotEquals/
        # assertEquals aliases, which were removed in Python 3.12.
        self.assertNotEqual(resp.status_code, 200)

        # note: needs a trailing line return to work
        Bulk.load(idx, '', json_data + '\n')
        Search.index_refresh(idx)
        hits_total2 = elastic.get_count()['count']
        self.assertEqual(hits_total2, hits_total1+1, "contains documents")

        # produce errors updating doc id that doesn't exist
        json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
                     (idx, 'marker', 'XYZ')
        json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
                     (idx, 'marker', 'XYZ')
        json_data += '{"doc": {"start": 100, "end": 200}}\n'
        resp = Bulk.load(idx, '', json_data)
        self.assertTrue('errors' in resp.json() and resp.json()['errors'])
コード例 #8
0
    def filter_queryset(self, request, queryset, view):
        ''' Fetch only the requested page of criteria documents from elastic.

        The ``feature_type`` query-string parameter (when it is listed in the
        view's ``filter_fields``) selects the criteria index via
        ``self._get_index``; it defaults to 'GENE_CRITERIA'. The page size and
        offset come from the view's paginator. The total hit count is stored
        on ``view.es_count`` and the hits are returned as a list of
        ElasticObject instances carrying ``uuid`` and ``criteria_type``.
        '''
        q_size = view.paginator.get_limit(request)
        q_from = view.paginator.get_offset(request)

        filterable = getattr(view, 'filter_fields', [])
        filters = {k: v for k, v in request.GET.items() if k in filterable}
        criteria_idx = self._get_index(filters.get('feature_type', 'GENE_CRITERIA'))

        # Several index keys are searched together as a comma separated list.
        if isinstance(criteria_idx, list):
            idx = ','.join(ElasticSettings.idx(name) for name in criteria_idx)
        else:
            idx = ElasticSettings.idx(criteria_idx)

        q = ElasticQuery(Query.match_all())
        s = Search(search_query=q, idx=idx, size=q_size, search_from=q_from)
        json_results = s.get_json_response()
        results = []
        for result in json_results['hits']['hits']:
            new_obj = ElasticObject(initial=result['_source'])
            new_obj.uuid = result['_id']
            new_obj.criteria_type = result['_type']
            results.append(new_obj)
        view.es_count = json_results['hits']['total']
        return results
コード例 #9
0
 def test_missing_terms_filtered_query(self):
     ''' Test a filtered query that uses a missing terms filter. '''
     missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
     filtered_query = ElasticQuery.filtered(Query.match_all(), missing_filter)
     search = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT'))
     public_docs = search.search().docs
     self.assertTrue(len(public_docs) == 3, "Elastic string query retrieved all public docs")
コード例 #10
0
    def get_rdm_feature_id(cls, idx, idx_type, qbool=Query.match_all(), sources=[], field=None):
        ''' Get a random feature id from the indices.

        Returns the value of ``field`` on the random document when a field
        is given, otherwise the document id.
        '''
        random_docs = cls.get_rdm_docs(idx, idx_type, qbool=qbool, sources=sources, size=1)
        random_doc = random_docs[0]
        return random_doc.doc_id() if field is None else getattr(random_doc, field)
コード例 #11
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def _ensembl_entrez_lookup(cls, ensembl_gene_sets, section):
        ''' Get an ensembl:entrez id dictionary.

        Keys are the ids of the documents matching ``ensembl_gene_sets`` on
        ``dbxrefs.ensembl``; values are their ``dbxrefs.entrez`` entries.
        '''
        ensembl_filter = TermsFilter.get_terms_filter("dbxrefs.ensembl", ensembl_gene_sets)
        lookup_query = ElasticQuery.filtered(Query.match_all(), ensembl_filter,
                                             sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
        search = Search(lookup_query, idx=section['index'], size=len(ensembl_gene_sets))

        entrez_by_doc_id = {}
        for gene_doc in search.search().docs:
            doc_id = gene_doc.doc_id()
            entrez_by_doc_id[doc_id] = getattr(gene_doc, 'dbxrefs')['entrez']
        return entrez_by_doc_id
コード例 #12
0
 def test_doc(self):
     ''' Test that a search on the gene index returns GeneDocument objects. '''
     gene_settings = PydginTestSettings.IDX['GENE']
     idx_name = gene_settings['indexName']
     type_name = gene_settings['indexType']
     symbol_query = ElasticQuery(Query.match_all(), sources=['symbol'])
     result = Search(search_query=symbol_query, idx=idx_name, idx_type=type_name, size=2).search()
     for gene_doc in result.docs:
         self.assertTrue(isinstance(gene_doc, GeneDocument))
コード例 #13
0
    def test_hit_attributes(self):
        ''' Fetch one random document for every region index type key. '''
        for type_key in RegionDataTest.IDX_TYPE_KEYS:
            idx_path = ElasticSettings.idx(RegionDataTest.IDX_KEY, type_key)
            # the settings return "<index>/<type>"
            idx, idx_type = idx_path.split('/')

            docs = ElasticUtils.get_rdm_docs(idx, idx_type,
                                             qbool=Query.match_all(),
                                             sources=[],
                                             size=1)
コード例 #14
0
        def check_hits(resp_json):
            ''' Compare the positions of the documents in a response with the MARKER index.

            Each hit is wrapped in a Document and keyed by its "id" attribute.
            The same ids are then looked up in the MARKER index and, for SNVs
            whose "start" differs by more than one from the position given by
            self._get_highest_build(), an error is logged listing the markers
            found at that position and any matching history records.
            '''
            rsids = {}
            docs = [Document(hit) for hit in resp_json['hits']['hits']]
            for doc in docs:
                rsid = getattr(doc, "id")
                if rsid is not None:
                    rsids[rsid] = doc
            rsids_keys = list(rsids.keys())
            # fetch the marker documents with the same ids
            terms_filter = TermsFilter.get_terms_filter("id", rsids_keys)
            query = ElasticQuery.filtered(Query.match_all(), terms_filter)
            elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=len(rsids_keys))
            docs_by_rsid = elastic.search().docs
            for doc in docs_by_rsid:
                info = getattr(doc, "info")
                # only markers flagged as SNVs are compared
                if 'VC=SNV' not in info:
                    continue
                rsid = getattr(doc, "id")
                ic_doc = rsids[rsid]
                pos1 = getattr(doc, "start")
                pos2 = self._get_highest_build(ic_doc)['position']
                # a difference of one between the two positions is tolerated
                if abs(int(pos1) - int(pos2)) > 1:
                    is_par = getattr(ic_doc, 'is_par')
                    allele_a = getattr(ic_doc, 'allele_a')
                    # skip documents with is_par set or a D/I allele
                    if is_par is None and not (allele_a == 'D' or allele_a == 'I'):
                        msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
                               ' '+str(pos2)+" "+rsid+' '+str(pos1))
#                                ' ('+ic_doc.doc_id()+' '+json.dumps(getattr(ic_doc, 'build_info'))+')'

                        # add any markers found at the other position on the same seqid
                        query = ElasticQuery.filtered(Query.term("seqid", getattr(doc, 'seqid')),
                                                      Filter(Query.term("start", pos2)))
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                        docs_by_pos = elastic.search().docs
                        if len(docs_by_pos) > 0:
                            for d in docs_by_pos:
                                msg += " ("+getattr(d, "id")+":"+str(getattr(d, "start"))+")"

                        # add any history records whose "rslow" is this rsid
                        query = ElasticQuery.filtered(Query.match_all(), Filter(Query.term("rslow", rsid)))
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'HISTORY'))
                        docs_by_pos = elastic.search().docs
                        if len(docs_by_pos) > 0:
                            for d in docs_by_pos:
                                msg += " (rshigh:"+str(getattr(d, "rshigh")) + \
                                       " build_id:"+str(getattr(d, "build_id"))+")"

                        logger.error(msg)
コード例 #15
0
 def test_doc2(self):
     ''' Test that a multiple index search returns the correct type of FeatureDocument. '''
     index_names = (PydginTestSettings.IDX['GENE']['indexName'],
                    PydginTestSettings.IDX['DISEASE']['indexName'])
     multi_idx = index_names[0] + ',' + index_names[1]
     fields_query = ElasticQuery(Query.match_all(), sources=['symbol', 'code'])
     result = Search(search_query=fields_query, idx=multi_idx, size=40).search()
     for feature_doc in result.docs:
         self.assertTrue(isinstance(feature_doc, (GeneDocument, DiseaseDocument)))
         # disease documents must expose their code
         if isinstance(feature_doc, DiseaseDocument):
             self.assertTrue(hasattr(feature_doc, 'code'))
コード例 #16
0
 def test_bool_filtered_query(self):
     ''' Test building and running a filtered boolean query. '''
     bool_filter = BoolQuery(must_not_arr=[Query.term("seqid", 2)],
                             should_arr=[RangeQuery("start", gte=10050)])
     # further clauses are chained onto the constructor arguments
     bool_filter.must([Query.term("id", "rs768019142")]) \
                .should(RangeQuery("start", gte=10054))
     filtered = ElasticQuery.filtered_bool(Query.match_all(), bool_filter, sources=["id", "seqid"])
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     hits_total = search.search().hits_total
     self.assertTrue(hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
コード例 #17
0
 def test_sort_query(self):
     ''' Test sorting a query using a string and a dictionary sort definition. '''
     match_all_query = ElasticQuery(Query.match_all())

     # sort given as a string
     string_sorted = Search(match_all_query, idx=ElasticSettings.idx('DEFAULT'),
                            qsort=Sort('start:asc,_score'))
     self._check_sort_order(string_sorted.search().docs)

     # sort given as a dictionary
     dict_sort = Sort({"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
     dict_sorted = Search(match_all_query, idx=ElasticSettings.idx('DEFAULT'), qsort=dict_sort)
     self._check_sort_order(dict_sorted.search().docs)

     # any other type is rejected
     with self.assertRaises(QueryError):
         Sort(1)
コード例 #18
0
    def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
        ''' Get an entrez:ensembl id dictionary.

        The gene ids are first passed through the gene history check and
        Gene._replace_oldids_with_newids; the result maps each matching
        document's ``dbxrefs.entrez`` entry to its document id.
        '''
        history = Gene._check_gene_history(gene_sets, config)
        (newgene_ids, discontinued_ids) = history
        current_ids = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)

        entrez_filter = TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids)
        lookup_query = ElasticQuery.filtered(Query.match_all(), entrez_filter,
                                             sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
        search = Search(lookup_query, idx=section['index'], size=len(current_ids))

        doc_id_by_entrez = {}
        for gene_doc in search.search().docs:
            entrez_id = getattr(gene_doc, 'dbxrefs')['entrez']
            doc_id_by_entrez[entrez_id] = gene_doc.doc_id()
        return doc_id_by_entrez
コード例 #19
0
    def test_gene_attributes(self):
        ''' Fetch random genes from elastic and compare them with the results
        fetched via the ensembl restful query.

        When the ensembl web service returns no data for a gene, a warning is
        logged and nothing is asserted for that gene.
        '''
        idx_key = 'GENE'
        idx_type_key = 'GENE'

        idx = ElasticSettings.idx(idx_key, idx_type_key)
        (idx, idx_type) = idx.split('/')

        docs_by_geneid = DataIntegrityUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)

        # Compiled once, outside the loop. Raw strings keep \w and \d from
        # being read as (invalid) string escape sequences.
        # index name example: genes_hg38_v0.0.2
        index_assembly_pattern = re.compile(r'genes_\w\w(\d+)', re.IGNORECASE)
        ensembl_assembly_pattern = re.compile(r'GRCh(\d+)', re.IGNORECASE)

        # "_source":{"symbol": "RP11-376M2.2", "start": 42975689, "biotype": "sense_intronic", "chromosome": "17",
        # "source": "havana", "strand": "-", "stop": 42977275}
        for doc in docs_by_geneid:
            gene_id_pipeline = doc.doc_id()
            index_pipeline = doc.index()
            start_pipeline = getattr(doc, "start")
            stop_pipeline = getattr(doc, "stop")
            chromosome_pipeline = getattr(doc, "chromosome")

            biotype_pipeline = getattr(doc, "biotype")
            strand_pipeline = getattr(doc, "strand")
            # the index stores the strand as '-' or '+', ensembl as -1 or 1
            strand_pipeline = -1 if strand_pipeline == '-' else 1
            symbol_pipeline = getattr(doc, "symbol")
            source_pipeline = getattr(doc, "source")

            match = index_assembly_pattern.match(index_pipeline)
            assembly_number_pipeline = None
            if match:
                assembly_number_pipeline = match.group(1)

            ensembl_gene_data = DataIntegrityUtils.fetch_from_ensembl(gene_id_pipeline)

            if ensembl_gene_data:
                match = ensembl_assembly_pattern.match(ensembl_gene_data['assembly_name'])

                assembly_number_ens = None
                if match:
                    assembly_number_ens = match.group(1)

                self.assertEqual(assembly_number_pipeline, assembly_number_ens, "Assembly number is ok")
                self.assertEqual(gene_id_pipeline, ensembl_gene_data['id'], "Gene Id number is ok")
                self.assertEqual(start_pipeline, ensembl_gene_data['start'], "start is ok")
                self.assertEqual(stop_pipeline, ensembl_gene_data['end'], "stop is ok")
                self.assertEqual(chromosome_pipeline, ensembl_gene_data['seq_region_name'], "chr is ok")
                self.assertEqual(strand_pipeline, ensembl_gene_data['strand'], "strand is ok")

                self.assertEqual(biotype_pipeline, ensembl_gene_data['biotype'], "biotype is ok")
                self.assertEqual(symbol_pipeline, ensembl_gene_data['display_name'], "symbol/display_name is ok")
                self.assertEqual(source_pipeline, ensembl_gene_data['source'], "source is ok")
            else:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning("No test run....no ensembl data via ensembl webservice")
コード例 #20
0
ファイル: document.py プロジェクト: D-I-L/pydgin
 def get_studies(cls, study_ids=None, disease_code=None, sources=[], split_name=True):
     ''' Get study documents sorted by study_id.

     A disease code takes precedence over study ids; with neither, all
     studies are queried (the search size is 200). When split_name is set,
     each study_name is cut at its first colon.
     '''
     if disease_code is not None:
         inner_query = BoolQuery(must_arr=Query.term("diseases", disease_code))
     elif study_ids:
         inner_query = Query.ids(study_ids)
     else:
         inner_query = Query.match_all()
     studies_query = ElasticQuery(inner_query, sources=sources)

     search = Search(studies_query, idx=ElasticSettings.idx('STUDY', 'STUDY'), size=200)
     studies = search.search().docs
     for study in studies:
         if not split_name:
             continue
         full_name = getattr(study, 'study_name')
         if full_name is not None:
             setattr(study, 'study_name', full_name.split(':', 1)[0])
     return Document.sorted_alphanum(studies, "study_id")
コード例 #21
0
    def get_rdm_docs(cls, idx, idx_type, qbool=Query.match_all(), sources=[], size=1):
        ''' Get random docs from the indices.

        The query is wrapped in a function score query whose score is
        replaced by a randomly seeded random_score function. If the search
        raises an IndexError the whole lookup is retried with a new seed.
        '''
        seed = random.randint(0, 1000000)
        random_score = ScoreFunction.create_score_function('random_score', seed=seed)
        scored_query = FunctionScoreQuery(qbool, [random_score], boost_mode='replace')

        search = Search(search_query=ElasticQuery(scored_query, sources=sources),
                        size=size, idx=idx, idx_type=idx_type)
        try:
            return search.search().docs
        except IndexError:
            # try again; the recursive call draws a fresh seed
            return cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
コード例 #22
0
    def docs_by_query(cls, idx, idx_type='', query=Query.match_all()):
        ''' Delete all documents specified by a Query.

        The matching documents are scrolled over (fetching only their ids)
        and each page of hits is sent to Bulk.load as delete actions.
        '''
        delete_action = '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n'

        def delete_docs(resp_json):
            # one delete action line per hit on this page
            actions = [delete_action % (hit['_index'], hit['_type'], hit['_id'])
                       for hit in resp_json['hits']['hits']]
            Bulk.load(idx, idx_type, ''.join(actions))

        id_only_query = ElasticQuery(query, sources='_id')
        ScanAndScroll.scan_and_scroll(idx, idx_type=idx_type, call_fun=delete_docs, query=id_only_query)
コード例 #23
0
    def get_rdm_feature_ids(cls, idx, idx_type, qbool=Query.match_all(), sources=[], field=None, size=1):
        ''' Get random feature_ids from the indices.

        Returns a list with the value of ``field`` for each random document
        when a field is given, otherwise a list of the document ids.
        '''
        random_docs = cls.get_rdm_docs(idx, idx_type, qbool=qbool, sources=sources, size=size)
        if field is None:
            return [random_doc.doc_id() for random_doc in random_docs]
        return [getattr(random_doc, field) for random_doc in random_docs]
コード例 #24
0
    def test_bool_nested_filter(self):
        ''' Test a Bool filter that nests another Bool filter. '''
        # inner filter: marker id and seqid must both match
        inner_bool = BoolQuery()
        inner_bool.must(Query.match("id", "rs768019142").query_wrap()) \
                  .must(Query.term("seqid", 1))

        # outer filter: either the inner filter or anything on seqid 2
        outer_bool = BoolQuery()
        outer_bool.should(inner_bool) \
                  .should(Query.term("seqid", 2))

        nested_query = ElasticQuery.filtered_bool(Query.match_all(), outer_bool,
                                                  sources=["id", "seqid", "start"])
        search = Search(nested_query, idx=ElasticSettings.idx('DEFAULT'))
        hits_total = search.search().hits_total
        self.assertTrue(hits_total >= 1, "Nested bool filter query")
コード例 #25
0
    def test_bool_filtered_query4(self):
        ''' Test building and running a filtered boolean query that mixes
        should and must clauses.
        Note: query_wrap() is used to wrap the match in a query object. '''
        bool_filter = BoolQuery()
        bool_filter.should(RangeQuery("start", lte=20000)) \
                   .should(Query.term("seqid", 2)) \
                   .must(Query.match("id", "rs768019142").query_wrap()) \
                   .must(Query.term("seqid", 1))

        filtered = ElasticQuery.filtered_bool(Query.match_all(), bool_filter,
                                              sources=["id", "seqid", "start"])
        search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
        hits_total = search.search().hits_total
        self.assertTrue(hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
コード例 #26
0
ファイル: utils.py プロジェクト: tottlefields/pydgin
    def get_site_diseases(cls, tier=None):
        '''
        Returns a tuple of two lists of disease documents, sorted by code:
        the main diseases (tier 0) and the other diseases (tier 1).
        @type  tier: integer
        @keyword tier: Tier to filter diseases by (default: None).
        '''
        if tier is None:
            disease_query = Query.match_all()
        else:
            disease_query = FilteredQuery(Query.match_all(), Filter(Query.term("tier", tier)))

        search = Search(search_query=ElasticQuery(disease_query),
                        idx=ElasticSettings.idx('DISEASE', 'DISEASE'),
                        qsort=Sort('code:asc'))

        main, other = [], []
        for disease_doc in search.search().docs:
            doc_tier = getattr(disease_doc, "tier")
            if doc_tier == 0:
                main.append(disease_doc)
            elif doc_tier == 1:
                other.append(disease_doc)
        return (main, other)
コード例 #27
0
ファイル: views.py プロジェクト: D-I-L/pydgin
    def post(self, request, *args, **kwargs):
        ''' Return the study hits for a gene, a marker or a list of markers as JSON.

        Reads ``ens_id``, ``marker`` or ``markers[]`` (in that order of
        precedence) from the POST data and searches the study hits index.
        In each hit the list of gene ids is replaced by a dictionary of
        gene id -> symbol (the id itself when no gene document is found),
        and the pmid by a dictionary with the pmid, the last word of the
        first author's name and the journal (only the pmid when no
        publication document is found).

        Responds with HTTP 400 when none of the parameters is supplied.
        '''
        ens_id = self.request.POST.get('ens_id')
        marker = self.request.POST.get('marker')
        markers = self.request.POST.getlist('markers[]')

        if ens_id:
            sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
        elif marker:
            sfilter = Filter(Query.query_string(marker, fields=["marker"]).query_wrap())
        elif markers:
            sfilter = Filter(Query.query_string(' '.join(markers), fields=["marker"]).query_wrap())
        else:
            # previously sfilter was left unbound here and the view crashed
            return JsonResponse({'error': 'One of ens_id, marker or markers[] is required.'},
                                status=400)

        query = ElasticQuery.filtered(Query.match_all(), sfilter)
        elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
        study_hits = elastic.get_json_response()['hits']

        # collect every gene id and pmid so each is looked up in one request
        ens_ids = []
        pmids = []
        for hit in study_hits['hits']:
            source = hit['_source']
            if 'pmid' in source:
                pmids.append(source['pmid'])
            if 'genes' in source:
                ens_ids.extend(source['genes'])
        docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
        pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])

        for hit in study_hits['hits']:
            source = hit['_source']
            genes = {}
            if 'genes' in source:
                for gene_id in source['genes']:
                    try:
                        genes[gene_id] = getattr(docs[gene_id], 'symbol')
                    except KeyError:
                        # unknown gene: fall back to the id, keeping the
                        # symbols already resolved for this hit
                        genes[gene_id] = gene_id
            source['genes'] = genes
            if 'pmid' in source:
                pmid = source['pmid']
                try:
                    authors = getattr(pub_docs[pmid], 'authors')
                    journal = getattr(pub_docs[pmid], 'journal')
                    source['pmid'] = \
                        {'pmid': pmid,
                         'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                         'journal': journal}
                except KeyError:
                    source['pmid'] = {'pmid': pmid}

        return JsonResponse(study_hits)
コード例 #28
0
ファイル: utils.py プロジェクト: ollyburren/django-elastic
 def get_rdm_feature_id(cls,
                        idx,
                        idx_type,
                        qbool=Query.match_all(),
                        sources=[],
                        field=None):
     ''' Get a random feature id from the indices.

     Returns the value of ``field`` on the random document when a field is
     given, otherwise the document id.
     '''
     random_docs = cls.get_rdm_docs(idx, idx_type, qbool=qbool, sources=sources, size=1)
     random_doc = random_docs[0]
     if field is None:
         return random_doc.doc_id()
     return getattr(random_doc, field)
コード例 #29
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
        ''' Get an entrez:ensembl id dictionary.

        The gene ids are first passed through the gene history check and
        Gene._replace_oldids_with_newids. The matching documents are then
        scrolled over and each one's ``dbxrefs.entrez`` entry is mapped to
        its document id.
        '''
        history = Gene._check_gene_history(gene_sets, config)
        (newgene_ids, discontinued_ids) = history
        current_ids = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
        doc_id_by_entrez = {}

        def process_hits(resp_json):
            # called by scan_and_scroll with each page of the response
            page_docs = [Document(hit) for hit in resp_json['hits']['hits']]
            page_lookup = {}
            for gene_doc in page_docs:
                entrez_id = getattr(gene_doc, 'dbxrefs')['entrez']
                page_lookup[entrez_id] = gene_doc.doc_id()
            doc_id_by_entrez.update(page_lookup)

        entrez_filter = TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids)
        lookup_query = ElasticQuery.filtered(Query.match_all(), entrez_filter,
                                             sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
        ScanAndScroll.scan_and_scroll(section['index'], call_fun=process_hits, query=lookup_query)
        return doc_id_by_entrez
コード例 #30
0
    def _check_gene_history(cls, gene_sets, section):
        ''' Look up the given gene ids in the gene history index type.

        Returns a tuple of
        (dict of discontinued id -> replacement id,
         list of discontinued ids that have no replacement),
        with all ids converted to strings.
        '''
        history_filter = TermsFilter.get_terms_filter("discontinued_geneid", gene_sets)
        history_query = ElasticQuery.filtered(Query.match_all(), history_filter)
        search = Search(history_query, idx=section['index'],
                        idx_type=section['index_type_history'], size=1000000)

        replaced_by = {}
        without_replacement = []
        for history_doc in search.search().docs:
            current_id = getattr(history_doc, 'geneid')
            old_id = getattr(history_doc, 'discontinued_geneid')

            if current_id is not None:
                replaced_by[str(old_id)] = str(current_id)
            else:
                without_replacement.append(str(old_id))

        return (replaced_by, without_replacement)
コード例 #31
0
    def test_delete_docs_by_query(self):
        ''' Test deleting docs using a query.

        Deletes a single marker by id and checks the index count drops by
        one, then deletes the remaining documents and checks the index is
        empty.
        '''
        self.set_up()
        idx = IDX['MARKER']['indexName']
        elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
        hits_total1 = elastic.get_count()['count']
        self.assertGreater(hits_total1, 0, "contains documents")

        # delete single doc
        Delete.docs_by_query(idx, query=Query.term("id", "rs2476601"))
        Search.index_refresh(idx)
        hits_total2 = elastic.get_count()['count']
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(hits_total2, hits_total1-1, "contains documents")

        # delete remaining docs
        Delete.docs_by_query(idx, 'marker')
        Search.index_refresh(idx)
        self.assertEqual(elastic.get_count()['count'], 0, "contains no documents")
コード例 #32
0
    def test_doc_auth(self):
        """ Test private documents are not returned in the search.

        Searches for the marker of one STUDY_HITS document, then sets that
        document's group_name to "DIL" and checks that the same search
        returns one hit fewer.
        """
        idx = PydginTestSettings.IDX["STUDY_HITS"]["indexName"]
        docs = Search(ElasticQuery(Query.match_all(), sources=["chr_band", "marker"]), idx=idx, size=1).search().docs
        # assertEqual replaces the assertEquals alias, which is deprecated and
        # no longer exists in Python 3.12
        self.assertEqual(len(docs), 1, "STUDY_HITS document")
        marker_id = getattr(docs[0], "marker")

        # the same search URL is posted before and after the update
        search_url = reverse("search_page") + "?idx=ALL&query=" + marker_id
        resp = self.client.post(search_url)
        nhits1 = resp.context["hits_total"]
        self.assertGreater(nhits1, 0, "search hits > 0")
        # update document to be in DIL
        update_field = {"doc": {"group_name": "DIL"}}
        Update.update_doc(docs[0], update_field)
        Search.index_refresh(idx)

        resp = self.client.post(search_url)
        nhits2 = resp.context["hits_total"]
        self.assertEqual(nhits1 - 1, nhits2, "private document hidden")
コード例 #33
0
ファイル: tests_search_engine.py プロジェクト: D-I-L/pydgin
    def test_doc_auth(self):
        ''' Test private documents are not returned in the search.

        Searches for the id of one MARKER document, then sets that document's
        group_name to "DIL" and checks that the same search returns one hit fewer.
        '''
        idx = PydginTestSettings.IDX['MARKER']['indexName']
        docs = Search(ElasticQuery(Query.match_all(), sources=['id']), idx=idx, size=1).search().docs
        # assertEqual replaces the assertEquals alias, which is deprecated and
        # no longer exists in Python 3.12
        self.assertEqual(len(docs), 1, "MARKER document")
        marker_id = getattr(docs[0], 'id')

        url = reverse('search_page')
        resp = self.client.post(url+'?idx=ALL&query='+marker_id)
        nhits1 = resp.context['hits_total']
        self.assertGreater(nhits1, 0, 'search hits > 0')
        # update document to be in DIL
        update_field = {"doc": {"group_name": "DIL"}}
        Update.update_doc(docs[0], update_field)
        # refresh the same index the document was read from
        Search.index_refresh(idx)

        resp = self.client.post(url+'?idx=ALL&query='+marker_id)
        nhits2 = resp.context['hits_total']
        self.assertEqual(nhits1-1, nhits2, 'private document hidden')
コード例 #34
0
    def _check_gene_history(cls, gene_sets, config):
        ''' Split the gene ids found in the gene history index into replaced and retired ids.

        The index and type are read from config['GENE_HISTORY'] ('index' and
        'index_type'); only the geneid and discontinued_geneid fields are fetched.
        @type  gene_sets: list
        @param gene_sets: Gene ids to match against discontinued_geneid.
        @type  config: dict-like
        @param config: Configuration holding a 'GENE_HISTORY' section.
        @return: tuple of (dict mapping discontinued id to its replacement id,
                 list of discontinued ids whose geneid is None); all ids are strings.
        '''
        history = config['GENE_HISTORY']
        terms = TermsFilter.get_terms_filter("discontinued_geneid", gene_sets)
        equery = ElasticQuery.filtered(Query.match_all(), terms,
                                       sources=['geneid', 'discontinued_geneid'])
        # the number of hits requested equals the number of ids passed in
        history_search = Search(equery, idx=history['index'], idx_type=history['index_type'],
                                size=len(gene_sets))

        replaced_by = {}
        retired = []
        for history_doc in history_search.search().docs:
            current_id = getattr(history_doc, 'geneid')
            old_id = str(getattr(history_doc, 'discontinued_geneid'))
            if current_id is not None:
                replaced_by[old_id] = str(current_id)
            else:
                retired.append(old_id)
        return (replaced_by, retired)
コード例 #35
0
ファイル: test_regions.py プロジェクト: tottlefields/pydgin
    def test_pad_region(self):
        ''' Test that padding a region document fills in its disease loci and hit details. '''
        idx_name, type_name = ElasticSettings.idx(RegionTest.IDX_KEY, 'REGION').split('/')
        region = ElasticUtils.get_rdm_docs(idx_name, type_name, qbool=Query.match_all(),
                                           sources=[], size=1)[0]

        # the document is fetched with an empty sources list; each detail is expected to be falsy
        missing_checks = (
            ("build_info", "Region doesn't contain any positional details"),
            ("markers", "Region doesn't contain any marker details"),
            ("hits", "Region doesn't contain any HIT details"),
            ("genes", "Region doesn't contain any gene details"),
            ("studies", "Region doesn't contain any study details"),
            ("pmids", "Region doesn't contain any publication details"),
        )
        for field, message in missing_checks:
            self.assertFalse(getattr(region, field), message)

        padded = utils.Region.pad_region_doc(region)
        self.assertTrue(getattr(padded, "build_info"), "New region contains positional details")

        # markers and hits must be present and non-empty after padding
        filled_checks = (
            ("markers", "New region contains marker details", "New region contains at least 1 marker"),
            ("hits", "New region contains hit details", "New region contains at least 1 HIT"),
        )
        for field, present_message, count_message in filled_checks:
            values = getattr(padded, field)
            self.assertTrue(values, present_message)
            self.assertGreaterEqual(len(values), 1, count_message)
コード例 #36
0
ファイル: utils.py プロジェクト: ollyburren/django-elastic
 def get_rdm_feature_ids(cls,
                         idx,
                         idx_type,
                         qbool=Query.match_all(),
                         sources=None,
                         field=None,
                         size=1):
     ''' Get random feature_ids from the indices.

     Fetches documents with cls.get_rdm_docs and returns one id per document.
     @type  idx: string
     @param idx: Index passed on to get_rdm_docs.
     @type  idx_type: string
     @param idx_type: Index type passed on to get_rdm_docs.
     @param qbool: Query passed on to get_rdm_docs (default: match all).
     @type  sources: list
     @keyword sources: Source fields to retrieve; None (default) means an empty list.
     @type  field: string
     @keyword field: Document attribute to use as the id; when None the
                     document id (doc_id()) is used instead.
     @type  size: integer
     @keyword size: Number of documents requested (default: 1).
     @return: list of ids, one per document returned.
     '''
     # a fresh list per call replaces the shared mutable default argument
     if sources is None:
         sources = []
     docs = cls.get_rdm_docs(idx,
                             idx_type,
                             qbool=qbool,
                             sources=sources,
                             size=size)
     if field is None:
         return [doc.doc_id() for doc in docs]
     return [getattr(doc, field) for doc in docs]
コード例 #37
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def _update_gene(cls, genes, idx):
        ''' Bulk-update gene documents in the index from the genes data.

        @type  genes: dict
        @param genes: Partial document data keyed by the value matched against dbxrefs.entrez.
        @type  idx: string
        @param idx: Index that is searched and updated.

        The keys are processed in slices of 450. Each slice is looked up by
        dbxrefs.entrez and every hit gets an "update" action carrying genes[entrez].
        '''
        batch = 450
        entrez_ids = list(genes.keys())
        for offset in range(0, len(genes), batch):
            id_slice = entrez_ids[offset:offset+batch]
            entrez_filter = TermsFilter.get_terms_filter("dbxrefs.entrez", id_slice)
            equery = ElasticQuery.filtered(Query.match_all(), entrez_filter)

            bulk_lines = []
            for doc in Search(equery, idx=idx, size=batch).search().docs:
                doc_id = doc._meta['_id']
                idx_type = doc.type()
                entrez = getattr(doc, 'dbxrefs')['entrez']
                action = {"update": {"_id": doc_id, "_type": idx_type,
                                     "_index": idx, "_retry_on_conflict": 3}}
                # each update is an action line followed by a partial document line
                bulk_lines.append(json.dumps(action) + '\n')
                bulk_lines.append(json.dumps({'doc': genes[entrez]}) + '\n')

            if bulk_lines:
                # idx_type is the type of the last document seen in this slice
                Loader().bulk_load(idx, idx_type, ''.join(bulk_lines))
コード例 #38
0
ファイル: views.py プロジェクト: tottlefields/pydgin
def studies_details(request):
    """ Get studies for a given ensembl ID.

    Reads "ens_id" from the POST data and searches the REGION/STUDY_HITS
    index (up to 500 hits) for study hits whose "genes" field matches it.
    In every hit the "genes" list is replaced by a dict mapping each ensembl
    id to its gene symbol (or to the id itself when no gene document is
    found), and "pmid", when present, is replaced by a dict holding the pmid
    plus, when the publication is found, the last word of the first author's
    name and the journal.

    Returns a JsonResponse of the modified "hits" section of the search response.
    """
    ens_id = request.POST.get("ens_id")
    sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx("REGION", "STUDY_HITS"), size=500)
    study_hits = elastic.get_json_response()["hits"]

    # collect every gene id and pmid first so each is looked up in one request
    ens_ids = []
    pmids = []
    for hit in study_hits["hits"]:
        source = hit["_source"]
        if "pmid" in source:
            pmids.append(source["pmid"])
        ens_ids.extend(source["genes"])
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    pub_docs = _get_pub_docs_by_pmid(pmids, sources=["authors.name", "journal"])

    for hit in study_hits["hits"]:
        source = hit["_source"]
        genes = {}
        for gene_id in source["genes"]:
            try:
                genes[gene_id] = getattr(docs[gene_id], "symbol")
            except KeyError:
                # fall back to the id for this gene only; rebinding the whole
                # dict here would drop the symbols already resolved for this hit
                genes[gene_id] = gene_id
        source["genes"] = genes
        if "pmid" in source:
            pmid = source["pmid"]
            try:
                authors = getattr(pub_docs[pmid], "authors")
                journal = getattr(pub_docs[pmid], "journal")
                source["pmid"] = {
                    "pmid": pmid,
                    # last whitespace-separated word of the first author's name
                    "author": authors[0]["name"].rsplit(None, 1)[-1],
                    "journal": journal,
                }
            except KeyError:
                source["pmid"] = {"pmid": pmid}

    return JsonResponse(study_hits)
コード例 #39
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
    def get_new_pmids(cls, pmids, idx, disease_code=None):
        ''' Return the PMIDs from the list that have no document in the elastic index.

        @type  pmids: list
        @param pmids: PMIDs to look for.
        @type  idx: string
        @param idx: Index to search.
        @type  disease_code: string
        @keyword disease_code: When given, documents that are found and do not
                               yet list this code in tags['disease'] are
                               bulk-updated to include it.
        @return: list of the PMIDs, in their original order, that were not found.
        '''
        batch = 800
        indexed = set()
        time.sleep(5)

        for offset in range(0, len(pmids), batch):
            wanted = TermsFilter.get_terms_filter("pmid", pmids[offset:offset+batch])
            equery = ElasticQuery.filtered(Query.match_all(), wanted, sources=['pmid', 'tags'])
            found_docs = Search(equery, idx=idx, size=batch).search().docs

            bulk_lines = []
            for doc in found_docs:
                indexed.add(getattr(doc, 'pmid'))
                if disease_code is None:
                    continue

                tags = getattr(doc, 'tags')
                disease = tags['disease'] if 'disease' in tags else []
                if disease_code in disease:
                    continue

                # update disease attribute
                disease.append(disease_code)
                tags['disease'] = disease
                idx_name = doc._meta['_index']
                idx_type = doc.type()

                action = {"update": {"_id": doc._meta['_id'], "_type": idx_type,
                                     "_index": idx_name, "_retry_on_conflict": 3}}
                bulk_lines.append(json.dumps(action) + '\n')
                bulk_lines.append(json.dumps({'doc': {'tags': tags}}) + '\n')

            if bulk_lines:
                # index name and type are those of the last document updated in this slice
                Loader().bulk_load(idx_name, idx_type, ''.join(bulk_lines))

        return [pmid for pmid in pmids if pmid not in indexed]
コード例 #40
0
ファイル: views.py プロジェクト: D-I-L/pydgin
def genesets_details(request):
    ''' Return, as JSON, the pathway gene sets matching a given ensembl ID.

    Reads 'ens_id' from the POST data and searches the GENE/PATHWAY index
    (up to 500 hits) on the gene_sets field. In every hit the gene_sets list
    is replaced by a dict mapping each ensembl id to its gene symbol, or to
    the id itself when the symbol lookup raises KeyError.
    '''
    ens_id = request.POST.get('ens_id')
    wrapped = Query.query_string(ens_id, fields=["gene_sets"]).query_wrap()
    member_filter = Filter(wrapped)
    equery = ElasticQuery.filtered(Query.match_all(), member_filter)
    pathway_search = Search(equery, idx=ElasticSettings.idx('GENE', 'PATHWAY'), size=500)
    genesets_hits = pathway_search.get_json_response()['hits']

    # gather the members of every gene set so they are looked up in one call
    members = [gene for hit in genesets_hits['hits'] for gene in hit['_source']['gene_sets']]
    docs = utils.get_gene_docs_by_ensembl_id(members, ['symbol'])

    def _label(gene):
        # symbol of the gene document, or the id itself on KeyError
        try:
            return getattr(docs[gene], 'symbol')
        except KeyError:
            return gene

    for hit in genesets_hits['hits']:
        source = hit['_source']
        source['gene_sets'] = {gene: _label(gene) for gene in source['gene_sets']}
    return JsonResponse(genesets_hits)
コード例 #41
0
ファイル: views.py プロジェクト: tottlefields/pydgin
def genesets_details(request):
    """ Return, as JSON, the pathway gene sets matching a given ensembl ID.

    Reads "ens_id" from the POST data and searches the GENE/PATHWAY index
    (up to 500 hits) on the gene_sets field. In every hit the gene_sets list
    is replaced by a dict mapping each ensembl id to its gene symbol, or to
    the id itself when the symbol lookup raises KeyError.
    """
    ens_id = request.POST.get("ens_id")
    wrapped = Query.query_string(ens_id, fields=["gene_sets"]).query_wrap()
    member_filter = Filter(wrapped)
    equery = ElasticQuery.filtered(Query.match_all(), member_filter)
    pathway_search = Search(equery, idx=ElasticSettings.idx("GENE", "PATHWAY"), size=500)
    genesets_hits = pathway_search.get_json_response()["hits"]

    # gather the members of every gene set so they are looked up in one call
    members = []
    for hit in genesets_hits["hits"]:
        members.extend(hit["_source"]["gene_sets"])
    docs = _get_gene_docs_by_ensembl_id(members, ["symbol"])

    def _label(gene):
        # symbol of the gene document, or the id itself on KeyError
        try:
            return getattr(docs[gene], "symbol")
        except KeyError:
            return gene

    for hit in genesets_hits["hits"]:
        source = hit["_source"]
        source["gene_sets"] = {gene: _label(gene) for gene in source["gene_sets"]}
    return JsonResponse(genesets_hits)
コード例 #42
0
    def _convert_entrezid2ensembl(cls, gene_sets, section, log_output_file_handler=None, log_conversion=True):
        ''' Convert a set of entrez ids to ensembl ids via dbxrefs.entrez in the gene index.

        @type  gene_sets: list
        @param gene_sets: Entrez ids to convert.
        @type  section: dict-like
        @param section: Provides the 'index' to search; also passed to _check_gene_history.
        @keyword log_output_file_handler: Passed to the conversion logger; nothing is logged when None.
        @type  log_conversion: boolean
        @keyword log_conversion: Set to False to skip logging the conversion.
        @return: list of the document ids of the matching gene documents.
        '''
        # first check in gene_history
        newgene_ids, discontinued_ids = cls._check_gene_history(gene_sets, section)

        # replace all old ids with new ids
        current_ids = cls._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)

        entrez_filter = TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids)
        equery = ElasticQuery.filtered(Query.match_all(), entrez_filter)
        hits = Search(equery, idx=section['index'], size=1000000).search().docs
        ensembl_ids = [hit._meta['_id'] for hit in hits]

        if log_conversion and log_output_file_handler is not None:
            cls._log_entrezid2ensembl_coversion(current_ids, ensembl_ids, log_output_file_handler)

        return ensembl_ids
コード例 #43
0
ファイル: utils.py プロジェクト: ollyburren/django-elastic
    def get_rdm_docs(cls,
                     idx,
                     idx_type,
                     qbool=Query.match_all(),
                     sources=None,
                     size=1):
        ''' Get random docs from the indices.

        Wraps qbool in a function score query that replaces the score with a
        randomly seeded 'random_score'.
        @type  idx: string
        @param idx: Index to search.
        @type  idx_type: string
        @param idx_type: Index type to search.
        @param qbool: Query selecting the candidate documents (default: match all).
        @type  sources: list
        @keyword sources: Source fields to retrieve; None (default) means an empty list.
        @type  size: integer
        @keyword size: Number of documents requested (default: 1).
        @return: the docs of the search result.
        '''
        # a fresh list per call replaces the shared mutable default argument
        if sources is None:
            sources = []

        score_function1 = ScoreFunction.create_score_function(
            'random_score', seed=random.randint(0, 1000000))

        search_query = ElasticQuery(FunctionScoreQuery(qbool,
                                                       [score_function1],
                                                       boost_mode='replace'),
                                    sources=sources)
        elastic = Search(search_query=search_query,
                         size=size,
                         idx=idx,
                         idx_type=idx_type)
        try:
            return elastic.search().docs
        except IndexError:
            # retry with a new random seed
            # NOTE(review): the retry is unbounded; if IndexError is raised on
            # every attempt this recurses until RecursionError - confirm intended
            return cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
コード例 #44
0
def chicpeaSearch(request, url):
    ''' Search the CHiCP target index for interactions and return them as JSON.

    GET parameters read: "targetIdx", "searchTerm", "snp_track" and "region".
    The search is treated as one of three types:
      - 'region': a "region" parameter was given or the search term looks
        like <chr>:<start>-<end>;
      - 'snp': the search term starts with "rs" followed by digits;
      - 'gene': anything else; the term is matched against gene names.

    On success the JsonResponse holds the keys hic, frags, meta, snps,
    snp_meta, genes, region, blueprint and extra. Otherwise a JsonResponse
    with an 'error' message is returned (and, for an ambiguous gene name,
    the candidate 'results' and their 'cols').
    '''
    queryDict = request.GET
    user = request.user
    targetIdx = queryDict.get("targetIdx")
    blueprint = {}
    hic = []
    addList = []
    searchType = 'gene'
    # normalise the term: upper case, no thousands separators, ".." as range separator
    searchTerm = queryDict.get("searchTerm").upper()
    searchTerm = searchTerm.replace(",", "")
    searchTerm = searchTerm.replace("..", "-")
    snpTrack = queryDict.get("snp_track")

    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(
                                            user=user, idx_keys=None, idx_type_keys=None)

    if snpTrack:
        mo = re.match(r"(.*)-(.*)", snpTrack)
        if mo is None:
            # not of the form <group>-<track>: drop the track, as for an
            # unauthorised one, instead of failing on mo.group()
            snpTrack = None
        else:
            group = mo.group(1)
            # tracks in the 'ud' group are not checked against the authorised index types
            if group != 'ud' and 'CP_STATS_'+group.upper()+'.'+snpTrack.upper() not in idx_type_keys_auth:
                snpTrack = None

    if targetIdx not in utils.tissues:
        # populate utils.tissues for every target index the user may access
        for target in getattr(chicp_settings, 'CP_TARGET'):
            if 'CP_TARGET_'+target not in idx_keys_auth:
                if targetIdx == target:
                    retJSON = {'error': 'Sorry, you do not have permission to view this dataset.'}
                    return JsonResponse(retJSON)
                continue
            elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
            tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                              ['mappings']['gene_target']['_meta']['tissue_type'].keys())
            utils.tissues['CP_TARGET_'+target] = tissueList

    if queryDict.get("region") or re.match(r"(.*):(\d+)-(\d+)", searchTerm):
        searchType = 'region'
        region = searchTerm
        if queryDict.get("region"):
            region = queryDict.get("region")
        else:
            # the search term itself was the region, so there is no term left to search on
            searchTerm = ""
        mo = re.match(r"(.*):(\d+)-(\d+)", region)
        (chrom, segmin, segmax) = mo.group(1, 2, 3)
        chrom = chrom.replace('chr', "")
        chrom = chrom.replace('CHR', "")
    if re.search("^rs[0-9]+", searchTerm.lower()):
        searchTerm = searchTerm.lower()
        addList.append(_find_snp_position(snpTrack, searchTerm))
        if addList[0].get("error"):
            return JsonResponse({'error': addList[0]['error']})
        position = addList[0]['end']
        if searchType != 'region':
            searchType = 'snp'

    logger.warning("### "+searchType+" - "+searchTerm+' ###')

    if searchType == 'region':
        query_bool = BoolQuery()
        filter_bool = BoolQuery()
        # include the search term only when it is neither a marker nor a region string
        if searchTerm and len(addList) == 0 and re.match(r"(.*):(\d+)-(\d+)",
                                                         queryDict.get("searchTerm").replace(",", "")) is None:
            query_bool.must([Query.query_string(searchTerm, fields=["name", "ensg"]),
                             Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
        else:
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])

        query_bool = _add_tissue_filter(query_bool, targetIdx)

        if len(addList) > 0:
            # marker given: the bait or the other end must span the marker position
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])
        else:
            # otherwise the bait or the other end must lie within the region
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", gte=segmin, lte=segmax),
                                                    RangeQuery("baitEnd", gte=segmin, lte=segmax)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", gte=segmin, lte=segmax),
                                                    RangeQuery("oeEnd", gte=segmin, lte=segmax)])])

        query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                           sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
        (hic, v1, v2) = _build_hic_query(query, targetIdx, segmin, segmax)  # @UnusedVariable

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': queryDict.get("searchTerm")+' does not overlap any bait/target regions in this dataset.'}
            return JsonResponse(retJSON)

    elif searchType == 'snp':
        if len(addList) > 0:
            chrom = addList[0]['chr']

            query_bool = BoolQuery()
            query_bool.must([Query.term("baitChr", chrom),
                             Query.term("oeChr", chrom),
                             RangeQuery("dist", gte=-2e6, lte=2e6)])
            query_bool = _add_tissue_filter(query_bool, targetIdx)

            filter_bool = BoolQuery()
            filter_bool.should([BoolQuery(must_arr=[RangeQuery("baitStart", lte=position),
                                                    RangeQuery("baitEnd", gte=position)]),
                                BoolQuery(must_arr=[RangeQuery("oeStart", lte=position),
                                                    RangeQuery("oeEnd", gte=position)])])

            query = ElasticQuery.filtered_bool(query_bool, filter_bool,
                                               sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])
            hic, segmin, segmax = _build_hic_query(query, targetIdx)

            if "error" in hic:
                return JsonResponse(hic)
            if len(hic) == 0:
                retJSON = {'error': 'Marker '+searchTerm+' does not overlap any bait/target regions in this dataset.'}
                return JsonResponse(retJSON)
    else:
        # geneQuery = ElasticQuery.query_string(searchTerm, fields=["gene_name"])
        geneQuery = ElasticQuery.filtered(Query.match_all(), Filter(Query.match("gene_name", searchTerm).query_wrap()))
        # size=0: only the total number of matching genes is needed at this point
        resultObj = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/',
                           search_query=geneQuery, size=0, qsort=Sort('seqid:asc,start')).search()
        if resultObj.hits_total > 1:
            geneResults = []
            resultObj2 = Search(idx=getattr(chicp_settings, 'CP_GENE_IDX') + '/genes/', search_query=geneQuery,
                                size=(resultObj.hits_total+1), qsort=Sort('seqid:asc,start')).search()

            docs = resultObj2.docs
            # [1:-1] drops the first and last character of the stored gene_id
            gene_ids = [getattr(doc, 'attr')['gene_id'][1:-1] for doc in docs]

            # keep only the genes that occur in the target index
            query = ElasticQuery.filtered(Query.match_all(), TermsFilter.get_terms_filter('ensg', gene_ids))
            agg = Agg('ensg_agg', "terms", {"field": "ensg", "size": 0})
            res = Search(idx=ElasticSettings.idx('CP_TARGET_'+targetIdx), search_query=query, aggs=Aggs(agg),
                         size=0).search()

            ensg_count = res.aggs['ensg_agg'].get_buckets()
            gene_ids = [g['key'] for g in ensg_count]

            for d in resultObj2.docs:
                if getattr(d, "attr")["gene_id"].replace('\"', '') in gene_ids:
                    geneResults.append({
                        'gene_name': getattr(d, "attr")["gene_name"].replace('\"', ''),
                        'gene_id': getattr(d, "attr")["gene_id"].replace('\"', ''),
                        'location': "chr" + getattr(d, "seqid") + ":" +
                        locale.format_string("%d", getattr(d, "start"), grouping=True) + ".." +
                        locale.format_string("%d", getattr(d, "end"), grouping=True),
                    })

            if len(geneResults) == 0:
                retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
                return JsonResponse(retJSON)
            elif len(geneResults) > 1:
                # ambiguous gene name: return the candidates for the user to choose from
                retJSON = {
                    'error': 'Gene name <strong>'+searchTerm+'</strong> returns too many hits, please select your prefered result from the list below.',
                    'results': geneResults,
                    'cols': ['HGNC Symbol', 'Ensembl Gene ID', 'Location']
                }
                return JsonResponse(retJSON)

        query_bool = BoolQuery()
        query_bool.must([RangeQuery("dist", gte=-2e6, lte=2e6)])
        query_bool = _add_tissue_filter(query_bool, targetIdx)
        query = ElasticQuery.filtered_bool(Query.query_string(searchTerm, fields=["name", "ensg", "oeName"]),
                                           query_bool, sources=utils.hicFields + utils.tissues['CP_TARGET_'+targetIdx])

        (hic, segmin, segmax) = _build_hic_query(query, targetIdx)

        if "error" in hic:
            return JsonResponse(hic)
        if len(hic) == 0:
            retJSON = {'error': 'Gene name '+searchTerm+' not found in this dataset.'}
            return JsonResponse(retJSON)
        chrom = hic[0]['baitChr']

    # chrom is only bound on some of the paths above
    try:
        chrom
    except NameError:
        retJSON = {'error': 'No chromosome defined for search'}
        return JsonResponse(retJSON)

    # get genes based on this segment
    genes = _build_gene_query(chrom, segmin, segmax)
    (snps, snpMeta) = _build_snp_query(snpTrack, chrom, segmin, segmax)
    frags = _build_frags_query(getattr(chicp_settings, 'DEFAULT_FRAG'), chrom, segmin, segmax)

    addList = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], addList)

    retJSON = {"hic": hic,
               "frags": frags,
               "meta": {"ostart": int(segmin),
                        "oend": int(segmax),
                        "rstart": 1,
                        "rend": int(segmax) - int(segmin),
                        "rchr": str(chrom),
                        "tissues": utils.tissues['CP_TARGET_'+targetIdx]},
               "snps": snps,
               "snp_meta": snpMeta,
               "genes": genes,
               "region": str(chrom) + ":" + str(segmin) + "-" + str(segmax),
               "blueprint": blueprint,
               "extra": addList
               }

    response = JsonResponse(retJSON)
    return response
コード例 #45
0
def _add_diseases():
    ''' Return the hits (at most 100) of a match-all search on the 'disease' index. '''
    disease_search = Search(search_query=ElasticQuery(Query.match_all()), size=100, idx='disease')
    response = disease_search.get_json_response()
    return response['hits']['hits']
コード例 #46
0
    def test_elastic_group_name(self):
        '''
        Testing the workflow defined in: https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
        Testing various elastic queries

        idx doc:
         "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
         "seqid": "chr4", "source": "immunobase", "type": "region",
         "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
        idx_query:
        Private(in given group) OR Public
        -d '{"query":{"filtered":{"filter":{"bool": {
                                            "should": [
                                                        {"terms": {"group_name":["dil"]}},
                                                        { "missing": { "field": "group_name"   }}
                                                      ]
                                                    }}}}}'
        Private(in given group):
        -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
        Public:
        -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
-                         'query': {'term': {'match_all': '{}'}}}}}
        '''
        # get the groups for the given user
        response = self.client.post('/accounts/login/', {
            'username': '******',
            'password': '******'
        })
        self.assertTrue(response.status_code, "200")

        logged_in_user = User.objects.get(
            id=self.client.session['_auth_user_id'])
        if logged_in_user and logged_in_user.is_authenticated():
            user_groups = get_user_groups(logged_in_user)
            self.assertTrue('READ' in user_groups,
                            "user present in READ group")
            # make sure the user is not yet in DIL group
            self.assertFalse('DIL' in user_groups,
                             "user not present in DIL group")

        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) == 0, "No group present")

        # Match all query, as there is no group we do a match all
        query = ElasticQuery(Query.match_all())
        expected_query_string = {"query": {"match_all": {}}}
        self.assertJSONEqual(json.dumps(query.query),
                             json.dumps(expected_query_string),
                             "Query string matched")

        Search.index_refresh(self.index_name)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 12,
            "Elastic string query retrieved all public regions")

        # Filtered query for group names, add the user to DIL group and get the query string
        self.dil_group = Group.objects.create(name='DIL')
        logged_in_user.groups.add(self.dil_group)
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names: group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) > 0, "More than 1 group present")
        self.assertTrue("dil" in group_names, "DIL group present")

        # retrieves all docs with missing field group_name - 11 docs
        terms_filter = TermsFilter.get_missing_terms_filter(
            "field", "attr.group_name")
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 11,
            "Elastic string query retrieved all public regions")

        # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
        query_bool = BoolQuery()
        query_bool.should(Query.missing_terms("field", "group_name")) \
                  .should(Query.terms("group_name", group_names).query_wrap())

        query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 12,
            "Elastic string query retrieved both public + private regions")

        terms_filter = TermsFilter.get_terms_filter("attr.group_name",
                                                    group_names)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 1,
            "Elastic string query retrieved one private regions")
        self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
        self.assertEqual(docs[0].attr['region_id'], "803",
                         "type matched region")
        self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]",
                         "type matched region")