Esempio n. 1
0
    def filter_queryset(self, request, queryset, view):
        ''' Request from elastic just the page of documents that is required.

        The page size and offset come from the view's paginator. GET parameters
        whose names appear in the view's C{filter_fields} attribute are passed
        to C{self._build_filters} to build the search filters. The total number
        of hits reported by elastic is stored on the view as C{es_count}.

        @param request: request whose GET parameters are inspected.
        @param queryset: not used by this implementation.
        @param view: view providing C{paginator}, C{idx} and optionally C{filter_fields}.
        @return: list of L{ElasticObject}, one per hit, with C{uuid} set to the hit C{_id}.
        '''
        page_size = view.paginator.get_limit(request)
        offset = view.paginator.get_offset(request)

        allowed = getattr(view, 'filter_fields', [])
        filters = {name: value for name, value in request.GET.items() if name in allowed}
        search_filters = self._build_filters(filters=filters)

        if search_filters is None:
            es_query = ElasticQuery(Query.match_all())
        else:
            es_query = ElasticQuery.filtered(Query.match_all(), search_filters)

        search = Search(search_query=es_query, idx=view.idx, size=page_size, search_from=offset)
        response = search.get_json_response()

        objects = []
        for hit in response['hits']['hits']:
            obj = ElasticObject(initial=hit['_source'])
            obj.uuid = hit['_id']
            objects.append(obj)

        view.es_count = response['hits']['total']
        return objects
    def get_interaction_doc(self, interaction_source='intact', parent_id=None):
        ''' Get one interaction document together with its parent document.

        A single document is requested through L{DataIntegrityUtils.get_rdm_docs},
        restricted to the given interaction source and, when C{parent_id} is
        supplied, to that parent. The parent is then fetched by the id returned
        from the document's C{parent()} method.

        @type  interaction_source: string
        @keyword interaction_source: value matched against the C{interaction_source} field.
        @type  parent_id: string
        @keyword parent_id: optional parent id used to restrict the search.
        @return: tuple of (interaction document, parent document).
        '''
        idx_key = 'GENE'
        idx_type_key = 'INTERACTIONS'
        parent_idx_key = 'GENE'

        idx = ElasticSettings.idx(idx_key, idx_type_key)
        (idx, idx_type) = idx.split('/')

        if parent_id:
            qbool_intact = BoolQuery().must([Query.term("interaction_source", interaction_source),
                                            Query.term("_parent", parent_id)])
        else:
            qbool_intact = BoolQuery().should([Query.term("interaction_source", interaction_source)])

        # Get random doc or specific if id is passed in query
        docs_by_geneid = DataIntegrityUtils.get_rdm_docs(idx, idx_type, qbool=qbool_intact, sources=[], size=1)
        doc = docs_by_geneid[0]

        # Get parent doc
        parent_id = doc.parent()
        parent_docs = DataIntegrityUtils.fetch_from_elastic(idx_key, parent_idx_key, [parent_id])

        if parent_docs:
            self.assertTrue(len(parent_docs) >= 1, "Found 1 parent")
            return doc, parent_docs[0]

        # Retry with the caller's interaction source; it used to be reset to
        # "intact" here regardless of what was requested.
        # NOTE(review): the retry reuses the parent id that just failed to
        # resolve, so it may recurse without bound if that parent never
        # appears - confirm whether a retry limit is wanted.
        return self.get_interaction_doc(interaction_source, parent_id)
Esempio n. 3
0
    def test_region_attributes(self):
        ''' Test that ids held on a padded region doc are all found in their own index.

        For each of the genes, studies and pmids attributes of the region, an
        ids query is run against the matching index and the number of hits is
        compared with the number of ids. Empty attributes are not checked.
        '''
        region_idx = ElasticSettings.idx(RegionDataTest.IDX_KEY, 'REGION')
        (idx, idx_type) = region_idx.split('/')
        docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
        newRegion = utils.Region.pad_region_doc(docs[0])

        # (region attribute, index key, assertion message)
        checks = (
            ("genes", 'GENE', "All genes on region found in GENE index"),
            ("studies", 'STUDY', "All study ids for region found in STUDY index"),
            ("pmids", 'PUBLICATION', "All PMIDs for region found in PUBLICATION index"),
        )
        for attr_name, idx_key, message in checks:
            ids = getattr(newRegion, attr_name)
            if len(ids) == 0:
                continue
            query = ElasticQuery(Query.ids(ids))
            resultObject = Search(query, idx=ElasticSettings.idx(idx_key, idx_key),
                                  size=len(ids)).search()
            self.assertEqual(len(ids), resultObject.hits_total, message)
Esempio n. 4
0
def show_es_gene_section(gene_symbol=None,
                         seqid=None,
                         start_pos=None,
                         end_pos=None):
    ''' Template inclusion tag to render a gene section given a
    chado gene feature.

    The sequence id is given a "chr" prefix unless it is already a string
    starting with "chr". The query used is, in order of precedence:
      - a match on C{gene_symbol} when a gene symbol is given;
      - features on C{seqid} whose featureloc start/end enclose C{start_pos}
        when no C{end_pos} is given;
      - features on C{seqid} whose featureloc start/end lie within
        C{start_pos}..C{end_pos} otherwise.

    @return: dict with the matching documents under C{es_genes}.
    '''
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    else:
        seqid_match = Query.match("seqid", seqid)
        if end_pos is None:
            # single position: features that enclose start_pos
            start_range = RangeQuery("featureloc.start", lte=start_pos)
            end_range = RangeQuery("featureloc.end", gte=start_pos)
        else:
            # start and end given: features lying within the range
            start_range = RangeQuery("featureloc.start", gte=start_pos)
            end_range = RangeQuery("featureloc.end", lte=end_pos)
        query = ElasticQuery.bool(BoolQuery(must_arr=[seqid_match, start_range, end_range]))

    docs = Search(query, idx=ElasticSettings.idx(name='GENE')).search().docs
    return {'es_genes': docs}
Esempio n. 5
0
def association_stats(request, sources=None):
    ''' Get association statistics for a chromosome, optionally limited to a range.

    Reads C{chr}, C{idx_type}, C{start} and C{end} from the request GET
    parameters. Matching documents are collected with scan and scroll from
    the IC_STATS index type named by C{idx_type}; the position range is only
    applied when both C{start} and C{end} are given.

    @param request: request carrying the GET parameters.
    @keyword sources: passed through to L{ElasticQuery} as C{sources}.
    @return: L{JsonResponse} with the collected rows under C{variants}.
    '''
    params = request.GET
    seqid = params.get('chr').replace('chr', '')
    idx_type = params.get('idx_type').upper()
    start = params.get('start')
    end = params.get('end')
    data = []

    def get_stats(resp_json):
        ''' Scroll callback: append one row to data for every hit in the response. '''
        for hit in resp_json['hits']['hits']:
            d = Document(hit)
            row = {
                "CHROM": d.seqid,
                "POS": d.position,
                "PVALUE": d.p_value,
                "DBSNP_ID": d.marker
            }
            data.append(row)

    criteria = Query.query_string(seqid, fields=["seqid"])
    if start is not None and end is not None:
        criteria = BoolQuery(must_arr=[criteria, RangeQuery("position", gte=start, lte=end)])
    query = ElasticQuery(criteria, sources=sources)

    ScanAndScroll.scan_and_scroll(ElasticSettings.idx('IC_STATS', idx_type), call_fun=get_stats, query=query)

    return JsonResponse({"variants": data})
Esempio n. 6
0
 def get_hits_by_study_id(cls, study_id, sources=[]):
     ''' Get visible/authenticated hits for a study.

     Searches the REGION/STUDY_HITS index for documents whose C{dil_study_id}
     matches, filtered with C{Query.missing_terms("field", "group_name")}.
     On each returned document:
       - C{genes} is replaced by a dict mapping each Ensembl id to its gene
         symbol, or to the id itself when no gene document is found;
       - C{loc} and C{encoded_loc} are set from the C{build_info} entry whose
         build equals C{settings.DEFAULT_BUILD}.

     @param study_id: study id to match.
     @keyword sources: passed through to L{ElasticQuery} as C{sources}.
     @return: the list of hit documents.
     '''
     hits_query = ElasticQuery(BoolQuery(must_arr=Query.term('dil_study_id', study_id),
                                         b_filter=Filter(Query.missing_terms("field", "group_name"))),
                               sources=sources)
     docs = Search(hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=1000).search().docs
     ens_ids = [gene for doc in docs if getattr(doc, 'genes') for gene in getattr(doc, 'genes')]
     gene_docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
     for doc in docs:
         doc_genes = getattr(doc, 'genes')
         if doc_genes:
             genes = {}
             for ens_id in doc_genes:
                 try:
                     genes[ens_id] = getattr(gene_docs[ens_id], 'symbol')
                 except KeyError:
                     # No gene document for this id: label it with the id itself.
                     # Add to the dict rather than rebinding it, so symbols
                     # already resolved for this hit are kept.
                     genes[ens_id] = ens_id
             setattr(doc, 'genes', genes)
         for bi in getattr(doc, 'build_info'):
             if bi['build'] == settings.DEFAULT_BUILD:
                 # locale.format() was removed in Python 3.12; format_string()
                 # produces the same grouped output.
                 start = locale.format_string("%d", bi['start'], grouping=True)
                 end = locale.format_string("%d", bi['end'], grouping=True)
                 setattr(doc, "loc", "chr" + bi['seqid'] + ":" + start + "-" + end)
                 setattr(doc, "encoded_loc", "chr" + bi['seqid'] + "%3A" +
                         str(bi['start']) + ".." + str(bi['end']))
     return docs
Esempio n. 7
0
def show_es_gene_section(gene_symbol=None,
                         seqid=None,
                         start_pos=None,
                         end_pos=None):
    ''' Template inclusion tag to render a gene section given a
    chado gene feature.

    Any "chr" in the sequence id is removed before querying. The query used
    is, in order of precedence:
      - a match on C{symbol} when a gene symbol is given;
      - genes on the chromosome whose start/stop enclose C{start_pos} when
        no C{end_pos} is given;
      - genes on the chromosome whose start/stop lie within
        C{start_pos}..C{end_pos} otherwise.

    @return: dict with the matching documents under C{es_genes}.
    '''
    seqid = str(seqid).replace('chr', '')

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        chromosome_match = Query.match("chromosome", seqid)
        if end_pos is None:
            # single position: genes that enclose start_pos
            start_range = RangeQuery("start", lte=start_pos)
            stop_range = RangeQuery("stop", gte=start_pos)
        else:
            # start and end given: genes lying within the range
            start_range = RangeQuery("start", gte=start_pos)
            stop_range = RangeQuery("stop", lte=end_pos)
        query = ElasticQuery.bool(BoolQuery(must_arr=[chromosome_match, start_range, stop_range]))

    docs = Search(query, idx=ElasticSettings.idx(name='GENE')).search().docs
    return {'es_genes': docs}
Esempio n. 8
0
    def fetch_overlapping_features(cls, build, seqid, start, end, idx=None, idx_type=None, disease_id=None):
        ''' Fetch the features overlapping a given stretch of region.
            The build info is stored as a nested document, so a nested query is built.
            When a disease id is given the same region constraints apply and the
            results are additionally restricted to that disease.
        @type  build: string
        @param build: build info eg: 'GRCh38'
        @type  seqid: string
        @param seqid: chromosome number
        @type  start:  string
        @param start: region start
        @type  end:  string
        @param end: region end
        @type  idx: string
        @param idx: name of the index
        @type  idx_type: string
        @param idx_type: name of the idx type, each criteria is an index type
        @type  disease_id:  string
        @param disease_id: disease code
        @return: list of matching documents
        '''
        sources = ['build_info.*', 'disease_locus', 'disease', 'chr_band', 'species']

        # feature spans the whole region
        bool_range = BoolQuery()
        bool_range.must(RangeQuery("build_info.start", lte=start)) \
                  .must(RangeQuery("build_info.end", gte=end))

        # ... or the feature start or end lies inside the region
        or_filter = OrFilter(RangeQuery("build_info.start", gte=start, lte=end))
        or_filter.extend(RangeQuery("build_info.end", gte=start, lte=end)) \
                 .extend(bool_range)

        # Build the region constraints before nesting them. Previously the
        # disease branch nested an empty bool query, so build, seqid and the
        # overlap filter were ignored whenever a disease id was passed.
        region_query = BoolQuery()
        region_query.must(Query.term("build_info.build", build)) \
                    .must(Query.term("build_info.seqid", seqid)) \
                    .filter(or_filter)
        qnested_buildinfo = Query.nested('build_info', region_query)

        if disease_id:
            disease_query = BoolQuery()
            disease_query.must(Query.term("disease", disease_id.lower())).must(qnested_buildinfo)
            qnested = ElasticQuery(disease_query, sources=sources)
        else:
            qnested = ElasticQuery(qnested_buildinfo, sources=sources)

        elastic = Search(qnested, idx=idx, idx_type=idx_type)
        return elastic.search().docs
 def test_bool_filtered_query(self):
     ''' Test a filtered bool query built from constructor arrays plus chained clauses. '''
     excluded = [Query.term("seqid", 2)]
     optional = [RangeQuery("start", gte=10050)]
     bool_filter = BoolQuery(must_not_arr=excluded, should_arr=optional)
     (bool_filter
         .must([Query.term("id", "rs768019142")])
         .should(RangeQuery("start", gte=10054)))
     es_query = ElasticQuery.filtered_bool(Query.match_all(), bool_filter, sources=["id", "seqid"])
     result = Search(es_query, idx=ElasticSettings.idx('DEFAULT')).search()
     self.assertTrue(result.hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
 def test_and_filtered_query(self):
     ''' Test building and running a query filtered with an AndFilter. '''
     bool_clause = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
     and_filter = AndFilter(bool_clause)
     (and_filter
         .extend(RangeQuery("start", gte=1))
         .extend(Query.term("seqid", 1)))
     es_query = ElasticQuery.filtered(Query.term("seqid", 1), and_filter)
     result = Search(es_query, idx=ElasticSettings.idx('DEFAULT')).search()
     self.assertTrue(result.hits_total >= 1, "Elastic filtered query retrieved marker(s)")
Esempio n. 11
0
 def test_url_rotate(self):
     ''' Test the url rotates from http://xxx:9200 to correct url.

     C{ElasticUrl.URL_INDEX} is set back to 0 when the test finishes, whether
     it passes or fails, so a failure here does not leave the changed index
     in place afterwards.
     '''
     try:
         query = ElasticQuery.filtered(Query.term("seqid", 1),
                                       Filter(Query.term("id", "rs768019142")))
         elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
         self.assertTrue(elastic.search().hits_total == 1,
                         "Elastic filtered query retrieved marker")
         Search.index_exists('test', 'test2')
     finally:
         ElasticUrl.URL_INDEX = 0  # reset
    def test_term_query(self):
        ''' Test building and running term queries, without and with a boost. '''
        by_id = ElasticQuery(Query.term("id", "rs2476601"))
        id_docs = Search(by_id, idx=ElasticSettings.idx('DEFAULT')).search().docs
        self.assertTrue(len(id_docs) == 1, "Elastic string query retrieved marker (rs2476601)")

        by_seqid = ElasticQuery(Query.term("seqid", "1", boost=3.0))
        seqid_docs = Search(by_seqid, idx=ElasticSettings.idx('DEFAULT')).search().docs
        self.assertTrue(len(seqid_docs) > 1, "Elastic string query retrieved markers  on chr1")
 def test_bool_filtered_query2(self):
     ''' Test a query string filtered by a bool query built from chained clauses. '''
     bool_filter = BoolQuery()
     (bool_filter
         .should(RangeQuery("start", lte=20000))
         .should(Query.term("seqid", 2))
         .must(Query.term("seqid", 1)))
     marker_query = Query.query_string("rs768019142", fields=["id", "seqid"])
     es_query = ElasticQuery.filtered_bool(marker_query, bool_filter, sources=["id", "seqid", "start"])
     result = Search(es_query, idx=ElasticSettings.idx('DEFAULT')).search()
     self.assertTrue(result.hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
Esempio n. 14
0
def _auth_arr(user):
    ''' Get authentication array for BoolQuery for retrieving public and
    authenticated documents.

    @param user: user passed to C{get_user_groups}.
    @return: list starting with the public-documents query, followed by a
             terms query on the user's lower-cased group names unless
             C{get_user_groups} raises L{Http404}.
    '''
    public_docs = Query.missing_terms("field", "group_name")  # all public documents
    auth_arr = [public_docs]
    try:
        group_names = [gp.lower() for gp in get_user_groups(user)]
        # all documents in the user group
        group_docs = Query.terms("group_name", group_names).query_wrap()
        auth_arr.append(group_docs)
    except Http404:
        # not logged in: only the public documents are visible
        pass
    return auth_arr
Esempio n. 15
0
 def get_studies(cls, study_ids=None, disease_code=None, sources=[], split_name=True):
     ''' Get study documents sorted alphanumerically by C{study_id}.

     A disease code, when given, takes precedence over study ids; with
     neither, all studies are matched. At most 200 documents are requested.

     @keyword study_ids: ids of the studies to fetch.
     @keyword disease_code: matched against the C{diseases} field.
     @keyword sources: passed through to L{ElasticQuery} as C{sources}.
     @keyword split_name: when true, C{study_name} is cut at the first ':'.
     @return: sorted list of study documents.
     '''
     if disease_code is not None:
         criteria = BoolQuery(must_arr=Query.term("diseases", disease_code))
     elif study_ids:
         criteria = Query.ids(study_ids)
     else:
         criteria = Query.match_all()
     studies_query = ElasticQuery(criteria, sources=sources)

     search = Search(studies_query, idx=ElasticSettings.idx('STUDY', 'STUDY'), size=200)
     studies = search.search().docs
     for doc in studies:
         full_name = getattr(doc, 'study_name') if split_name else None
         if full_name is not None:
             setattr(doc, 'study_name', full_name.split(':', 1)[0])
     return Document.sorted_alphanum(studies, "study_id")
 def test_or_filtered_query(self):
     ''' Test building and running a query filtered with an OrFilter and highlighting. '''
     highlight = Highlight(["id", "seqid"])
     bool_clause = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                       RangeQuery("end", gte=100000)])
     or_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
     (or_filter
         .extend(bool_clause)
         .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap()))
     es_query = ElasticQuery.filtered(Query.term("seqid", 1), or_filter, highlight=highlight)
     result = Search(es_query, idx=ElasticSettings.idx('DEFAULT')).search()
     self.assertTrue(result.hits_total >= 1, "Elastic filtered query retrieved marker(s)")
 def test_query_ids(self):
     ''' Test querying by ids, first with a list of ids and then with one id plus a type. '''
     both_query = ElasticQuery(Query.ids(['1', '2']))
     both_docs = Search(both_query, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
     self.assertTrue(len(both_docs) == 2, "Elastic string query retrieved marker (rs*)")

     # reuse the type of a returned document to restrict the second query
     doc_type = both_docs[0].type()
     single_query = ElasticQuery(Query.ids('2', types=doc_type))
     single_docs = Search(single_query, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
     self.assertTrue(len(single_docs) == 1, "Elastic string query retrieved marker (rs*)")
    def test_bool_nested_filter(self):
        ''' Test a bool filter that holds another bool query as one of its should clauses. '''
        inner = BoolQuery()
        (inner
            .must(Query.match("id", "rs768019142").query_wrap())
            .must(Query.term("seqid", 1)))

        outer = BoolQuery()
        (outer
            .should(inner)
            .should(Query.term("seqid", 2)))

        es_query = ElasticQuery.filtered_bool(Query.match_all(), outer, sources=["id", "seqid", "start"])
        result = Search(es_query, idx=ElasticSettings.idx('DEFAULT')).search()
        self.assertTrue(result.hits_total >= 1, "Nested bool filter query")
    def test_bool_filtered_query4(self):
        ''' Test building and running a filtered boolean query.
        Note: the match query is wrapped in a query object with query_wrap(). '''
        bool_filter = BoolQuery()
        (bool_filter
            .should(RangeQuery("start", lte=20000))
            .should(Query.term("seqid", 2))
            .must(Query.match("id", "rs768019142").query_wrap())
            .must(Query.term("seqid", 1)))

        es_query = ElasticQuery.filtered_bool(Query.match_all(), bool_filter, sources=["id", "seqid", "start"])
        result = Search(es_query, idx=ElasticSettings.idx('DEFAULT')).search()
        self.assertTrue(result.hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
 def test_bool_query(self):
     ''' Test a bool query combining must, must_not and should clauses, with highlighting. '''
     bool_clauses = BoolQuery()
     highlight = Highlight(["id", "seqid"])
     (bool_clauses
         .must(Query.term("id", "rs768019142"))
         .must(RangeQuery("start", gt=1000))
         .must_not(Query.match("seqid", "2"))
         .should(Query.match("seqid", "3"))
         .should(Query.match("seqid", "1")))
     es_query = ElasticQuery.bool(bool_clauses, highlight=highlight)
     docs = Search(es_query, idx=ElasticSettings.idx('DEFAULT')).search().docs
     self.assertTrue(len(docs) == 1, "Elastic string query retrieved marker (rs768019142)")
Esempio n. 21
0
    def get_elastic_query(cls, section=None, config=None):
        ''' Build the elastic query object for a criteria section.

        For sections other than the ones handled explicitly below, a match-all
        query limited to the configured source fields is returned, or None
        when the section configures no source fields.

        @type  section: string
        @keyword section: The section in the criteria.ini file
        @type  config:  string
        @keyword config: The config object initialized from criteria.ini.
        @return: L{Query}, or None
        '''
        section_config = config[section]

        if 'source_fields' in section_config:
            source_fields = section_config['source_fields'].split(',')
        else:
            source_fields = []

        if 'mhc' in section:
            # Defined MHC region as chr6:25,000,000..35,000,000
            seqid = '6'
            start_range = 25000000
            end_range = 35000000

            # every section with 'mhc' in its name must configure these three
            seqid_param = section_config['seqid_param']
            start_param = section_config['start_param']
            end_param = section_config['end_param']

        if section == 'is_gene_in_mhc':
            return ElasticUtils.range_overlap_query(seqid, start_range, end_range,
                                                    source_fields,
                                                    seqid_param,
                                                    start_param,
                                                    end_param)

        if section == 'is_marker_in_mhc':
            in_mhc = BoolQuery()
            (in_mhc
                .must(RangeQuery("start", lte=end_range))
                .must(RangeQuery("start", gte=start_range))
                .must(Query.term("seqid", seqid)))
            return ElasticQuery.filtered_bool(Query.match_all(), in_mhc, sources=["id", "seqid", "start"])

        if section == 'is_region_in_mhc':
            return ElasticQuery(Query.term("region_name", "MHC"))

        if section == 'marker_is_gwas_significant_in_ic':
            # range query on the p value
            gw_sig_p = 0.00000005
            return ElasticQuery(RangeQuery("p_value", lte=gw_sig_p))

        if not source_fields:
            return None
        return ElasticQuery(Query.match_all(), sources=source_fields)
Esempio n. 22
0
 def test_update_doc(self):
     ''' Update with a partial document.

     The marker rs2476601 is fetched, its start and end are updated, the
     index is refreshed and the document is fetched again to check the
     new values.
     '''
     idx = IDX['MARKER']['indexName']
     docs = Search(ElasticQuery(Query.term("id", "rs2476601"), sources=['id']), idx=idx).search().docs
     # assertEqual replaces the assertEquals alias, which was removed in Python 3.12
     self.assertEqual(len(docs), 1, "rs2476601 document")
     update_field = {"doc": {"start": 100, "end": 200}}
     Update.update_doc(docs[0], update_field)
     Search.index_refresh(idx)
     docs = Search(ElasticQuery(Query.term("id", "rs2476601")), idx=idx).search().docs
     self.assertEqual(len(docs), 1, "rs2476601 document")
     self.assertEqual(getattr(docs[0], 'start'), 100, "rs2476601 start")
     self.assertEqual(getattr(docs[0], 'end'), 200, "rs2476601 end")
Esempio n. 23
0
def show_disease(disease, scores, text=True, selected=None, href="/disease/"):
    ''' Template inclusion tag to render disease bar.

    @param disease: a disease document, or a string. The string 'OD' gives a
                    built-in "Other Diseases" document; any other string is
                    lower-cased and looked up by code or name in the DISEASE index.
    @param scores: sequence whose first element is used as the score; an
                   empty value (for example '', an empty list or None) gives ''.
    @keyword text: passed through to the template context.
    @keyword selected: passed through to the template context.
    @keyword href: passed through to the template context.
    @return: template context dict with disease, score, text, selected and href.
    '''
    if isinstance(disease, str):
        if disease == 'OD':
            disease = Document({"_source": {"code": "OD", "colour": "grey", "name": "Other Diseases"}})
        else:
            query = ElasticQuery(BoolQuery(should_arr=[Query.term('code', disease.lower()),
                                                       Query.term('name', disease.lower())]))
            # NOTE(review): raises IndexError when no disease matches - confirm
            # whether callers rely on that.
            disease = Search(query, idx=ElasticSettings.idx('DISEASE'), size=1).search().docs[0]
    # Only index into scores when there is something there; an empty list or
    # None used to raise instead of rendering an empty score.
    score = scores[0] if scores else ''
    return {'disease': disease, 'score': score, 'text': text, 'selected': selected, 'href': href}
Esempio n. 24
0
    def post(self, request, *args, **kwargs):
        ''' Return the study hits for a gene, a marker or a list of markers as JSON.

        The POST parameters are checked in the order C{ens_id}, C{marker},
        C{markers[]}; the first one present is used to filter the
        REGION/STUDY_HITS index. In every returned hit:
          - C{genes} becomes a dict mapping each Ensembl id to its gene symbol,
            or to the id itself when no gene document is found;
          - C{pmid} becomes a dict with the pmid and, when the publication is
            found, the last word of the first author's name and the journal.

        @return: L{JsonResponse} with the hits, or a 400 L{JsonResponse} with
                 an C{error} message when none of the parameters is posted.
        '''
        ens_id = self.request.POST.get('ens_id')
        marker = self.request.POST.get('marker')
        markers = self.request.POST.getlist('markers[]')

        if ens_id:
            sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
        elif marker:
            sfilter = Filter(Query.query_string(marker, fields=["marker"]).query_wrap())
        elif markers:
            sfilter = Filter(Query.query_string(' '.join(markers), fields=["marker"]).query_wrap())
        else:
            # Nothing to search for; previously this fell through and failed
            # on the unbound filter.
            return JsonResponse({'error': "One of 'ens_id', 'marker' or 'markers[]' is required."},
                                status=400)

        query = ElasticQuery.filtered(Query.match_all(), sfilter)
        elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
        study_hits = elastic.get_json_response()['hits']

        # collect every gene and publication id so each is looked up once
        ens_ids = []
        pmids = []
        for hit in study_hits['hits']:
            source = hit['_source']
            if 'pmid' in source:
                pmids.append(source['pmid'])
            if 'genes' in source:
                ens_ids.extend(source['genes'])
        docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
        pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])

        for hit in study_hits['hits']:
            source = hit['_source']
            genes = {}
            if 'genes' in source:
                for gene_id in source['genes']:
                    try:
                        genes[gene_id] = getattr(docs[gene_id], 'symbol')
                    except KeyError:
                        # No gene document: label the gene with its id. Add to
                        # the dict rather than rebinding it, so symbols already
                        # resolved for this hit are kept.
                        genes[gene_id] = gene_id
            source['genes'] = genes
            if 'pmid' in source:
                pmid = source['pmid']
                try:
                    authors = getattr(pub_docs[pmid], 'authors')
                    journal = getattr(pub_docs[pmid], 'journal')
                    source['pmid'] = \
                        {'pmid': pmid,
                         'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                         'journal': journal}
                except KeyError:
                    source['pmid'] = {'pmid': pmid}

        return JsonResponse(study_hits)
Esempio n. 25
0
def _build_exon_query(chrom, segmin, segmax, genes):
    ''' Fetch the exons of each gene and return them keyed by gene id.

    Each gene is looked up in the exons index by name, restricted to documents
    whose seqid matches C{chrom}. The exon 'start'/'end' fields are passed
    through C{utils.makeRelative} with the segment bounds and the exons are
    then ordered by 'start'.

    @param chrom: sequence id the exon documents must match.
    @param segmin: lower segment bound (converted with int()).
    @param segmax: upper segment bound (converted with int()).
    @type  genes: sized collection of dicts
    @param genes: gene records, each providing a "gene_id" key.
    @return: dict of gene_id to the list of its exons sorted by "start".
    '''
    exons_by_gene = {}
    seqid_bool = BoolQuery()
    seqid_bool.must([Query.term("seqid", chrom)])
    if len(genes) == 0:
        return exons_by_gene

    by_start = operator.itemgetter("start")
    for gene in genes:
        gene_id = gene["gene_id"]
        name_query = Query.query_string(gene_id, fields=["name"])
        exon_query = ElasticQuery.filtered_bool(name_query, seqid_bool, sources=utils.snpFields)
        exon_idx = getattr(chicp_settings, 'CP_GENE_IDX')+'/exons/'
        exon_search = Search(exon_query, idx=exon_idx, search_from=0, size=2000)
        found = exon_search.get_result()['data']
        relative = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], found)
        exons_by_gene[gene_id] = sorted(relative, key=by_start)
    return exons_by_gene
Esempio n. 26
0
    def pad_region_doc(cls, region):
        '''Adds details of disease_loci & hits for a given region doc.

        Searches the study hits index for hits whose disease_locus is one of
        the region's disease loci (combined with a missing-terms filter on
        group_name) and sets the following attributes on the region:
        build_info, hits, markers, genes, studies and pmids.

        @param region: region doc providing "disease_loci" and "seqid".
        @return: the same region object with the extra attributes set.
        '''
        hits_idx = ElasticSettings.idx('REGION', 'STUDY_HITS')

        disease_loci = getattr(region, "disease_loci")

        # nested aggregation over build_info, restricted to build 38, giving
        # the minimum start and maximum end over the matching hits
        locus_start = Agg('region_start', 'min', {'field': 'build_info.start'})
        locus_end = Agg('region_end', 'max', {'field': 'build_info.end'})
        match_agg = Agg('filtered_result', 'filter', Query.match("build_info.build", 38).query_wrap(),
                        sub_agg=[locus_start, locus_end])
        build_info_agg = Agg('build_info', 'nested', {"path": 'build_info'}, sub_agg=[match_agg])

        query = ElasticQuery(FilteredQuery(Query.terms("disease_locus", disease_loci),
                                           Filter(BoolQuery(should_arr=[Query.missing_terms("field", "group_name")]
                                                            ))))
        resultObj = Search(search_query=query, idx=hits_idx, aggs=Aggs(build_info_agg)).search()

        hit_ids = []
        markers = []
        genes = []
        studies = []
        pmids = []
        for doc in resultObj.docs:
            hit_ids.append(doc.doc_id())
            markers.append(getattr(doc, "marker"))
            # genes is optional on a hit and may also be present but unset
            doc_genes = getattr(doc, "genes", None)
            if doc_genes is not None:
                genes.extend(doc_genes)
            studies.append(getattr(doc, "dil_study_id"))
            pmids.append(getattr(doc, "pmid"))

        # NOTE(review): int() fails if the aggregation values are None, which
        # presumably happens when no hit matched - confirm against callers.
        build_info = getattr(resultObj.aggs['build_info'], 'filtered_result')
        region_start = int(build_info['region_start']['value'])
        region_end = int(build_info['region_end']['value'])

        build_info = {
            'build': 38,
            'seqid': getattr(region, "seqid"),
            'start': region_start,
            'end': region_end
        }
        setattr(region, "build_info", build_info)
        setattr(region, "hits", hit_ids)
        # de-duplicate; the resulting order is not defined
        setattr(region, "markers", list(set(markers)))
        setattr(region, "genes", list(set(genes)))
        setattr(region, "studies", list(set(studies)))
        setattr(region, "pmids", list(set(pmids)))

        return region
    def test_pubs_disease_tags(self):
        ''' Compare, for each disease, the number of publications tagged with the
        disease in the index against the number of PMIDs from _get_pmids and
        report any differences. '''
        counts_match = True
        report_lines = []
        for disease in DiseasePublicationTest.DISEASES:
            expected_pmids = self._get_pmids(disease)
            disease_code = disease.lower()
            tag_filter = Filter(Query.term('tags.disease', disease_code))
            tag_query = ElasticQuery(BoolQuery(b_filter=tag_filter), sources=['pmid'])
            elastic = Search(search_query=tag_query,
                             idx=ElasticSettings.idx('PUBLICATION'), size=len(expected_pmids)*2)
            res = elastic.get_count()
            line = disease_code+'\tINDEX: '+str(res['count'])+'\tNCBI: '+str(len(expected_pmids))
            if res['count'] != len(expected_pmids):
                counts_match = False
                indexed_pmids = [getattr(doc, 'pmid') for doc in elastic.search().docs]
                # PMIDs only found in the index / only found in the expected list
                extra = [pmid for pmid in indexed_pmids if pmid not in expected_pmids]
                missing = [pmid for pmid in expected_pmids if pmid not in indexed_pmids]
                if len(extra) > 0:
                    line += '\textra PMIDs: '+str(extra)
                if len(missing) > 0:
                    line += '\tmissing PMIDs: '+str(missing)
            report_lines.append(line + '\n')

        print(''.join(report_lines))
        self.assertTrue(counts_match, 'Count for disease tags')
Esempio n. 28
0
    def filter_queryset(self, request, queryset, view):
        ''' Override this method to request just the documents required from Rserve.

        The 'marker' (default 'rs2476601') and 'dataset' (default 'EUR')
        request parameters are used if they are listed in the view's
        filter_fields. The marker's seqid is looked up in the marker index and
        passed with the dataset and marker to the Rserve get_pop function.

        @return: single element list holding an ElasticObject with the marker
                 and its populations; populations is None if the lookup fails
                 with a TypeError, ValueError, IndexError or ConnectionError.
        '''
        # resolved outside the try block so that the marker is always
        # available to the fallback response in the except clause
        filterable = getattr(view, 'filter_fields', [])
        filters = {k: v for k, v in request.GET.items() if k in filterable}
        mid1 = filters.get('marker', 'rs2476601')
        try:
            dataset = filters.get('dataset', 'EUR').replace('-', '')
            query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]), sources=['seqid', 'start'])
            elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=1)
            # IndexError if the marker is not found is handled below
            doc = elastic.search().docs[0]
            seqid = getattr(doc, 'seqid')

            rserve = getattr(settings, 'RSERVE')
            conn = pyRserve.connect(host=rserve.get('HOST'), port=rserve.get('PORT'))
            try:
                pop_str = conn.r.get_pop(dataset, seqid, mid1)
                pops = json.loads(str(pop_str))
            finally:
                # always release the connection, also when the call or parsing fails
                conn.close()

            populations = []
            for pop in pops:
                pops[pop]['population'] = pop
                populations.append(pops[pop])
            return [ElasticObject(initial={'populations': populations, 'marker': mid1})]
        except (TypeError, ValueError, IndexError, ConnectionError):
            return [ElasticObject(initial={'populations': None, 'marker': mid1})]
 def test_error(self):
     ''' Check that invalid function score queries and invalid score
     functions raise a QueryError. '''
     score_function = ScoreFunction.create_score_function('field_value_factor', field='start')
     create = ScoreFunction.create_score_function
     # (callable, positional arguments, keyword arguments) expected to fail
     invalid_calls = [
         (FunctionScoreQuery, ('test_not_query', [score_function]), {}),
         (FunctionScoreQuery, (Query.match_all(), ['test_not_function_score']), {}),
         (create, ('blah',), {}),
         (create, ('field_value_factor',), {'random_scoress': 'val'}),
         (create, ('field_value_factor',), {'field': 10}),
     ]
     for func, args, kwargs in invalid_calls:
         self.assertRaises(QueryError, func, *args, **kwargs)
Esempio n. 30
0
    def get_disease_tags(cls, feature_id, idx=None, idx_type=None):
        ''' Get the aggregated list of disease tags for a given feature id,
        aggregated from all criteria types for a feature type.

        @type  feature_id: string
        @param feature_id: Id of the feature (gene => gene_id, region => region_id)
        @type  idx: string
        @keyword idx: name of the index
        @type  idx_type: string
        @keyword idx_type: name of the idx type, each criteria is an index type
        @return: list of diseases (core diseases followed by the others) from
                 Disease.get_site_diseases; an empty list if the search or the
                 aggregation lookup fails; None if no disease tags are found.
        '''
        query = ElasticQuery(Query.term("qid", feature_id))
        aggs = Aggs(Agg("criteria_disease_tags", "terms", {"field": "disease_tags", "size": 0}))

        if idx_type:
            search = Search(query, aggs=aggs, idx=idx, idx_type=idx_type)
        else:
            search = Search(query, aggs=aggs, idx=idx)

        try:
            r_aggs = search.search().aggs
            buckets = r_aggs['criteria_disease_tags'].get_buckets()
            disease_tags = [dis_dict['key'].lower() for dis_dict in buckets]
        except Exception:
            # best effort lookup: any search/aggregation failure gives no tags,
            # without swallowing KeyboardInterrupt/SystemExit as a bare except would
            return []

        if not disease_tags:
            return None

        # get disease docs
        (core, other) = Disease.get_site_diseases(dis_list=disease_tags)
        diseases = list(core)
        diseases.extend(other)
        return diseases
Esempio n. 31
0
 def get_gene_docs_by_ensembl_id(cls, ens_ids, sources=None):
     ''' Fetch the gene documents for an array of ensembl IDs.

     @type  ens_ids: list
     @param ens_ids: ensembl IDs used as the document ids to retrieve.
     @keyword sources: _source filtering passed to the query (default: None).
     @return: dictionary of document id to gene document.
     '''
     gene_idx = ElasticSettings.idx('GENE', idx_type='GENE')
     id_query = ElasticQuery(Query.ids(ens_ids), sources=sources)
     found = Search(id_query, idx=gene_idx, size=len(ens_ids)).search().docs
     docs_by_id = {}
     for gene_doc in found:
         docs_by_id[gene_doc.doc_id()] = gene_doc
     return docs_by_id
 def test_missing_terms_filtered_query(self):
     ''' Run a match all query filtered by a missing terms filter on
     group_name and check that three documents are returned. '''
     missing_group = TermsFilter.get_missing_terms_filter("field", "group_name")
     filtered = ElasticQuery.filtered(Query.match_all(), missing_group)
     found = Search(filtered, idx=ElasticSettings.idx('DEFAULT')).search().docs
     self.assertTrue(len(found) == 3, "Elastic string query retrieved all public docs")
    def filter_queryset(self, request, queryset, view):
        ''' Override this method to request just the documents required from elastic.

        The index (or indices) searched is resolved by self._get_index from
        the 'feature_type' request parameter (default 'GENE_CRITERIA'); the
        parameter is only used if it is listed in the view's filter_fields.
        The total number of hits is stored on the view as es_count.

        @return: list of ElasticObject for the requested page, each with uuid
                 and criteria_type set from the hit's _id and _type.
        '''
        q_size = view.paginator.get_limit(request)
        q_from = view.paginator.get_offset(request)

        filterable = getattr(view, 'filter_fields', [])
        filters = {k: v for k, v in request.GET.items() if k in filterable}
        criteria_idx = self._get_index(filters.get('feature_type', 'GENE_CRITERIA'))

        # several index keys are searched together as a comma separated list
        if isinstance(criteria_idx, list):
            idx = ','.join(ElasticSettings.idx(name) for name in criteria_idx)
        else:
            idx = ElasticSettings.idx(criteria_idx)

        q = ElasticQuery(Query.match_all())
        s = Search(search_query=q, idx=idx, size=q_size, search_from=q_from)
        json_results = s.get_json_response()
        results = []
        for result in json_results['hits']['hits']:
            new_obj = ElasticObject(initial=result['_source'])
            new_obj.uuid = result['_id']
            new_obj.criteria_type = result['_type']
            results.append(new_obj)
        view.es_count = json_results['hits']['total']
        return results
Esempio n. 34
0
    def gene_mgi_parse(cls, gene_pubs, idx):
        ''' Parse Ensembl and MGI data from JAX and add the MGI ids to the index.

        Each input line is tab separated with the MGI id in the first column
        and the mouse ensembl gene id in the sixth. Documents referencing one
        of these ensembl ids as mouse ortholog get the MGI id (without the
        'MGI:' prefix) added, using bulk updates in chunks of 450 ids.

        @param gene_pubs: iterable of tab separated lines.
        @param idx: name of the index to search and update.
        @raise PipelineError: if a line lacks 'MGI:' in the first column or
                              'ENSMUSG' in the sixth.
        '''
        mgi_by_ensmusg = {}
        for line in gene_pubs:
            cols = line.split('\t')
            mgi_col = cols[0]
            if 'MGI:' not in mgi_col:
                raise PipelineError('MGI not found '+mgi_col)
            ens_col = cols[5]
            if 'ENSMUSG' not in ens_col:
                raise PipelineError('ENSMUSG not found '+ens_col)
            mgi_by_ensmusg[ens_col] = mgi_col.replace('MGI:', '')

        ensmusg_ids = list(mgi_by_ensmusg.keys())
        chunk_size = 450
        for offset in range(0, len(ensmusg_ids), chunk_size):
            id_chunk = ensmusg_ids[offset:offset+chunk_size]
            match_all = Query.match_all()
            ortholog_filter = TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", id_chunk)
            query = ElasticQuery.filtered(match_all, ortholog_filter)

            bulk_lines = []
            for doc in Search(query, idx=idx, size=chunk_size).search().docs:
                ens_id = doc.doc_id()
                idx_type = doc.type()
                mm = getattr(doc, 'dbxrefs')['orthologs']['mmusculus']
                mm['MGI'] = mgi_by_ensmusg[mm['ensembl']]
                action = {"update": {"_id": ens_id, "_type": idx_type,
                                     "_index": idx, "_retry_on_conflict": 3}}
                partial_doc = {'doc': {"dbxrefs": {'orthologs': {"mmusculus": mm}}}}
                bulk_lines.append(json.dumps(action) + '\n')
                bulk_lines.append(json.dumps(partial_doc) + '\n')

            json_data = ''.join(bulk_lines)
            if json_data != '':
                # idx_type is the type of the last document in the chunk
                Loader().bulk_load(idx, idx_type, json_data)
Esempio n. 35
0
    def gene2ensembl_parse(cls, gene2ens, idx, idx_type):
        ''' Parse gene2ensembl file from NCBI and add entrez to gene index.

        Only lines starting with '9606' followed by a tab are used; the second
        column is taken as the entrez gene id and the third as the ensembl id
        (the first entrez id seen for an ensembl id is kept). The documents
        found by ensembl id are updated in bulk, 450 documents per request.

        @param gene2ens: iterable of tab separated lines.
        @param idx: name of the index to search and update.
        @param idx_type: index type to search.
        '''
        entrez_by_ens = {}
        for line in gene2ens:
            if not line.startswith('9606\t'):
                continue
            cols = line.split('\t')
            entrez_by_ens.setdefault(cols[2], {'dbxrefs': {'entrez': cols[1]}})

        id_query = ElasticQuery(Query.ids(list(entrez_by_ens.keys())))
        docs = Search(id_query, idx=idx, idx_type=idx_type, size=80000).search().docs

        chunk_size = 450
        for offset in range(0, len(docs), chunk_size):
            bulk_lines = []
            for doc in docs[offset:offset+chunk_size]:
                ens_id = doc._meta['_id']
                doc_type = doc.type()
                action = {"update": {"_id": ens_id, "_type": doc_type,
                                     "_index": idx, "_retry_on_conflict": 3}}
                bulk_lines.append(json.dumps(action) + '\n')
                bulk_lines.append(json.dumps({'doc': entrez_by_ens[ens_id]}) + '\n')
            json_data = ''.join(bulk_lines)
            if json_data != '':
                # loaded using the type of the last document in the chunk
                Loader().bulk_load(idx, doc_type, json_data)
Esempio n. 36
0
    def gene2ensembl_parse(cls, gene2ens, idx, idx_type):
        ''' Parse gene2ensembl file from NCBI and add entrez to gene index.

        Only lines starting with '9606' followed by a tab are used; the second
        column is taken as the entrez gene id and the third as the ensembl id
        (the first entrez id seen for an ensembl id is kept). The documents are
        retrieved by ensembl id with a scan and scroll and each batch of hits
        is updated in bulk, 450 documents per request.

        @param gene2ens: iterable of tab separated lines.
        @param idx: name of the index to scan and update.
        @param idx_type: index type to scan.
        '''
        entrez_by_ens = {}
        for line in gene2ens:
            if not line.startswith('9606\t'):
                continue
            cols = line.split('\t')
            entrez_by_ens.setdefault(cols[2], {'dbxrefs': {'entrez': cols[1]}})

        def process_hits(resp_json):
            ''' Bulk update the documents of one batch of scan and scroll hits. '''
            docs = [Document(hit) for hit in resp_json['hits']['hits']]
            chunk_size = 450
            for offset in range(0, len(docs), chunk_size):
                bulk_lines = []
                for doc in docs[offset:offset+chunk_size]:
                    ens_id = doc._meta['_id']
                    doc_type = doc.type()
                    action = {"update": {"_id": ens_id, "_type": doc_type,
                                         "_index": idx, "_retry_on_conflict": 3}}
                    bulk_lines.append(json.dumps(action) + '\n')
                    bulk_lines.append(json.dumps({'doc': entrez_by_ens[ens_id]}) + '\n')
                json_data = ''.join(bulk_lines)
                if json_data != '':
                    # loaded using the type of the last document in the chunk
                    Loader().bulk_load(idx, doc_type, json_data)

        id_query = ElasticQuery(Query.ids(list(entrez_by_ens.keys())))
        ScanAndScroll.scan_and_scroll(idx, idx_type=idx_type, call_fun=process_hits, query=id_query)
Esempio n. 37
0
def get_criteria(docs, doc_type, doc_attr, idx_type_key):
    """ Return a dictionary of gene name:criteria.

    The lower-cased C{doc_attr} value of every doc of type C{doc_type} is
    searched for in the "Name" field of the criteria index. For each document
    found, every attribute (sorted by name) whose value does not start with
    "0" gives a single entry dictionary of the attribute name (without "_Hs")
    to the value split on ":". The "OD_Hs" attribute is appended last, with
    the key "OD".

    @param docs: documents to take the gene names from.
    @param doc_type: only docs of this type are used.
    @param doc_attr: name of the doc attribute holding the gene name.
    @param idx_type_key: criteria index type key for the ElasticSettings lookup.
    @return: dictionary of gene name to a list of criteria dictionaries; an
             empty dictionary if the criteria index is not configured.
    """
    genes = [getattr(doc, doc_attr).lower() for doc in docs if doc.type() == doc_type]
    criteria_idx = ElasticSettings.idx("CRITERIA", idx_type_key)
    if criteria_idx is None:
        return {}

    sources = {"exclude": ["Primary id", "Object class", "Total score"]}
    res = Search(
        ElasticQuery(Query.terms("Name", genes), sources=sources), idx=criteria_idx, size=len(genes)
    ).search()

    # handled separately: the name, the document metadata and OD (added last)
    skipped = ("Name", "_meta", "OD_Hs")
    criteria = {}
    for doc in res.docs:
        gene_name = getattr(doc, "Name")
        gene_criteria = [
            {attr.replace("_Hs", ""): value.split(":")}
            for attr, value in sorted(doc.__dict__.items(), key=lambda t: t[0])
            if attr not in skipped and not value.startswith("0")
        ]
        if hasattr(doc, "OD_Hs") and not getattr(doc, "OD_Hs").startswith("0"):
            gene_criteria.append({"OD": getattr(doc, "OD_Hs").split(":")})
        criteria[gene_name] = gene_criteria

    return criteria
Esempio n. 38
0
def _get_pub_docs_by_pmid(pmids, sources=None):
    """ Get the publication documents for an array of PMIDs.

    @param pmids: PMIDs used as the document ids to retrieve.
    @keyword sources: _source filtering passed to the query (default: None).
    @return: dictionary of document id to publication document.
    """
    pub_idx = ElasticSettings.idx("PUBLICATION")
    id_query = ElasticQuery(Query.ids(pmids), sources=sources)
    found = Search(id_query, idx=pub_idx, size=len(pmids)).search().docs
    docs_by_id = {}
    for pub_doc in found:
        docs_by_id[pub_doc.doc_id()] = pub_doc
    return docs_by_id
Esempio n. 39
0
 def _get_random_marker(self):
     ''' Return the id of a random marker from the marker index.

     The marker is picked from the docs of a randomly chosen seqid (1 to 10)
     with a tags.weight of at least 80. '''
     marker_idx = ElasticSettings.idx('MARKER', 'MARKER')
     (idx, idx_type) = marker_idx.split('/')
     seqid = random.randint(1, 10)
     must_clauses = [Query.term("seqid", seqid), RangeQuery("tags.weight", gte=80)]
     rdm_docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=BoolQuery(must_arr=must_clauses),
                                          sources=['id', 'start'], size=1)
     return getattr(rdm_docs[0], 'id')
Esempio n. 40
0
    def _check_gene_history(cls, gene_sets, config):
        ''' Look up the given gene ids as discontinued ids in the gene history.

        @param gene_sets: gene ids matched against the discontinued_geneid field.
        @param config: configuration providing the 'GENE_HISTORY' section with
                       its 'index' and 'index_type'.
        @return: tuple of (dictionary of discontinued gene id to the gene id
                 recorded for it, list of discontinued gene ids with no gene id).
        '''
        history = config['GENE_HISTORY']
        replaced_by = {}
        without_replacement = []

        def process_hits(resp_json):
            ''' Sort the hits of one scan and scroll batch into the two results. '''
            for doc in list(map(Document, resp_json['hits']['hits'])):
                geneid = getattr(doc, 'geneid')
                old_geneid = str(getattr(doc, 'discontinued_geneid'))
                if geneid is not None:
                    replaced_by[old_geneid] = str(geneid)
                else:
                    without_replacement.append(old_geneid)

        history_query = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
            sources=['geneid', 'discontinued_geneid'])
        ScanAndScroll.scan_and_scroll(history['index'], idx_type=history['index_type'],
                                      call_fun=process_hits, query=history_query)

        return (replaced_by, without_replacement)
Esempio n. 41
0
    def test_bulk(self):
        ''' Test the Bulk.load().

        Checks that a bulk request without a trailing line return is not
        accepted, that a valid request adds one document and that updating a
        document id that does not exist is reported as an error. '''
        self.set_up()
        idx = IDX['MARKER']['indexName']
        elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
        hits_total1 = elastic.get_count()['count']

        json_data = '{"index": {"_index": "%s", "_type": "%s"}}\n' % \
                    (idx, 'marker')
        json_data += json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                                 "ref": "A", "id": "rsXXXXX", "qual": ".", "info": "RS=XXXXX"})
        resp = Bulk.load(idx, '', json_data)
        # assertNotEqual/assertEqual: the assertNotEquals/assertEquals aliases
        # are deprecated and no longer available in Python 3.12
        self.assertNotEqual(resp.status_code, 200)

        # note: needs a trailing line return to work
        Bulk.load(idx, '', json_data + '\n')
        Search.index_refresh(idx)
        hits_total2 = elastic.get_count()['count']
        self.assertEqual(hits_total2, hits_total1+1, "contains documents")

        # produce errors updating doc id that doesn't exist
        json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
                     (idx, 'marker', 'XYZ')
        json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
                     (idx, 'marker', 'XYZ')
        json_data += '{"doc": {"start": 100, "end": 200}}\n'
        resp = Bulk.load(idx, '', json_data)
        self.assertTrue('errors' in resp.json() and resp.json()['errors'])
Esempio n. 42
0
    def _ensembl_entrez_lookup(cls, ensembl_gene_sets, section):
        ''' Build a dictionary of document id to entrez id for the documents
        whose dbxrefs.ensembl is one of the given ensembl ids.

        @param ensembl_gene_sets: ensembl gene ids to look up.
        @param section: configuration section providing the 'index' to search.
        '''
        ensembl_filter = TermsFilter.get_terms_filter("dbxrefs.ensembl", ensembl_gene_sets)
        equery = ElasticQuery.filtered(Query.match_all(), ensembl_filter,
                                       sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
        search = Search(equery, idx=section['index'], size=len(ensembl_gene_sets))

        entrez_by_id = {}
        for doc in search.search().docs:
            entrez_by_id[doc.doc_id()] = getattr(doc, 'dbxrefs')['entrez']
        return entrez_by_id
Esempio n. 43
0
 def test_doc(self):
     ''' Check that documents retrieved from the gene index are returned as
     GeneDocument instances. '''
     gene_settings = PydginTestSettings.IDX['GENE']
     idx = gene_settings['indexName']
     idx_type = gene_settings['indexType']
     symbol_query = ElasticQuery(Query.match_all(), sources=['symbol'])
     search = Search(search_query=symbol_query, idx=idx, idx_type=idx_type, size=2)
     for doc in search.search().docs:
         self.assertTrue(isinstance(doc, GeneDocument))
Esempio n. 44
0
    def get_overlapping_hits(self, build, seqid, start, end):
        ''' Get the study hits (tier 2 or lower) overlapping the given range.

        A hit matches if, for the given build and seqid, its build_info start
        or its build_info end lies within start-end, or if it spans the whole
        range (starting at or before start and ending at or after end).

        @return: documents found in the study hits index.
        '''
        starts_within = RangeQuery("build_info.start", gte=start, lte=end)
        ends_within = RangeQuery("build_info.end", gte=start, lte=end)
        spans_range = BoolQuery(must_arr=[RangeQuery("build_info.start", lte=start),
                                          RangeQuery("build_info.end", gte=end)])
        overlap_filter = OrFilter(starts_within)
        overlap_filter.extend(ends_within).extend(spans_range)

        on_seqid_and_build = BoolQuery(must_arr=[Query.term("build_info.seqid", seqid),
                                                 Query.term("build_info.build", build)])
        range_query = FilteredQuery(on_seqid_and_build, overlap_filter)

        tier_bool = BoolQuery(must_arr=[RangeQuery("tier", lte=2)])
        hits_query = ElasticQuery.filtered_bool(Query.nested("build_info", range_query), tier_bool)
        hits_idx = ElasticSettings.idx('REGION', 'STUDY_HITS')
        return Search(search_query=hits_query, idx=hits_idx).search().docs
Esempio n. 45
0
 def get_publications(cls, pmids, sources=None):
     ''' Get publications from the list of PMIDs.

     @param pmids: PMIDs used as the document ids to retrieve.
     @type  sources: array of result fields
     @keyword sources: The _source filtering to be used (default: None,
                       treated as an empty list).
     @return: publication documents, or None if no PMIDs are given.
     '''
     if not pmids:
         return None
     if sources is None:
         # avoid sharing one mutable default list between calls
         sources = []
     from elastic.search import Search, ElasticQuery
     # NOTE(review): size=2 limits the result to two publications whatever
     # the number of PMIDs passed in - confirm this is intended.
     pubs = Search(ElasticQuery(Query.ids(pmids), sources=sources),
                   idx=ElasticSettings.idx('PUBLICATION', 'PUBLICATION'), size=2).search().docs
     return pubs
Esempio n. 46
0
def _get_query_filters(q_dict, user):
    ''' Build query bool filter. If biotypes are specified add them to the filter and
    allow for other non-gene types.
    @type  q_dict: dict
    @param q_dict: request dictionary.
    '''
    if not q_dict.getlist("biotypes"):
        return None

    query_bool = BoolQuery()
    if q_dict.getlist("biotypes"):
        query_bool.should(Query.terms("biotype", q_dict.getlist("biotypes")))
        type_filter = [Query.query_type_for_filter(ElasticSettings.search_props(c.upper(), user)['idx_type'])
                       for c in q_dict.getlist("categories") if c != "gene"]
        if len(type_filter) > 0:
            query_bool.should(type_filter)
    return Filter(query_bool)
Esempio n. 47
0
    def get_rdm_feature_id(cls, idx, idx_type, qbool=Query.match_all(), sources=None, field=None):
        ''' Get a random feature id from the indices.

        @param idx: index passed to get_rdm_docs.
        @param idx_type: index type passed to get_rdm_docs.
        @keyword qbool: query used to select the docs (default: match all).
        @keyword sources: _source filtering (default: None, treated as an
                          empty list).
        @keyword field: if given, the value of this doc attribute is returned
                        in place of the doc id.
        @return: the doc id, or the value of C{field}, of one random doc.
        '''
        if sources is None:
            # avoid sharing one mutable default list between calls
            sources = []
        doc = cls.get_rdm_docs(idx, idx_type, qbool=qbool, sources=sources, size=1)[0]

        if field is not None:
            return getattr(doc, field)

        return doc.doc_id()
 def test_sort_query(self):
     ''' Check sorting with a Sort built from a string and from a dictionary,
     and that an invalid Sort argument raises a QueryError. '''
     match_all = ElasticQuery(Query.match_all())
     sort_specs = ('start:asc,_score',
                   {"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
     for spec in sort_specs:
         search = Search(match_all, idx=ElasticSettings.idx('DEFAULT'), qsort=Sort(spec))
         self._check_sort_order(search.search().docs)
     self.assertRaises(QueryError, Sort, 1)
Esempio n. 49
0
    def query_string(cls, query_term, sources=None, highlight=None, query_filter=None, **string_opts):
        ''' Factory method building an elastic Query String Query, wrapped in
        a filtered query when a filter is supplied.

        @type  query_term: string
        @param query_term: The string to use in the query.
        @type  sources: array of result fields
        @keyword sources: The _source filtering to be used (default: None).
        @type  highlight: Highlight
        @keyword highlight: Define the highlighting of results (default: None).
        @type query_filter: Filter
        @keyword query_filter: Optional filter for query.
        @keyword string_opts: Extra options passed on to Query.query_string.
        @return: L{ElasticQuery}
        '''
        string_query = Query.query_string(query_term, **string_opts)
        if query_filter is not None:
            string_query = FilteredQuery(string_query, query_filter)
        return cls(string_query, sources, highlight)
Esempio n. 50
0
    def test_get_rdm_feature_id(self):
        ''' Check that a random feature id is a string and that searching the
        index for that id returns exactly one document. '''
        gff_settings = IDX['GFF_GENERIC']
        idx = gff_settings['indexName']
        rdm_id = ElasticUtils.get_rdm_feature_id(idx, gff_settings['indexType'])

        self.assertTrue(isinstance(rdm_id, str), 'Document id')
        id_search = Search(ElasticQuery(Query.ids(rdm_id)), idx=idx)
        found = id_search.search().docs
        self.assertTrue(len(found) == 1, 'Document retrieved')
Esempio n. 51
0
    def test_delete_docs_by_query(self):
        ''' Test deleting docs using a query.

        A single marker is deleted with a term query first, then the
        remaining documents of the 'marker' type with the default query. '''
        self.set_up()
        idx = IDX['MARKER']['indexName']
        elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
        hits_total1 = elastic.get_count()['count']
        self.assertGreater(hits_total1, 0, "contains documents")

        # delete single doc
        Delete.docs_by_query(idx, query=Query.term("id", "rs2476601"))
        Search.index_refresh(idx)
        hits_total2 = elastic.get_count()['count']
        # assertEqual: the assertEquals alias is deprecated and no longer
        # available in Python 3.12
        self.assertEqual(hits_total2, hits_total1-1, "contains documents")

        # delete remaining docs
        Delete.docs_by_query(idx, 'marker')
        Search.index_refresh(idx)
        self.assertEqual(elastic.get_count()['count'], 0, "contains no documents")
 def test_terms_query(self):
     ''' Test building and running a terms query with highlighting. '''
     marker_ids = ["rs2476601", "rs768019142"]
     terms_query = ElasticQuery(Query.terms("id", marker_ids), highlight=Highlight(["id"]))
     found = Search(terms_query, idx=ElasticSettings.idx('DEFAULT')).search().docs
     self.assertTrue(len(found) == 2,
                     "Elastic string query retrieved markers (rs2476601, rs768019142)")
     first_doc = found[0]
     self.assertTrue(getattr(first_doc, 'seqid'), "Hit attribute found")
     self.assertTrue(first_doc.highlight() is not None, "highlighting found")
Esempio n. 53
0
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Search frags_idx for fragments in a segment and return them via utils.makeRelative.

    Matches documents whose "seqid" is either chrom or "chr" + chrom and whose
    "end" lies between segmin and segmax (inclusive). Despite the name, the
    search is executed here and the processed 'data' of the result is returned.
    '''
    seqid_terms = Query.terms("seqid", [chrom, "chr" + chrom])
    end_in_segment = Filter(RangeQuery("end", gte=segmin, lte=segmax))
    # utils.bedFields is handed to ElasticQuery.filtered as its third positional argument.
    frags_query = ElasticQuery.filtered(seqid_terms, end_in_segment, utils.bedFields)

    search = Search(search_query=frags_query, search_from=0, size=2000000, idx=frags_idx)
    raw_frags = search.get_result()['data']
    return utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], raw_frags)
Esempio n. 54
0
 def test_search_count(self):
     ''' Compare the plain index count with the count obtained for a search query. '''
     gff_settings = IDX['GFF_GENERIC']
     idx, idx_type = gff_settings['indexName'], gff_settings['indexType']

     total = ElasticUtils.get_docs_count(idx, idx_type)
     self.assertGreater(total, 0, 'index count')

     # Counting with a must-not term on seqid 'chr1' has to give a smaller number.
     not_chr1 = BoolQuery(must_not_arr=[Query.term('seqid', 'chr1')])
     filtered_total = ElasticUtils.get_docs_count(idx, idx_type,
                                                  search_query=ElasticQuery(not_chr1))
     self.assertGreater(total, filtered_total, 'search query count')
 def test_function_score_query(self):
     ''' Run a function score query (field_value_factor on 'start') and check hit ordering. '''
     start_score = ScoreFunction.create_score_function('field_value_factor', field='start',
                                                       modifier='reciprocal')
     rs_query = Query.query_string("rs*", fields=["id", "seqid"])
     scored = FunctionScoreQuery(rs_query, [start_score], boost_mode='replace')
     hits = Search(ElasticQuery(scored), idx=ElasticSettings.idx('DEFAULT')).search().docs
     self.assertGreater(len(hits), 1, str(len(hits)))

     # Each hit must have a strictly larger 'start' than the one before it.
     previous = 0
     for hit in hits:
         current = hit.start
         self.assertLess(previous, current)
         previous = current
Esempio n. 56
0
    def docs_by_query(cls, idx, idx_type='', query=Query.match_all()):
        ''' Delete all documents specified by a Query.

        The query is wrapped in an ElasticQuery with sources='_id' and run
        through ScanAndScroll.scan_and_scroll; each response handed to the
        callback is turned into one bulk "delete" action per hit and loaded
        with Bulk.load.

        idx      -- index to scan and to send the bulk deletes to
        idx_type -- index type passed to both the scan and the bulk load
        query    -- Query selecting the documents (defaults to match all)
        '''
        import json

        def delete_docs(resp_json):
            ''' Send a bulk delete for every hit in one scan/scroll response. '''
            # json.dumps escapes quotes and backslashes in the values, so each
            # action line stays valid JSON; ensure_ascii=False leaves other
            # characters exactly as the previous %-formatting produced them.
            json_data = ''.join(
                json.dumps({"delete": {"_index": hit['_index'],
                                       "_type": hit['_type'],
                                       "_id": hit['_id']}},
                           ensure_ascii=False) + '\n'
                for hit in resp_json['hits']['hits'])
            Bulk.load(idx, idx_type, json_data)

        query = ElasticQuery(query, sources='_id')
        ScanAndScroll.scan_and_scroll(idx, idx_type=idx_type, call_fun=delete_docs, query=query)
Esempio n. 57
0
    def _get_current_build_info(self, seqid, position):
        ''' Derive build 38 start/end boundaries for a marker from the HAPMAP index.

        The first HAPMAP document on seqid at or after position supplies a
        genetic_map_position. 'start' is the position of the first document
        (ascending position) whose genetic_map_position is at least 0.1 above
        it; 'end' is the position of the first document (descending position)
        whose genetic_map_position is at least 0.1 below it.
        Returns a dict with keys 'build', 'seqid', 'start' and 'end'.
        '''
        def first_hit(range_query, position_sort):
            # Top document on seqid that satisfies range_query under the given sort.
            bool_query = BoolQuery(must_arr=[range_query, Query.match("seqid", seqid)])
            search = Search(ElasticQuery(bool_query),
                            idx=ElasticSettings.idx('HAPMAP', 'HAPMAP'),
                            qsort=Sort(position_sort),
                            size=1)
            return search.search().docs[0]

        anchor = first_hit(RangeQuery("position", gte=position), 'position:asc')
        genetic_map_position = anchor.genetic_map_position

        above = first_hit(RangeQuery("genetic_map_position",
                                     gte=(genetic_map_position + 0.1)),
                          'position:asc')
        start = int(above.position)

        below = first_hit(RangeQuery("genetic_map_position",
                                     lte=(genetic_map_position - 0.1)),
                          'position:desc')
        end = int(below.position)

        return {'build': 38, 'seqid': seqid, 'start': start, 'end': end}
Esempio n. 58
0
    def get_object(self):
        ''' Return the ElasticObject whose id is self.kwargs[self.lookup_field].

        Raises Http404 when a TypeError, ValueError or IndexError occurs while
        reading the first hit, building the object or checking permissions.
        '''
        lookup_value = self.kwargs[self.lookup_field]
        search = Search(search_query=ElasticQuery(Query.ids(lookup_value)), idx=self.idx)
        try:
            hit = search.get_json_response()['hits']['hits'][0]
            found = ElasticObject(initial=hit['_source'])
            found.uuid = hit['_id']

            # May raise a permission denied
            self.check_object_permissions(self.request, found)
        except (TypeError, ValueError, IndexError):
            raise Http404
        return found
    def test_mapping_parent_child(self):
        ''' Test creating mapping with parent child relationship.

        Creates a throw-away index with a "gene" type and a "publication"
        type whose parent is "gene", loads one document of each and checks
        the has_parent and has_child queries. The index is deleted before
        the test starts and again at the end.
        '''
        gene_mapping = MappingProperties("gene")
        gene_mapping.add_property("symbol", "string", analyzer="full_name")
        inta_mapping = MappingProperties("publication", "gene")
        load = Loader()
        idx = "test__mapping__"+SEARCH_SUFFIX
        options = {"indexName": idx, "shards": 1}
        # remove any index left over from a previous run
        requests.delete(ElasticSettings.url() + '/' + idx)

        # add child mappings first
        status = load.mapping(inta_mapping, "publication", analyzer=Loader.KEYWORD_ANALYZER, **options)
        self.assertTrue(status, "mapping inteactions")
        status = load.mapping(gene_mapping, "gene", analyzer=Loader.KEYWORD_ANALYZER, **options)
        self.assertTrue(status, "mapping genes")

        # load docs and test has parent query
        json_data = '{"index": {"_index": "%s", "_type": "gene", "_id" : "1"}}\n' % idx
        json_data += json.dumps({"symbol": "PAX1"}) + '\n'
        json_data += '{"index": {"_index": "%s", "_type": "publication", "_id" : "2", "parent": "1"}}\n' % idx
        json_data += json.dumps({"pubmed": 1234}) + '\n'
        Bulk.load(idx, '', json_data)
        Search.index_refresh(idx)
        query = ElasticQuery.has_parent('gene', Query.match('symbol', 'PAX1'))
        elastic = Search(query, idx=idx, idx_type='publication', size=500)
        docs = elastic.search().docs
        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(len(docs), 1)
        self.assertEqual(getattr(docs[0], 'pubmed'), 1234)
        self.assertEqual(docs[0].parent(), '1')
        # a non-Query second argument must be rejected
        self.assertRaises(QueryError, ElasticQuery.has_parent, 'gene', 'xxxxx')

        # test has child query
        query = ElasticQuery.has_child('publication', Query.match('pubmed', 1234))
        elastic = Search(query, idx=idx, idx_type='gene', size=500)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 1)
        self.assertEqual(getattr(docs[0], 'symbol'), 'PAX1')
        # the gene document is the parent, so it has no parent itself
        self.assertIsNone(docs[0].parent())
        requests.delete(ElasticSettings.url() + '/' + idx)