コード例 #1
0
ファイル: views.py プロジェクト: D-I-L/django-chicp
def _find_snp_position(snp_track, name):
    if snp_track is None:
        query = ElasticQuery.query_match("id", name)
        elastic = Search(query, idx=ElasticSettings.idx('MARKER'))
        snpResult = elastic.get_json_response()
        if(len(snpResult['hits']['hits'])) > 0:
            snp = snpResult['hits']['hits'][0]['_source']
            chrom = snp['seqid'].replace('chr', "")
            position = snp['start']
            return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}
    else:
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track

        query = ElasticQuery.query_match("name", name)
        elastic = Search(query, idx=snp_track_idx)
        snpResult = elastic.get_json_response()
        if(len(snpResult['hits']['hits'])) > 0:
            snp = snpResult['hits']['hits'][0]['_source']
            chrom = snp['seqid'].replace('chr', "")
            position = snp['start']
            return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}

    return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}
コード例 #2
0
ファイル: gene_tags.py プロジェクト: tcarver/django_template
def show_es_gene_section(gene_symbol=None,
                         seqid=None,
                         start_pos=None,
                         end_pos=None):
    ''' Template inclusion tag that searches the GENE index.

    The search is by "symbol" when gene_symbol is given. Otherwise it is a
    positional search on the "chromosome" named by seqid (with any "chr"
    text removed): with only start_pos, documents with
    start <= start_pos <= stop; with both positions, documents with
    start >= start_pos and stop <= end_pos.

    @return: dict holding the matching documents under 'es_genes'.
    '''
    chromosome = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        must = [Query.match("chromosome", chromosome)]
        if end_pos is None:
            # single position (e.g. a snp): genes spanning start_pos
            must += [RangeQuery("start", lte=start_pos),
                     RangeQuery("stop", gte=start_pos)]
        else:
            # start/end range: genes lying inside the range
            must += [RangeQuery("start", gte=start_pos),
                     RangeQuery("stop", lte=end_pos)]
        query = ElasticQuery.bool(BoolQuery(must_arr=must))

    gene_search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': gene_search.search().docs}
コード例 #3
0
def _find_snp_position(snp_track, name):
    if snp_track is None:
        query = ElasticQuery.query_match("id", name)
        elastic = Search(query, idx=ElasticSettings.idx('MARKER'))
        snpResult = elastic.get_json_response()
        if(len(snpResult['hits']['hits'])) > 0:
            snp = snpResult['hits']['hits'][0]['_source']
            chrom = snp['seqid'].replace('chr', "")
            position = snp['start']
            return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}
    else:
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track

        query = ElasticQuery.query_match("name", name)
        elastic = Search(query, idx=snp_track_idx)
        snpResult = elastic.get_json_response()
        if(len(snpResult['hits']['hits'])) > 0:
            snp = snpResult['hits']['hits'][0]['_source']
            chrom = snp['seqid'].replace('chr', "")
            position = snp['start']
            return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}

    return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}
コード例 #4
0
    def filter_queryset(self, request, queryset, view):
        ''' Override this method to request just the documents required from elastic.

        The page size and offset come from the view's paginator, and only
        request parameters named in the view's filter_fields are used as
        filters. The total hit count is stored on view.es_count.

        @return: list of ElasticObject instances, one per hit.
        '''
        limit = view.paginator.get_limit(request)
        offset = view.paginator.get_offset(request)

        allowed = getattr(view, 'filter_fields', [])
        requested = {key: value for key, value in request.GET.items()
                     if key in allowed}
        search_filters = self._build_filters(filters=requested)
        if search_filters is None:
            es_query = ElasticQuery(Query.match_all())
        else:
            es_query = ElasticQuery.filtered(Query.match_all(), search_filters)

        search = Search(search_query=es_query,
                        idx=view.idx,
                        size=limit,
                        search_from=offset)
        response = search.get_json_response()

        def _to_object(hit):
            # wrap the hit source and keep the document id as uuid
            wrapped = ElasticObject(initial=hit['_source'])
            wrapped.uuid = hit['_id']
            return wrapped

        results = [_to_object(hit) for hit in response['hits']['hits']]
        view.es_count = response['hits']['total']
        return results
コード例 #5
0
def show_es_gene_section(gene_symbol=None,
                         seqid=None,
                         start_pos=None,
                         end_pos=None):
    ''' Template inclusion tag that searches the GENE index.

    seqid is prefixed with "chr" unless it is already a string starting
    with "chr". The search is by "gene_symbol" when one is given. Otherwise
    it is a positional search on seqid: with only start_pos, documents with
    featureloc.start <= start_pos <= featureloc.end; with both positions,
    documents with featureloc.start >= start_pos and
    featureloc.end <= end_pos.

    @return: dict holding the matching documents under 'es_genes'.
    '''
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    else:
        must = [Query.match("seqid", seqid)]
        if end_pos is None:
            # single position (e.g. a snp): features spanning start_pos
            must += [RangeQuery("featureloc.start", lte=start_pos),
                     RangeQuery("featureloc.end", gte=start_pos)]
        else:
            # start/end range: features lying inside the range
            must += [RangeQuery("featureloc.start", gte=start_pos),
                     RangeQuery("featureloc.end", lte=end_pos)]
        query = ElasticQuery.bool(BoolQuery(must_arr=must))

    gene_search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': gene_search.search().docs}
コード例 #6
0
    def test_term_query(self):
        ''' Run two term queries on the DEFAULT index and check the number of documents. '''
        by_id = Search(ElasticQuery(Query.term("id", "rs2476601")),
                       idx=ElasticSettings.idx('DEFAULT'))
        self.assertTrue(len(by_id.search().docs) == 1, "Elastic string query retrieved marker (rs2476601)")

        # same kind of query on seqid, this time with a boost
        by_seqid = Search(ElasticQuery(Query.term("seqid", "1", boost=3.0)),
                          idx=ElasticSettings.idx('DEFAULT'))
        self.assertTrue(len(by_seqid.search().docs) > 1, "Elastic string query retrieved markers  on chr1")
コード例 #7
0
 def test_query_ids(self):
     ''' Query by document ids: two ids first, then one id restricted to a type. '''
     two_ids = ElasticQuery(Query.ids(['1', '2']))
     docs = Search(two_ids, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
     self.assertTrue(len(docs) == 2, "Elastic string query retrieved marker (rs*)")

     # reuse the type of the first document to restrict the second query
     doc_type = docs[0].type()
     one_id = ElasticQuery(Query.ids('2', types=doc_type))
     docs = Search(one_id, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
     self.assertTrue(len(docs) == 1, "Elastic string query retrieved marker (rs*)")
コード例 #8
0
 def test_update_doc(self):
     ''' Update with a partial document.

     Fetches the rs2476601 document from the MARKER index, updates its
     start/end fields, then searches again and checks the new values.
     '''
     idx = IDX['MARKER']['indexName']
     docs = Search(ElasticQuery(Query.term("id", "rs2476601"), sources=['id']), idx=idx).search().docs
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
     self.assertEqual(len(docs), 1, "rs2476601 document")
     update_field = {"doc": {"start": 100, "end": 200}}
     Update.update_doc(docs[0], update_field)
     # refresh the index before searching again
     Search.index_refresh(idx)
     docs = Search(ElasticQuery(Query.term("id", "rs2476601")), idx=idx).search().docs
     self.assertEqual(len(docs), 1, "rs2476601 document")
     self.assertEqual(getattr(docs[0], 'start'), 100, "rs2476601 start")
     self.assertEqual(getattr(docs[0], 'end'), 200, "rs2476601 end")
コード例 #9
0
 def test_string_query(self):
     ''' Build and run a string query on the "id" field, then check bad keywords are rejected. '''
     string_query = ElasticQuery.query_string("rs2476601", fields=["id"])
     result = Search(string_query, idx=ElasticSettings.idx('DEFAULT')).search()
     self.assertTrue(len(result.docs) == 1, "Elastic string query retrieved marker (rs2476601)")

     # a misspelt keyword argument is expected to raise QueryError
     with self.assertRaises(QueryError):
         ElasticQuery.query_string("rs2476601", fieldssss=["id"])
コード例 #10
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def gene_mgi_parse(cls, gene_pubs, idx):
        ''' Parse Ensembl and MGI data from JAX.

        Each input line is tab separated; column 0 must contain "MGI:" and
        column 5 must contain "ENSMUSG", otherwise a PipelineError is raised.
        Documents whose dbxrefs.orthologs.mmusculus.ensembl matches one of
        the parsed ids are searched in chunks and given an 'MGI' entry
        through bulk partial updates.

        @param gene_pubs: iterable of tab separated lines.
        @param idx: name of the index to search and update.
        '''
        mgi_by_ensmusg = {}
        for line in gene_pubs:
            cols = line.split('\t')
            if 'MGI:' not in cols[0]:
                raise PipelineError('MGI not found '+cols[0])
            if 'ENSMUSG' not in cols[5]:
                raise PipelineError('ENSMUSG not found '+cols[5])
            mgi_by_ensmusg[cols[5]] = cols[0].replace('MGI:', '')

        ensmusg_ids = list(mgi_by_ensmusg)
        chunk_size = 450
        for offset in range(0, len(ensmusg_ids), chunk_size):
            batch = ensmusg_ids[offset:offset+chunk_size]
            query = ElasticQuery.filtered(
                Query.match_all(),
                TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", batch))

            # action line and partial document line, alternating, for the bulk load
            bulk_lines = []
            for doc in Search(query, idx=idx, size=chunk_size).search().docs:
                ens_id = doc.doc_id()
                idx_type = doc.type()
                mm = getattr(doc, 'dbxrefs')['orthologs']['mmusculus']
                mm['MGI'] = mgi_by_ensmusg[mm['ensembl']]
                action = {"update": {"_id": ens_id, "_type": idx_type,
                                     "_index": idx, "_retry_on_conflict": 3}}
                partial = {'doc': {"dbxrefs": {'orthologs': {"mmusculus": mm}}}}
                bulk_lines.append(json.dumps(action) + '\n')
                bulk_lines.append(json.dumps(partial) + '\n')

            if bulk_lines:
                # idx_type is the type of the last document in this chunk
                Loader().bulk_load(idx, idx_type, ''.join(bulk_lines))
コード例 #11
0
    def test_bulk(self):
        ''' Test the Bulk.load().

        Loads one marker document into the MARKER index, checks the document
        count increased by one, then sends delete/update actions for an id
        that does not exist and checks the response reports errors.
        '''
        self.set_up()
        idx = IDX['MARKER']['indexName']
        elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
        hits_total1 = elastic.get_count()['count']

        json_data = '{"index": {"_index": "%s", "_type": "%s"}}\n' % \
                    (idx, 'marker')
        json_data += json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                                 "ref": "A", "id": "rsXXXXX", "qual": ".", "info": "RS=XXXXX"})
        # without the trailing newline the load must not return 200
        resp = Bulk.load(idx, '', json_data)
        # assertNotEqual/assertEqual replace the deprecated assertNotEquals/
        # assertEquals aliases (removed in Python 3.12)
        self.assertNotEqual(resp.status_code, 200)

        # note: needs a trailing line return to work
        Bulk.load(idx, '', json_data + '\n')
        Search.index_refresh(idx)
        hits_total2 = elastic.get_count()['count']
        self.assertEqual(hits_total2, hits_total1+1, "contains documents")

        # produce errors updating doc id that doesn't exist
        # NOTE(review): json_data still has no trailing newline here, so the
        # delete action is appended directly after the document JSON on the
        # same line - confirm this is intended.
        json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
                     (idx, 'marker', 'XYZ')
        json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
                     (idx, 'marker', 'XYZ')
        json_data += '{"doc": {"start": 100, "end": 200}}\n'
        resp = Bulk.load(idx, '', json_data)
        self.assertTrue('errors' in resp.json() and resp.json()['errors'])
コード例 #12
0
 def test_missing_terms_filtered_query(self):
     ''' Run a match-all query filtered by a missing terms filter and expect three documents. '''
     missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
     filtered = ElasticQuery.filtered(Query.match_all(), missing_filter)
     docs = Search(filtered, idx=ElasticSettings.idx('DEFAULT')).search().docs
     self.assertTrue(len(docs) == 3, "Elastic string query retrieved all public docs")
コード例 #13
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def _check_gene_history(cls, gene_sets, config):
        ''' Look up gene ids in the gene history index by discontinued_geneid.

        Scans the index named in the GENE_HISTORY config section for
        documents whose discontinued_geneid is in gene_sets.

        @return: tuple of (dict mapping discontinued gene id to its geneid,
        list of discontinued gene ids whose geneid is None); all ids are
        returned as strings.
        '''
        history = config['GENE_HISTORY']
        replaced_by = {}
        without_replacement = []

        def collect(resp_json):
            # sort each hit into one of the two result containers
            for doc in [Document(hit) for hit in resp_json['hits']['hits']]:
                current_id = getattr(doc, 'geneid')
                old_id = str(getattr(doc, 'discontinued_geneid'))
                if current_id is None:
                    without_replacement.append(old_id)
                else:
                    replaced_by[old_id] = str(current_id)

        history_query = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
            sources=['geneid', 'discontinued_geneid'])
        ScanAndScroll.scan_and_scroll(history['index'], idx_type=history['index_type'],
                                      call_fun=collect, query=history_query)

        return (replaced_by, without_replacement)
コード例 #14
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def _ensembl_entrez_lookup(cls, ensembl_gene_sets, section):
        ''' Get an ensembl:entrez id dictionary.

        Searches section['index'] for documents whose dbxrefs.ensembl is in
        ensembl_gene_sets.

        @return: dict mapping each document id to its dbxrefs entrez value.
        '''
        equery = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("dbxrefs.ensembl", ensembl_gene_sets),
            sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
        search = Search(equery, idx=section['index'], size=len(ensembl_gene_sets))

        entrez_by_doc_id = {}
        for doc in search.search().docs:
            doc_id = doc.doc_id()
            entrez_by_doc_id[doc_id] = getattr(doc, 'dbxrefs')['entrez']
        return entrez_by_doc_id
コード例 #15
0
        def check_hits(resp_json):
            ''' Compare the position of each hit with the MARKER index and log mismatches.

            For every hit that has an "id", the document with the same id is
            looked up in the MARKER index. When the two positions differ by
            more than one, an error message is logged together with any
            markers found at the hit's position and any history entries for
            the id. Uses self from the enclosing scope.
            '''
            # map rs id -> hit document, skipping hits without an id
            rsids = {}
            docs = [Document(hit) for hit in resp_json['hits']['hits']]
            for doc in docs:
                rsid = getattr(doc, "id")
                if rsid is not None:
                    rsids[rsid] = doc
            rsids_keys = list(rsids.keys())
            # fetch the marker documents for all collected ids in one search
            terms_filter = TermsFilter.get_terms_filter("id", rsids_keys)
            query = ElasticQuery.filtered(Query.match_all(), terms_filter)
            elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=len(rsids_keys))
            docs_by_rsid = elastic.search().docs
            for doc in docs_by_rsid:
                info = getattr(doc, "info")
                # only markers whose info contains VC=SNV are checked
                if 'VC=SNV' not in info:
                    continue
                rsid = getattr(doc, "id")
                ic_doc = rsids[rsid]
                pos1 = getattr(doc, "start")
                pos2 = self._get_highest_build(ic_doc)['position']
                # a difference of one or less is not reported
                if abs(int(pos1) - int(pos2)) > 1:
                    is_par = getattr(ic_doc, 'is_par')
                    allele_a = getattr(ic_doc, 'allele_a')
                    # skip hits with is_par set or with allele_a of 'D' or 'I'
                    if is_par is None and not (allele_a == 'D' or allele_a == 'I'):
                        msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
                               ' '+str(pos2)+" "+rsid+' '+str(pos1))
#                                ' ('+ic_doc.doc_id()+' '+json.dumps(getattr(ic_doc, 'build_info'))+')'

                        # append any markers on the same seqid starting at the hit's position
                        query = ElasticQuery.filtered(Query.term("seqid", getattr(doc, 'seqid')),
                                                      Filter(Query.term("start", pos2)))
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                        docs_by_pos = elastic.search().docs
                        if len(docs_by_pos) > 0:
                            for d in docs_by_pos:
                                msg += " ("+getattr(d, "id")+":"+str(getattr(d, "start"))+")"

                        # append history entries whose rslow is this rs id
                        query = ElasticQuery.filtered(Query.match_all(), Filter(Query.term("rslow", rsid)))
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'HISTORY'))
                        docs_by_pos = elastic.search().docs
                        if len(docs_by_pos) > 0:
                            for d in docs_by_pos:
                                msg += " (rshigh:"+str(getattr(d, "rshigh")) + \
                                       " build_id:"+str(getattr(d, "build_id"))+")"

                        logger.error(msg)
コード例 #16
0
 def test_bool_filtered_query(self):
     ''' Build a boolean filter in two steps and run it as a filtered query. '''
     bool_filter = BoolQuery(must_not_arr=[Query.term("seqid", 2)],
                             should_arr=[RangeQuery("start", gte=10050)])
     # further clauses are added through the chained builder methods
     (bool_filter
      .must([Query.term("id", "rs768019142")])
      .should(RangeQuery("start", gte=10054)))
     filtered = ElasticQuery.filtered_bool(Query.match_all(), bool_filter, sources=["id", "seqid"])
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(search.search().hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
コード例 #17
0
 def test_sort_query(self):
     ''' Check sorting with a string spec and a dict spec, and that an int spec is rejected. '''
     match_all = ElasticQuery(Query.match_all())

     by_string = Search(match_all, idx=ElasticSettings.idx('DEFAULT'), qsort=Sort('start:asc,_score'))
     self._check_sort_order(by_string.search().docs)

     dict_sort = Sort({"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
     by_dict = Search(match_all, idx=ElasticSettings.idx('DEFAULT'), qsort=dict_sort)
     self._check_sort_order(by_dict.search().docs)

     with self.assertRaises(QueryError):
         Sort(1)
コード例 #18
0
    def test_scan_and_scroll(self):
        ''' Run scan and scroll without and with a query, checking every response has hits. '''
        def assert_has_hits(page):
            self.assertTrue('hits' in page, 'scan and scroll hits')
            self.assertGreaterEqual(len(page['hits']['hits']), 1)

        ScanAndScroll.scan_and_scroll(ElasticSettings.idx('DEFAULT'), call_fun=assert_has_hits)

        # second pass restricted by a string query on the id field
        ScanAndScroll.scan_and_scroll(
            ElasticSettings.idx('DEFAULT'),
            call_fun=assert_has_hits,
            query=ElasticQuery.query_string("rs2476601", fields=["id"]))
コード例 #19
0
 def test_and_filtered_query(self):
     ''' Build an AndFilter from a bool query plus two extra clauses and run it. '''
     and_filter = AndFilter(BoolQuery(must_arr=[RangeQuery("start", gte=1)]))
     (and_filter
      .extend(RangeQuery("start", gte=1))
      .extend(Query.term("seqid", 1)))
     filtered = ElasticQuery.filtered(Query.term("seqid", 1), and_filter)
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(search.search().hits_total >= 1, "Elastic filtered query retrieved marker(s)")
コード例 #20
0
    def test_get_rdm_feature_id(self):
        ''' Get a random feature id and check it retrieves exactly one document. '''
        gff = IDX['GFF_GENERIC']
        idx = gff['indexName']
        idx_type = gff['indexType']
        feature_id = ElasticUtils.get_rdm_feature_id(idx, idx_type)
        self.assertTrue(isinstance(feature_id, str), 'Document id')

        # the returned id must be searchable in the same index
        by_id = Search(ElasticQuery(Query.ids(feature_id)), idx=idx)
        self.assertTrue(len(by_id.search().docs) == 1, 'Document retrieved')
コード例 #21
0
 def test_url_rotate(self):
     ''' Test the url rotates from http://xxx:9200 to correct url. '''
     id_filter = Filter(Query.term("id", "rs768019142"))
     filtered = ElasticQuery.filtered(Query.term("seqid", 1), id_filter)
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(search.search().hits_total == 1,
                     "Elastic filtered query retrieved marker")
     Search.index_exists('test', 'test2')
     # put the url index back to its initial value
     ElasticUrl.URL_INDEX = 0
コード例 #22
0
ファイル: views.py プロジェクト: D-I-L/django_template
def region_page(request, region):
    ''' Render region/region.html with the search result for attr.region_id == region.

    The result of the REGION index search is used as the template context,
    with a 'title' entry added.
    '''
    region_query = ElasticQuery.query_match("attr.region_id", region)
    region_idx = ElasticSettings.idx(name='REGION')
    context = Search(region_query, idx=region_idx).get_result()
    context['title'] = "Region"
    print(context)
    return render(
        request,
        'region/region.html',
        context,
        content_type='text/html',
    )
コード例 #23
0
    def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
        ''' Get an entrez:ensembl id dictionary.

        The gene ids are first passed through the gene history check and
        old-id replacement, then searched by dbxrefs.entrez in
        section['index'].

        @return: dict mapping each document's dbxrefs entrez value to its
        document id.
        '''
        newgene_ids, discontinued_ids = Gene._check_gene_history(gene_sets, config)
        current_ids = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
        equery = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids),
            sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
        search = Search(equery, idx=section['index'], size=len(current_ids))

        doc_id_by_entrez = {}
        for doc in search.search().docs:
            entrez = getattr(doc, 'dbxrefs')['entrez']
            doc_id_by_entrez[entrez] = doc.doc_id()
        return doc_id_by_entrez
コード例 #24
0
 def test_bool_filtered_query2(self):
     ''' Filter a query string search with a chained should/should/must bool filter. '''
     bool_filter = BoolQuery()
     (bool_filter
      .should(RangeQuery("start", lte=20000))
      .should(Query.term("seqid", 2))
      .must(Query.term("seqid", 1)))
     string_query = Query.query_string("rs768019142", fields=["id", "seqid"])
     filtered = ElasticQuery.filtered_bool(string_query, bool_filter, sources=["id", "seqid", "start"])
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(search.search().hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
コード例 #25
0
 def test_terms_query(self):
     ''' Run a terms query for two ids with highlighting on the "id" field. '''
     terms = Query.terms("id", ["rs2476601", "rs768019142"])
     highlighted = ElasticQuery(terms, highlight=Highlight(["id"]))
     docs = Search(highlighted, idx=ElasticSettings.idx('DEFAULT')).search().docs
     self.assertTrue(len(docs) == 2,
                     "Elastic string query retrieved markers (rs2476601, rs768019142)")

     first = docs[0]
     self.assertTrue(getattr(first, 'seqid'), "Hit attribute found")
     self.assertTrue(first.highlight() is not None, "highlighting found")
コード例 #26
0
ファイル: views.py プロジェクト: tcarver/django-chicp
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Search frags_idx for fragments on chrom whose "end" lies in [segmin, segmax].

    seqid is matched both with and without the "chr" prefix. The 'data'
    of the search result is passed through utils.makeRelative with the
    integer segment bounds for the 'start' and 'end' fields.

    @return: the value returned by utils.makeRelative.
    '''
    seqid_terms = Query.terms("seqid", [chrom, "chr"+chrom])
    end_in_segment = Filter(RangeQuery("end", gte=segmin, lte=segmax))
    query = ElasticQuery.filtered(seqid_terms, end_in_segment, utils.bedFields)

    search = Search(search_query=query, search_from=0, size=2000000, idx=frags_idx)
    frags = search.get_result()['data']
    return utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
コード例 #27
0
def region_page(request, region):
    ''' Render region/region.html with the search result for attr.region_id == region.

    The result of the REGION index search is used as the template context,
    with a 'title' entry added.
    '''
    region_query = ElasticQuery.query_match("attr.region_id", region)
    region_idx = ElasticSettings.idx(name='REGION')
    context = Search(region_query, idx=region_idx).get_result()
    context['title'] = "Region"
    print(context)
    return render(request, 'region/region.html', context,
                  content_type='text/html')
コード例 #28
0
 def test_or_filtered_query(self):
     ''' Build an OrFilter from a range, a bool query and a query string, and run it with highlighting. '''
     highlight = Highlight(["id", "seqid"])
     spanning = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                    RangeQuery("end", gte=100000)])
     or_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
     (or_filter
      .extend(spanning)
      .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap()))
     filtered = ElasticQuery.filtered(Query.term("seqid", 1), or_filter, highlight=highlight)
     search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(search.search().hits_total >= 1, "Elastic filtered query retrieved marker(s)")
コード例 #29
0
ファイル: criteria.py プロジェクト: D-I-L/django-criteria
    def get_elastic_query(cls, section=None, config=None):
        ''' Build the elastic query object for a criteria section.
        @type  section: string
        @keyword section: The section in the criteria.ini file
        @type  config:  string
        @keyword config: The config object initialized from criteria.ini.
        @return: L{Query}, or None when the section has no dedicated query
        and no source_fields setting.
        '''
        settings = config[section]
        if 'source_fields' in settings:
            source_fields = settings['source_fields'].split(',')
        else:
            source_fields = []

        if 'mhc' in section:
            # Defined MHC region as chr6:25,000,000..35,000,000
            seqid, start_range, end_range = '6', 25000000, 35000000
            # field names are read for every "mhc" section
            seqid_param = settings['seqid_param']
            start_param = settings['start_param']
            end_param = settings['end_param']

        if section == 'is_gene_in_mhc':
            # for region you should make a different query
            return ElasticUtils.range_overlap_query(seqid, start_range, end_range,
                                                    source_fields,
                                                    seqid_param,
                                                    start_param,
                                                    end_param)

        if section == 'is_marker_in_mhc':
            in_mhc = BoolQuery()
            (in_mhc
             .must(RangeQuery("start", lte=end_range))
             .must(RangeQuery("start", gte=start_range))
             .must(Query.term("seqid", seqid)))
            return ElasticQuery.filtered_bool(Query.match_all(), in_mhc, sources=["id", "seqid", "start"])

        if section == 'is_region_in_mhc':
            return ElasticQuery(Query.term("region_name", "MHC"))

        if section == 'marker_is_gwas_significant_in_ic':
            # build a range query
            gw_sig_p = 0.00000005
            return ElasticQuery(RangeQuery("p_value", lte=gw_sig_p))

        if source_fields:
            return ElasticQuery(Query.match_all(), sources=source_fields)
        return None
コード例 #30
0
 def test_search_count(self):
     ''' Compare the full document count with the count for a query excluding chr1. '''
     gff = IDX['GFF_GENERIC']
     idx = gff['indexName']
     idx_type = gff['indexType']

     total = ElasticUtils.get_docs_count(idx, idx_type)
     self.assertGreater(total, 0, 'index count')

     not_chr1 = ElasticQuery(BoolQuery(must_not_arr=[Query.term('seqid', 'chr1')]))
     without_chr1 = ElasticUtils.get_docs_count(idx, idx_type, search_query=not_chr1)
     self.assertGreater(total, without_chr1, 'search query count')
コード例 #31
0
 def test_function_score_query(self):
     ''' Test a function score query with a query (using the start position as the score). '''
     reciprocal_start = ScoreFunction.create_score_function(
         'field_value_factor', field='start', modifier='reciprocal')
     rs_markers = Query.query_string("rs*", fields=["id", "seqid"])
     scored = ElasticQuery(FunctionScoreQuery(rs_markers, [reciprocal_start], boost_mode='replace'))
     docs = Search(scored, idx=ElasticSettings.idx('DEFAULT')).search().docs
     self.assertGreater(len(docs), 1, str(len(docs)))

     # start positions must be strictly increasing through the results
     previous = 0
     for doc in docs:
         current = getattr(doc, 'start')
         self.assertLess(previous, current)
         previous = current
コード例 #32
0
    def test_bool_filtered_query4(self):
        ''' Run a filtered boolean query that includes a wrapped match query.
        Note: ElasticQuery used to wrap match in a query object. '''
        bool_filter = BoolQuery()
        (bool_filter
         .should(RangeQuery("start", lte=20000))
         .should(Query.term("seqid", 2))
         .must(Query.match("id", "rs768019142").query_wrap())
         .must(Query.term("seqid", 1)))

        filtered = ElasticQuery.filtered_bool(Query.match_all(), bool_filter, sources=["id", "seqid", "start"])
        search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
        self.assertTrue(search.search().hits_total == 1, "Elastic filtered query retrieved marker (rs768019142)")
コード例 #33
0
    def test_bool_nested_filter(self):
        ''' Run a filtered query whose bool filter contains another bool query. '''
        inner = BoolQuery()
        (inner
         .must(Query.match("id", "rs768019142").query_wrap())
         .must(Query.term("seqid", 1)))

        # the inner bool query becomes one of the should clauses
        outer = BoolQuery()
        (outer
         .should(inner)
         .should(Query.term("seqid", 2)))

        filtered = ElasticQuery.filtered_bool(Query.match_all(), outer, sources=["id", "seqid", "start"])
        search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
        self.assertTrue(search.search().hits_total >= 1, "Nested bool filter query")
コード例 #34
0
 def test_bool_query(self):
     ''' Run a bool query with must, must_not and should clauses and highlighting. '''
     bool_query = BoolQuery()
     highlight = Highlight(["id", "seqid"])
     (bool_query
      .must(Query.term("id", "rs768019142"))
      .must(RangeQuery("start", gt=1000))
      .must_not(Query.match("seqid", "2"))
      .should(Query.match("seqid", "3"))
      .should(Query.match("seqid", "1")))
     search = Search(ElasticQuery.bool(bool_query, highlight=highlight),
                     idx=ElasticSettings.idx('DEFAULT'))
     self.assertTrue(len(search.search().docs) == 1, "Elastic string query retrieved marker (rs768019142)")
コード例 #35
0
 def test_pub_ini_file2(self):
     ''' Test publication pipeline with a list of PMIDs. '''
     captured = StringIO()
     call_command('publications', '--dir', TEST_DATA_DIR, '--steps', 'load',
                  sections='DISEASE::TEST', ini=MY_PUB_INI_FILE, stdout=captured)

     # the index name comes from the DISEASE section of the same ini file
     ini_config = IniParser().read_ini(MY_PUB_INI_FILE)
     idx = ini_config['DISEASE']['index']
     Search.index_refresh(idx)

     disease_query = ElasticQuery.query_string("test", fields=["tags.disease"])
     docs = Search(disease_query, idx=idx).search().docs
     self.assertGreater(len(docs), 1)
コード例 #36
0
    def _get_current_build_info(self, seqid, position):
        ''' Get upper & lower boundaries for a hit given the position of the marker.

        Three single-document searches are run on the HAPMAP index for
        seqid: the first document at or after position gives the genetic
        map position; 'start' is the position of the first document (by
        ascending position) at least 0.1 above it and 'end' the position of
        the first document (by descending position) at least 0.1 below it.

        @return: dict with 'build' (38), 'seqid', 'start' and 'end'.
        '''
        def first_doc(range_query, sort_spec):
            # first document on seqid matching range_query in sort_spec order
            query = ElasticQuery(
                BoolQuery(must_arr=[range_query, Query.match("seqid", seqid)]))
            found = Search(query,
                           idx=ElasticSettings.idx('HAPMAP', 'HAPMAP'),
                           qsort=Sort(sort_spec),
                           size=1).search()
            return found.docs[0]

        marker = first_doc(RangeQuery("position", gte=position), 'position:asc')
        genetic_map_position = getattr(marker, "genetic_map_position")

        # NOTE(review): start is taken from the higher genetic map position
        # and end from the lower one - confirm this naming is intended.
        above = first_doc(RangeQuery("genetic_map_position",
                                     gte=(genetic_map_position + 0.1)),
                          'position:asc')
        start = int(getattr(above, "position"))

        below = first_doc(RangeQuery("genetic_map_position",
                                     lte=(genetic_map_position - 0.1)),
                          'position:desc')
        end = int(getattr(below, "position"))

        return {'build': 38, 'seqid': seqid, 'start': start, 'end': end}
コード例 #37
0
    def get_object(self):
        ''' Fetch the document whose id is self.kwargs[self.lookup_field].

        @return: ElasticObject built from the first hit, with uuid set to
        the document id.
        @raise Http404: when a TypeError, ValueError or IndexError occurs
        while reading the hit or checking permissions.
        '''
        id_query = ElasticQuery(Query.ids(self.kwargs[self.lookup_field]))
        search = Search(search_query=id_query, idx=self.idx)
        try:
            hit = search.get_json_response()['hits']['hits'][0]
            found = ElasticObject(initial=hit['_source'])
            found.uuid = hit['_id']

            # May raise a permission denied
            self.check_object_permissions(self.request, found)
        except (TypeError, ValueError, IndexError):
            raise Http404
        return found
コード例 #38
0
    def test_mapping_parent_child(self):
        ''' Test creating mapping with parent child relationship.

        Creates a temporary index with a "gene" mapping and a "publication"
        mapping whose parent is "gene", loads one document of each type and
        checks the has_parent and has_child queries. The index is deleted
        before and after the test.
        '''
        gene_mapping = MappingProperties("gene")
        gene_mapping.add_property("symbol", "string", analyzer="full_name")
        inta_mapping = MappingProperties("publication", "gene")
        load = Loader()
        idx = "test__mapping__"+SEARCH_SUFFIX
        options = {"indexName": idx, "shards": 1}
        requests.delete(ElasticSettings.url() + '/' + idx)

        # add child mappings first
        status = load.mapping(inta_mapping, "publication", analyzer=Loader.KEYWORD_ANALYZER, **options)
        self.assertTrue(status, "mapping inteactions")
        status = load.mapping(gene_mapping, "gene", analyzer=Loader.KEYWORD_ANALYZER, **options)
        self.assertTrue(status, "mapping genes")

        # load docs and test has parent query
        json_data = '{"index": {"_index": "%s", "_type": "gene", "_id" : "1"}}\n' % idx
        json_data += json.dumps({"symbol": "PAX1"}) + '\n'
        json_data += '{"index": {"_index": "%s", "_type": "publication", "_id" : "2", "parent": "1"}}\n' % idx
        json_data += json.dumps({"pubmed": 1234}) + '\n'
        Bulk.load(idx, '', json_data)
        Search.index_refresh(idx)
        query = ElasticQuery.has_parent('gene', Query.match('symbol', 'PAX1'))
        elastic = Search(query, idx=idx, idx_type='publication', size=500)
        docs = elastic.search().docs
        # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
        self.assertEqual(len(docs), 1)
        self.assertEqual(getattr(docs[0], 'pubmed'), 1234)
        self.assertEqual(docs[0].parent(), '1')
        self.assertRaises(QueryError, ElasticQuery.has_parent, 'gene', 'xxxxx')

        # test has child query
        query = ElasticQuery.has_child('publication', Query.match('pubmed', 1234))
        elastic = Search(query, idx=idx, idx_type='gene', size=500)
        docs = elastic.search().docs
        self.assertEqual(len(docs), 1)
        self.assertEqual(getattr(docs[0], 'symbol'), 'PAX1')
        self.assertEqual(docs[0].parent(), None)
        requests.delete(ElasticSettings.url() + '/' + idx)
コード例 #39
0
ファイル: gene_tags.py プロジェクト: D-I-L/django_template
def show_es_gene_section(gene_symbol=None, seqid=None,
                         start_pos=None, end_pos=None):
    ''' Build the context used to render a gene section.

    The query is picked from the arguments, in this order:
      - gene_symbol given: match on the "symbol" field;
      - end_pos is None: genes on the chromosome that span start_pos
        (start <= start_pos and stop >= start_pos);
      - otherwise: genes on the chromosome lying inside the range
        (start >= start_pos and stop <= end_pos).

    Returns a dict whose 'es_genes' entry holds the documents found in
    the GENE index. '''
    # The chromosome is matched without any 'chr' text in it.
    chromosome = str(seqid).replace('chr', '')

    if gene_symbol is not None:
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        on_chromosome = Query.match("chromosome", chromosome)
        if end_pos is None:
            # single position, e.g. a SNP: the gene has to span it
            start_bound = RangeQuery("start", lte=start_pos)
            stop_bound = RangeQuery("stop", gte=start_pos)
        else:
            # region: the gene has to sit entirely within it
            start_bound = RangeQuery("start", gte=start_pos)
            stop_bound = RangeQuery("stop", lte=end_pos)
        query = ElasticQuery.bool(BoolQuery(must_arr=[on_chromosome, start_bound, stop_bound]))

    gene_search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': gene_search.search().docs}
コード例 #40
0
 def test_function_score_filter(self):
     ''' Run a filtered function score query and check score and ordering.

     The score function is a field_value_factor on 'start' with
     boost_mode 'replace'; every returned document must have a score equal
     to its start, and starts must be strictly decreasing. '''
     factor_fn = ScoreFunction.create_score_function('field_value_factor', field='start')
     start_filter = Filter(BoolQuery(must_arr=[RangeQuery("start", lte=50000)]))
     fs_query = FunctionScoreQuery(start_filter, [factor_fn], boost_mode='replace')
     search = Search(ElasticQuery(fs_query), idx=ElasticSettings.idx('DEFAULT'))
     docs = search.search().docs
     self.assertGreater(len(docs), 1, str(len(docs)))

     previous_start = sys.maxsize
     for doc in docs:
         current_start = doc.start
         # the score reported for the hit has to be the start value itself
         self.assertEqual(current_start, int(doc.__dict__['_meta']['_score']))
         # and hits have to come back in strictly descending start order
         self.assertGreater(previous_start, current_start)
         previous_start = current_start
コード例 #41
0
ファイル: views.py プロジェクト: D-I-L/pydgin
    def post(self, request, *args, **kwargs):
        ''' Return study hits, as JSON, for the gene or marker(s) in the POST data.

        The first non-empty POST parameter among 'ens_id', 'marker' and
        'markers[]' selects the filter applied to the REGION/STUDY_HITS
        index. In each hit the 'genes' list is replaced by a dict of
        ensembl ID to symbol (the ID itself when no gene document is
        found), and 'pmid' is replaced by a dict carrying the pmid and,
        when the publication document is found, a first-author name part
        and the journal.

        Responds with status 400 and an 'error' entry when none of the
        three parameters is supplied. '''
        ens_id = self.request.POST.get('ens_id')
        marker = self.request.POST.get('marker')
        markers = self.request.POST.getlist('markers[]')

        if ens_id:
            sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
        elif marker:
            sfilter = Filter(Query.query_string(marker, fields=["marker"]).query_wrap())
        elif markers:
            sfilter = Filter(Query.query_string(' '.join(markers), fields=["marker"]).query_wrap())
        else:
            # Without this branch sfilter is never bound and the view fails
            # with an UnboundLocalError further down.
            return JsonResponse({'error': 'One of ens_id, marker or markers[] is required'},
                                status=400)

        query = ElasticQuery.filtered(Query.match_all(), sfilter)
        elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
        study_hits = elastic.get_json_response()['hits']

        # Collect every gene and publication ID so each is looked up once.
        ens_ids = []
        pmids = []
        for hit in study_hits['hits']:
            source = hit['_source']
            if 'pmid' in source:
                pmids.append(source['pmid'])
            if 'genes' in source:
                ens_ids.extend(source['genes'])
        docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
        pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])

        for hit in study_hits['hits']:
            source = hit['_source']
            genes = {}
            if 'genes' in source:
                for gene_id in source['genes']:
                    try:
                        genes[gene_id] = getattr(docs[gene_id], 'symbol')
                    except KeyError:
                        # Fall back to the ID for this gene only; rebinding
                        # the whole dict would drop symbols already found.
                        genes[gene_id] = gene_id
            source['genes'] = genes
            if 'pmid' in source:
                pmid = source['pmid']
                try:
                    authors = getattr(pub_docs[pmid], 'authors')
                    journal = getattr(pub_docs[pmid], 'journal')
                    source['pmid'] = \
                        {'pmid': pmid,
                         # last whitespace-separated token of the first author's name
                         'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                         'journal': journal}
                except KeyError:
                    source['pmid'] = {'pmid': pmid}

        return JsonResponse(study_hits)
コード例 #42
0
ファイル: views.py プロジェクト: D-I-L/django-chicp
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Return the fragments of an index whose end lies within a segment.

    Documents are matched on seqid (either chrom or 'chr'+chrom) and
    filtered on segmin <= end <= segmax, restricted to utils.bedFields.
    At most 10000 hits are requested. The '_source' of every hit is passed
    through utils.makeRelative for the 'start' and 'end' fields. '''
    seqid_terms = Query.terms("seqid", [chrom, "chr"+chrom])
    end_in_segment = Filter(RangeQuery("end", gte=segmin, lte=segmax))
    query = ElasticQuery.filtered(seqid_terms, end_in_segment, utils.bedFields)

    search = Search(search_query=query, search_from=0, size=10000, idx=frags_idx)
    response = search.get_json_response()
    sources = [hit['_source'] for hit in response['hits']['hits']]
    return utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], sources)
コード例 #43
0
ファイル: views.py プロジェクト: premanand17/django-chicp
def _build_exon_query(chrom, segmin, segmax, genes):
    ''' Return the exons of each gene, keyed by gene_id and sorted by start.

    For every entry of genes one search is run against the '/exons/' type
    of the CP_GENE_IDX index: a query string on the "name" field for the
    gene's "gene_id", combined with a must-term on seqid. Exon coordinates
    are passed through utils.makeRelative for the segment. An empty genes
    sequence yields an empty dict. '''
    exons_by_gene = dict()
    seqid_clause = BoolQuery()
    seqid_clause.must([Query.term("seqid", chrom)])
    if len(genes) == 0:
        return exons_by_gene

    for gene in genes:
        gene_id = gene["gene_id"]
        query = ElasticQuery.filtered_bool(Query.query_string(gene_id, fields=["name"]),
                                           seqid_clause, sources=utils.snpFields)
        search = Search(query, idx=chicp_settings.CP_GENE_IDX+'/exons/', search_from=0, size=2000)
        found = search.get_result()['data']
        relative = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], found)
        exons_by_gene[gene["gene_id"]] = sorted(relative, key=operator.itemgetter("start"))
    return exons_by_gene
コード例 #44
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
        ''' Build a dictionary of entrez id to document id for a set of genes.

        Ids are first passed through the gene history check and old ids
        are replaced with new ones. The index named by section['index'] is
        then scanned for documents whose dbxrefs.entrez is one of the
        resulting ids. '''
        newgene_ids, discontinued_ids = Gene._check_gene_history(gene_sets, config)
        current_ids = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
        entrez_to_doc_id = {}

        def process_hits(resp_json):
            # Called by scan_and_scroll with a response; folds its hits
            # into the dictionary being built.
            page_docs = [Document(hit) for hit in resp_json['hits']['hits']]
            page = {doc.dbxrefs['entrez']: doc.doc_id() for doc in page_docs}
            entrez_to_doc_id.update(page)

        scan_query = ElasticQuery.filtered(Query.match_all(),
                                           TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids),
                                           sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
        ScanAndScroll.scan_and_scroll(section['index'], call_fun=process_hits, query=scan_query)
        return entrez_to_doc_id
コード例 #45
0
ファイル: document.py プロジェクト: D-I-L/pydgin
    def get_overlapping_hits(self, build, seqid, start, end):
        ''' Return study hit documents (tier <= 2) overlapping start..end.

        A hit's nested build_info must have the given seqid and build, and
        satisfy one of: its start is within the range, its end is within
        the range, or it encloses the range (start <= start and end >= end).
        The search runs against the REGION/STUDY_HITS index. '''
        encloses_range = BoolQuery(must_arr=[RangeQuery("build_info.start", lte=start),
                                             RangeQuery("build_info.end", gte=end)])
        overlap_filter = OrFilter(RangeQuery("build_info.start", gte=start, lte=end))
        overlap_filter.extend(RangeQuery("build_info.end", gte=start, lte=end)) \
                      .extend(encloses_range)

        same_location = BoolQuery(must_arr=[Query.term("build_info.seqid", seqid),
                                            Query.term("build_info.build", build)])
        build_info_query = FilteredQuery(same_location, overlap_filter)

        tier_limit = BoolQuery(must_arr=[RangeQuery("tier", lte=2)])
        # sources=["disease", "marker", "chr_band", "tier", "build_info", "disease_locus"]
        query = ElasticQuery.filtered_bool(Query.nested("build_info", build_info_query), tier_limit)
        search = Search(search_query=query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'))
        return search.search().docs
コード例 #46
0
    def _check_gene_history(cls, gene_sets, section):
        ''' Look up gene ids in the gene history.

        Searches section['index'] / section['index_type_history'] for
        documents whose discontinued_geneid is in gene_sets.

        Returns a tuple (replacements, discontinued): a dict of old id to
        new id (both as strings) for documents that have a geneid, and a
        list of old ids (as strings) for documents whose geneid is None. '''
        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("discontinued_geneid", gene_sets))
        history = Search(query, idx=section['index'], idx_type=section['index_type_history'], size=1000000)

        replacements = {}
        discontinued = []
        for doc in history.search().docs:
            new_id = doc.geneid
            old_id = doc.discontinued_geneid
            if new_id is not None:
                replacements[str(old_id)] = str(new_id)
            else:
                discontinued.append(str(old_id))

        return (replacements, discontinued)
コード例 #47
0
ファイル: views.py プロジェクト: premanand17/django-chicp
def _find_snp_position(snp_track, name):
    ''' Find the position of a named marker in a SNP track.

    snp_track is split on '-' into a group and a track. The index comes
    from the group's CHICP_IDX settings; the type is the TYPE configured
    for the track, or the track name itself when the track has no entry.

    Returns a dict with 'chr', 'start' (end - 1), 'end' and 'name' taken
    from the first match, or a dict with an 'error' message when nothing
    matches. '''
    group, track = re.match(r"(.*)-(.*)", snp_track).group(1, 2)
    group_conf = getattr(chicp_settings, 'CHICP_IDX').get(group)
    index_name = group_conf.get('INDEX')
    track_conf = group_conf.get('TRACKS').get(snp_track)
    index_type = track_conf.get('TYPE') if track_conf else track

    query = ElasticQuery.query_match("name", name)
    matches = Search(query, idx=index_name+'/'+index_type).get_result()['data']
    if not (len(matches) > 0):
        return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}

    first = matches[0]
    chrom = first['seqid'].replace('chr', "")
    position = first['end']
    return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}
コード例 #48
0
    def _check_gene_history(cls, gene_sets, config):
        ''' Look up gene ids in the index described by config['GENE_HISTORY'].

        Searches for documents whose discontinued_geneid is in gene_sets,
        asking for at most len(gene_sets) hits and only the geneid and
        discontinued_geneid fields.

        Returns a tuple (replacements, discontinued): a dict of old id to
        new id (both as strings) for documents that have a geneid, and a
        list of old ids (as strings) for documents whose geneid is None. '''
        history = config['GENE_HISTORY']
        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
                                      sources=['geneid', 'discontinued_geneid'])
        search = Search(query, idx=history['index'], idx_type=history['index_type'],
                        size=len(gene_sets))

        replacements = {}
        discontinued = []
        for doc in search.search().docs:
            new_id = doc.geneid
            old_id = doc.discontinued_geneid
            if new_id is not None:
                replacements[str(old_id)] = str(new_id)
            else:
                discontinued.append(str(old_id))
        return (replacements, discontinued)
コード例 #49
0
ファイル: views.py プロジェクト: D-I-L/pydgin
def interaction_details(request):
    ''' Return, as JSON, the interaction hits whose parent gene is the
    ensembl ID posted as 'ens_id'.

    Every interactor in every hit gets a 'symbol' entry: the symbol of its
    gene document, or the interactor ID when no document is found. '''
    gene_id = request.POST.get('ens_id')
    parent_query = ElasticQuery.has_parent('gene', Query.ids(gene_id))
    search = Search(parent_query, idx=ElasticSettings.idx('GENE', 'INTERACTIONS'), size=500)
    interaction_hits = search.get_json_response()['hits']

    # One lookup for the symbols of all interactors across all hits.
    interactor_ids = [entry['interactor']
                      for hit in interaction_hits['hits']
                      for entry in hit['_source']['interactors']]
    symbol_docs = utils.get_gene_docs_by_ensembl_id(interactor_ids, ['symbol'])

    for hit in interaction_hits['hits']:
        for entry in hit['_source']['interactors']:
            interactor_id = entry['interactor']
            try:
                symbol = symbol_docs[interactor_id].symbol
            except KeyError:
                symbol = interactor_id
            entry['symbol'] = symbol

    return JsonResponse(interaction_hits)
コード例 #50
0
ファイル: views.py プロジェクト: tottlefields/pydgin
def interaction_details(request):
    """ Return, as JSON, the interaction hits whose parent gene is the
    ensembl ID posted as "ens_id".

    Every interactor in every hit gets a "symbol" entry: the symbol of its
    gene document, or the interactor ID when no document is found. """
    gene_id = request.POST.get("ens_id")
    parent_query = ElasticQuery.has_parent("gene", Query.ids(gene_id))
    search = Search(parent_query, idx=ElasticSettings.idx("GENE", "INTERACTIONS"), size=500)
    interaction_hits = search.get_json_response()["hits"]

    # One lookup for the symbols of all interactors across all hits.
    interactor_ids = [
        entry["interactor"]
        for hit in interaction_hits["hits"]
        for entry in hit["_source"]["interactors"]
    ]
    symbol_docs = _get_gene_docs_by_ensembl_id(interactor_ids, ["symbol"])

    for hit in interaction_hits["hits"]:
        for entry in hit["_source"]["interactors"]:
            interactor_id = entry["interactor"]
            try:
                symbol = symbol_docs[interactor_id].symbol
            except KeyError:
                symbol = interactor_id
            entry["symbol"] = symbol

    return JsonResponse(interaction_hits)
コード例 #51
0
ファイル: gene.py プロジェクト: D-I-L/django-data-pipeline
    def _update_gene(cls, genes, idx):
        ''' Use genes data to update the index. '''
        gene_keys = list(genes.keys())
        chunk_size = 450
        for i in range(0, len(genes), chunk_size):
            chunk_gene_keys = gene_keys[i:i+chunk_size]
            json_data = ''

            query = ElasticQuery.filtered(Query.match_all(),
                                          TermsFilter.get_terms_filter("dbxrefs.entrez", chunk_gene_keys))
            docs = Search(query, idx=idx, size=chunk_size).search().docs
            for doc in docs:
                ens_id = doc._meta['_id']
                idx_type = doc.type()
                entrez = getattr(doc, 'dbxrefs')['entrez']
                doc_data = {"update": {"_id": ens_id, "_type": idx_type,
                                       "_index": idx, "_retry_on_conflict": 3}}
                json_data += json.dumps(doc_data) + '\n'
                json_data += json.dumps({'doc': genes[entrez]}) + '\n'
            if json_data != '':
                Loader().bulk_load(idx, idx_type, json_data)
コード例 #52
0
ファイル: views.py プロジェクト: tottlefields/pydgin
def studies_details(request):
    """ Return, as JSON, the study hits for the ensembl ID posted as "ens_id".

    In each hit the "genes" list is replaced by a dict of ensembl ID to
    symbol (the ID itself when no gene document is found; an empty dict
    when the hit has no "genes"), and "pmid" is replaced by a dict carrying
    the pmid and, when the publication document is found, a first-author
    name part and the journal. """
    ens_id = request.POST.get("ens_id")
    sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx("REGION", "STUDY_HITS"), size=500)
    study_hits = elastic.get_json_response()["hits"]

    # Collect every gene and publication ID so each is looked up once.
    ens_ids = []
    pmids = []
    for hit in study_hits["hits"]:
        source = hit["_source"]
        if "pmid" in source:
            pmids.append(source["pmid"])
        # A hit without "genes" must not abort the whole response.
        if "genes" in source:
            ens_ids.extend(source["genes"])
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    pub_docs = _get_pub_docs_by_pmid(pmids, sources=["authors.name", "journal"])

    for hit in study_hits["hits"]:
        source = hit["_source"]
        genes = {}
        if "genes" in source:
            for gene_id in source["genes"]:
                try:
                    genes[gene_id] = getattr(docs[gene_id], "symbol")
                except KeyError:
                    # Fall back to the ID for this gene only; rebinding the
                    # whole dict would drop symbols already found.
                    genes[gene_id] = gene_id
        source["genes"] = genes
        if "pmid" in source:
            pmid = source["pmid"]
            try:
                authors = getattr(pub_docs[pmid], "authors")
                journal = getattr(pub_docs[pmid], "journal")
                source["pmid"] = {
                    "pmid": pmid,
                    # last whitespace-separated token of the first author's
                    # name; empty when the publication lists no authors
                    "author": authors[0]["name"].rsplit(None, 1)[-1] if authors else "",
                    "journal": journal,
                }
            except KeyError:
                source["pmid"] = {"pmid": pmid}

    return JsonResponse(study_hits)
コード例 #53
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
    def get_new_pmids(cls, pmids, idx, disease_code=None):
        ''' Return the PMIDs of a list that are not in the elastic index.

        PMIDs are looked up 800 at a time. When disease_code is given, each
        publication found that does not yet carry the code in
        tags['disease'] gets it appended, and the changed tags are written
        back with one bulk update per batch. The order of pmids is kept in
        the returned list. '''
        chunk_size = 800
        known_pmids = set()
        # NOTE(review): the reason for this pause is not visible here;
        # presumably it waits for earlier writes to become searchable.
        time.sleep(5)

        for offset in range(0, len(pmids), chunk_size):
            batch = pmids[offset:offset+chunk_size]
            terms_filter = TermsFilter.get_terms_filter("pmid", batch)
            query = ElasticQuery.filtered(Query.match_all(), terms_filter, sources=['pmid', 'tags'])

            bulk_lines = []
            for doc in Search(query, idx=idx, size=chunk_size).search().docs:
                known_pmids.add(doc.pmid)
                if disease_code is None:
                    continue

                tags = doc.tags
                diseases = tags['disease'] if 'disease' in tags else []
                if disease_code in diseases:
                    continue

                # tag the publication with the disease and queue the update
                diseases.append(disease_code)
                tags['disease'] = diseases
                idx_name = doc._meta['_index']
                idx_type = doc.type()
                action = {"update": {"_id": doc._meta['_id'], "_type": idx_type,
                                     "_index": idx_name, "_retry_on_conflict": 3}}
                bulk_lines.append(json.dumps(action) + '\n')
                bulk_lines.append(json.dumps({'doc': {'tags': tags}}) + '\n')

            if bulk_lines:
                # index name and type are those of the last document queued
                Loader().bulk_load(idx_name, idx_type, ''.join(bulk_lines))

        return [pmid for pmid in pmids if pmid not in known_pmids]
コード例 #54
0
ファイル: views.py プロジェクト: D-I-L/pydgin
def genesets_details(request):
    ''' Return, as JSON, the pathway gene set hits for the ensembl ID
    posted as 'ens_id'.

    In each hit the 'gene_sets' list is replaced by a dict of ensembl ID
    to symbol, using the ID itself when no gene document is found. '''
    gene_id = request.POST.get('ens_id')
    member_filter = Filter(Query.query_string(gene_id, fields=["gene_sets"]).query_wrap())
    search = Search(ElasticQuery.filtered(Query.match_all(), member_filter),
                    idx=ElasticSettings.idx('GENE', 'PATHWAY'), size=500)
    genesets_hits = search.get_json_response()['hits']

    # One lookup for the symbols of all member genes across all hits.
    member_ids = [member
                  for hit in genesets_hits['hits']
                  for member in hit['_source']['gene_sets']]
    symbol_docs = utils.get_gene_docs_by_ensembl_id(member_ids, ['symbol'])

    for hit in genesets_hits['hits']:
        labelled = {}
        for member in hit['_source']['gene_sets']:
            try:
                labelled[member] = symbol_docs[member].symbol
            except KeyError:
                labelled[member] = member
        hit['_source']['gene_sets'] = labelled
    return JsonResponse(genesets_hits)
コード例 #55
0
ファイル: views.py プロジェクト: tottlefields/pydgin
def genesets_details(request):
    """ Return, as JSON, the pathway gene set hits for the ensembl ID
    posted as "ens_id".

    In each hit the "gene_sets" list is replaced by a dict of ensembl ID
    to symbol, using the ID itself when no gene document is found. """
    gene_id = request.POST.get("ens_id")
    member_filter = Filter(Query.query_string(gene_id, fields=["gene_sets"]).query_wrap())
    search = Search(
        ElasticQuery.filtered(Query.match_all(), member_filter),
        idx=ElasticSettings.idx("GENE", "PATHWAY"),
        size=500,
    )
    genesets_hits = search.get_json_response()["hits"]

    # One lookup for the symbols of all member genes across all hits.
    member_ids = [
        member
        for hit in genesets_hits["hits"]
        for member in hit["_source"]["gene_sets"]
    ]
    symbol_docs = _get_gene_docs_by_ensembl_id(member_ids, ["symbol"])

    for hit in genesets_hits["hits"]:
        labelled = {}
        for member in hit["_source"]["gene_sets"]:
            try:
                labelled[member] = symbol_docs[member].symbol
            except KeyError:
                labelled[member] = member
        hit["_source"]["gene_sets"] = labelled
    return JsonResponse(genesets_hits)
コード例 #56
0
    def _convert_entrezid2ensembl(cls, gene_sets, section, log_output_file_handler=None, log_conversion=True):
        ''' Convert a set of entrez ids to the ids of the matching gene documents.

        The ids are first checked against the gene history and old ids are
        replaced with new ones. The index section['index'] is then searched
        for documents whose dbxrefs.entrez is one of the resulting ids, and
        the '_id' of every document found is returned in a list.

        The conversion is logged only when log_conversion is true and a
        log_output_file_handler is supplied. '''
        newgene_ids, discontinued_ids = cls._check_gene_history(gene_sets, section)
        current_ids = cls._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)

        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids))
        found = Search(query, idx=section['index'], size=1000000).search().docs
        ensembl_ids = [doc._meta['_id'] for doc in found]

        if log_conversion and log_output_file_handler is not None:
            cls._log_entrezid2ensembl_coversion(current_ids, ensembl_ids, log_output_file_handler)

        return ensembl_ids
コード例 #57
0
ファイル: views.py プロジェクト: premanand17/django-chicp
def _build_snp_query(snp_track, chrom, segmin, segmax):
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # get SNPs based on this segment
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        snp_track_idx = getattr(chicp_settings, 'CHICP_IDX').get(group).get('INDEX')
        snp_track_type = ''
        if getattr(chicp_settings, 'CHICP_IDX').get(group).get('TRACKS').get(snp_track):
            snp_track_type = getattr(chicp_settings, 'CHICP_IDX').get(group).get('TRACKS') \
                .get(snp_track).get('TYPE')
        else:
            snp_track_type = track

        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=2000000, idx=snp_track_idx+'/'+snp_track_type)

        snpResult = snpQuery.get_result()
        snps = snpResult['data']
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)

        data_type = getattr(chicp_settings, 'CHICP_IDX').get(group).get('DATA_TYPE')
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)
#        if 'max' in snpSettings:
#            maxScore = float(snpSettings['max'])
#        else:
        for s in snps:
            if float(s['score']) > maxScore:
                maxScore = float(s['score'])
        snpSettings['max'] = maxScore

        snpMeta = snpSettings

    return snps, snpMeta
コード例 #58
0
        def check_hits(resp_json):
            ''' Callback for a page of hits: log documents that share an
            internal_id but differ in the position of their highest build.

            Documents are accumulated in internal_id, keyed by their
            internal_id attribute, so each new document is compared with
            those already collected under the same key. '''
            self.assertTrue('hits' in resp_json, 'scan and scroll hits')
            page_hits = resp_json['hits']['hits']
            self.assertGreaterEqual(len(page_hits), 1)

            for doc1 in [Document(hit) for hit in page_hits]:
                doc_internal_id = doc1.internal_id
                if doc_internal_id not in internal_id:
                    # first document seen for this id: nothing to compare with
                    internal_id[doc_internal_id] = [doc1]
                    continue

                pos1 = self._get_highest_build(doc1)
                for doc2 in internal_id[doc_internal_id]:
                    pos2 = self._get_highest_build(doc2)
                    if pos2['position'] != pos1['position']:
                        msg = ("DIFFERENT POSITIONS ID: "+str(doc_internal_id)+":\t" +
                               str(doc1.name)+": "+pos1['position']+" ("+doc1.doc_id()+")\t" +
                               str(doc2.name)+": "+pos2['position']+" ("+doc2.doc_id()+")\t")
                        rs_id = 'rs'+str(doc_internal_id)
                        try:
                            # list the markers found at either position
                            position_filter = TermsFilter.get_terms_filter(
                                "start", [pos1['position'], pos2['position']])
                            query = ElasticQuery.filtered(Query.term("seqid", pos1['seqid']), position_filter)
                            search = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                            rs_found = False
                            for marker in search.search().docs:
                                msg += marker.id+": "+str(marker.start)+"\t"
                                if marker.id == rs_id:
                                    rs_found = True

                            if not rs_found:
                                # the rs id is at neither position; add
                                # what self._rs_exists reports for it
                                msg += rs_id
                                msg += ' EXISTS IN DBSNP\t' if self._rs_exists(rs_id) else ' NOT IN DBSNP\t'
                            logger.error(msg)
                        except KeyError:
                            # log what has been gathered so far
                            logger.error(msg)
                internal_id[doc_internal_id].append(doc1)
コード例 #59
0
ファイル: views.py プロジェクト: D-I-L/django-chicp
def _build_snp_query(snp_track, chrom, segmin, segmax):
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # get SNPs based on this segment
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track

        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=10000, idx=snp_track_idx)

        # snpResult = snpQuery.get_result()
        # snps = snpResult['data']
        snpResult = snpQuery.get_json_response()
        snps = []
        for hit in snpResult['hits']['hits']:
            snps.append(hit['_source'])
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)

        data_type = ElasticSettings.get_label('CP_STATS_'+group.upper(), None, "data_type")
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)

        for s in snps:
            if float(s['score']) > maxScore:
                maxScore = float(s['score'])
        snpSettings['max'] = maxScore

        snpMeta = snpSettings

    return snps, snpMeta