def codsearch_composition(composition, tolerance):
    """
    Yield identifiers of 'cod' index entries matching a composition.

    Parameters
    ----------
    composition : str
        Chemical composition to be matched.  It is split by
        `composition_analysis` into element symbols and their counts.
    tolerance : float or None
        Maximum allowed difference for each element count.  Zero or
        None requests an exact match of every count, which is the same
        convention `cifsearch` uses for its `tol` argument.

    Yields
    ------
    The value of ``normcodid(hit['_id'])`` for every matching document.

    Notes
    -----
    This is a generator, so the imports, the Elasticsearch connection
    and the query are all deferred until the first item is requested.
    """
    from diffpy.pdfgetx.functs import composition_analysis
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import scan
    es = Elasticsearch(ELASTICHOST)
    smbls, counts = composition_analysis(composition)
    if not tolerance:
        # Exact match.  A None tolerance lands here as well instead of
        # failing with TypeError in the range arithmetic below.
        clauses = [
            {'term': {("composition." + s): c}}
            for s, c in zip(smbls, counts)
        ]
    else:
        clauses = [
            {
                'range': {
                    ("composition." + s): {
                        "gte": c - tolerance,
                        "lte": c + tolerance,
                    }
                }
            }
            for s, c in zip(smbls, counts)
        ]
    # Every element of the composition must satisfy its own clause.
    q = {"bool": {"must": clauses}}
    # Only document ids are needed, so _source retrieval is disabled.
    gscan = scan(es, query={'query': q}, index='cod', doc_type='cif',
                 _source=False)
    for e in gscan:
        yield normcodid(e['_id'])
def normalized_formula(formula):
    """
    Return the fraction of the total count contributed by each symbol.

    Parameters
    ----------
    formula : str
        Chemical formula passed to `composition_analysis`.
        NOTE(review): the str type is assumed from usage -- confirm.

    Returns
    -------
    dict
        Maps each symbol to its summed count divided by the sum of all
        counts.  Counts of a symbol that occurs more than once are
        added together before the division.
    """
    from diffpy.pdfgetx.functs import composition_analysis
    symbols, amounts = composition_analysis(formula)
    total = sum(amounts)
    # Seed every symbol with a float zero so repeated symbols accumulate.
    tally = dict.fromkeys(symbols, 0.0)
    for symbol, amount in zip(symbols, amounts):
        tally[symbol] = tally[symbol] + amount
    return {symbol: amount / total for symbol, amount in tally.items()}
def cifsearch(q=None, composition=None, tol=None, fields=None, **kwargs):
    """
    Execute search for CIF structures in the 'cod' Elasticsearch index.

    Parameters
    ----------
    q : str, optional
        The string search query in Lucene syntax.
    query : dict, optional, keyword-only
        The search definition using the Query DSL.  It is sent as the
        request body.
    composition : str, optional
        Normalized chemical stoichiometry to be matched.  When a `query`
        dict is given as well, the composition clauses are combined with
        its query (logical AND) and its other keys are preserved.
    tol : float, optional
        Maximum allowed difference from stoichiometry.  When zero or
        None the element counts must match exactly.
    fields : list or str, optional
        Name of CIF fields to be returned.  A string may hold several
        names separated by commas or whitespace.
    kwargs : misc, optional
        Extra arguments passed to the `Elasticsearch.search` function.

    Returns
    -------
    dict or list
        The unmodified return value of `Elasticsearch.search` when no
        `fields` are requested.  Otherwise a list with one item per hit:
        a tuple of the requested field values (None where a field is
        absent from the hit), or the bare value when exactly one field
        is requested.
    """
    from diffpy.pdfgetx.functs import composition_analysis
    from elasticsearch import Elasticsearch
    es = Elasticsearch(ELASTICHOST)
    kw = dict(index='cod')
    # Forward the query string only when the caller supplied one.
    if q is not None:
        kw['q'] = q
    if 'query' in kwargs:
        kw['body'] = kwargs.pop('query')
    if composition:
        smbls, counts = composition_analysis(composition)
        if not tol:
            clauses = [
                {'term': {("composition." + s): c}}
                for s, c in zip(smbls, counts)
            ]
        else:
            clauses = [
                {
                    'range': {
                        ("composition." + s): {
                            "gte": c - tol,
                            "lte": c + tol,
                        }
                    }
                }
                for s, c in zip(smbls, counts)
            ]
        user_body = kw.get('body')
        if isinstance(user_body, dict) and user_body:
            # Keep the caller's search definition instead of silently
            # discarding it.  Work on a shallow copy so the caller's
            # dict is never mutated.
            body = dict(user_body)
            if 'query' in body:
                clauses = [body['query']] + clauses
            body['query'] = {"bool": {"must": clauses}}
            kw['body'] = body
        else:
            kw['body'] = {'query': {"bool": {"must": clauses}}}
    # NOTE(review): the Elasticsearch search API documents that the `q`
    # URL parameter takes precedence over a query in the request body;
    # confirm the intended behavior when `q` is combined with
    # `composition` or `query`.
    kw.update(**kwargs)
    if isinstance(fields, str):
        fields = fields.replace(',', ' ').split()
    if not fields:
        return es.search(**kw)
    # Restrict the returned document source to the requested fields.
    kw['_source'] = fields
    res = es.search(**kw)
    rv = [
        tuple(hit['_source'].get(n) for n in fields)
        for hit in res['hits']['hits']
    ]
    if len(fields) == 1:
        # Single field requested: unwrap the one-element tuples.
        rv = [x[0] for x in rv]
    return rv