def _es_field_block(self, conn, type, now, max_wait=False):
    """
    Poll the index (every 0.5s) until the record with this object's id is
    visible with a ``last_updated`` value equal to ``now``.

    :param conn: ES connection passed through to raw.search
    :param type: index type to query
    :param now: the last_updated timestamp we are waiting to see indexed
    :param max_wait: maximum seconds to wait, or False to wait indefinitely
    :raises StoreException: if more than one record matches this id
    """
    # ask only for the last_updated field of the record with our exact id
    query = {
        "query": {"term": {"id.exact": self.id}},
        "fields": ["last_updated"]
    }
    elapsed = 0.0
    # keep polling until the timeout (if any) expires
    while max_wait is False or elapsed < max_wait:
        results = raw.unpack_result(raw.search(conn, type, query))
        if len(results) > 1:
            raise StoreException("More than one record with id {x}".format(x=self.id))
        # NOTE: only works on ES > 1.x
        if results and results[0].get("last_updated")[0] == now:
            return
        # not there yet (or still the old version) - back off briefly
        time.sleep(0.5)
        elapsed += 0.5
def compare_index_counts(conns, types, q=None): """ Compare two or more indexes by doc counts of given types. Returns True if all counts equal, False otherwise """ if q is not None: q = q.copy() if "size" not in q or q['size'] != 0: q["size"] = 0 if q is None: q = {"query": {"match_all": {}}, "size": 0} equal_counts = [] for t in types: print "\ntype:", t counts = [] for c in conns: resp = raw.search(connection=c, type=t, query=q) try: count = resp.json()["hits"]["total"] counts.append(count) print "index {0}: {1}".format(c.index, count) except KeyError: print resp.json() equal_counts.append(reduce(lambda x, y: x == y, counts)) return reduce(lambda x, y: x and y, equal_counts)
def _es_field_block(self, conn, type, now, max_wait=False):
    """
    Busy-wait (polling every 0.5s) until the record with this object's id is
    indexed with a ``last_updated`` value equal to ``now``.

    :param conn: ES connection passed through to raw.search
    :param type: index type to search in
    :param now: the last_updated timestamp value to wait for
    :param max_wait: maximum seconds to wait, or False to wait indefinitely
    :raises StoreException: if more than one record matches this id
    """
    # request only the last_updated field for the record with our exact id
    q = {
        "query": {
            "term": {
                "id.exact": self.id
            }
        },
        "fields": ["last_updated"]
    }
    waited = 0.0
    while True:
        # give up once we have waited at least max_wait seconds (False = no limit)
        if max_wait is not False and waited >= max_wait:
            break
        res = raw.search(conn, type, q)
        j = raw.unpack_result(res)
        if len(j) == 0:
            # record not visible in the index yet - poll again shortly
            time.sleep(0.5)
            waited += 0.5
            continue
        if len(j) > 1:
            raise StoreException(
                "More than one record with id {x}".format(x=self.id))
        if j[0].get(
                "last_updated")[0] == now:  # NOTE: only works on ES > 1.x
            break
        else:
            # found, but still the previous version - keep waiting for refresh
            time.sleep(0.5)
            waited += 0.5
            continue
def save(self, conn=None, makeid=True, created=True, updated=True, blocking=False, type=None):
    """
    Store this object's data in the index.

    :param conn: ES connection; defaults to self.__conn__
    :param makeid: if True, generate and set an id when the data has none
    :param created: if True, set created_date when not already present
    :param updated: if True, set last_updated to the current time
    :param blocking: if True, poll the index after the store until the new
        last_updated value is visible (i.e. the index has refreshed)
    :param type: index type to write to, resolved via self.get_write_type
    :raises StoreException: on blocking save without updated, or duplicate ids
    """
    if conn is None:
        conn = self.__conn__
    type = self.get_write_type(type)

    if blocking and not updated:
        raise StoreException("Unable to do blocking save on record where last_updated is not set")

    now = util.now()
    if blocking and now == self.last_updated:
        # the new last_updated must differ from the old one so we can detect
        # the refresh; timestamp granularity is seconds, so sleep for 1
        time.sleep(1)
        now = util.now()

    # the main body of the save
    if makeid and "id" not in self.data:
        self.id = self.makeid()
    if created and 'created_date' not in self.data:
        self.data['created_date'] = now
    if updated:
        self.data['last_updated'] = now

    raw.store(conn, type, self.data, self.id)

    if not blocking:
        return

    # poll every 0.5s (no timeout) until the new last_updated is visible
    block_query = {
        "query": {"term": {"id.exact": self.id}},
        "fields": ["last_updated"]
    }
    while True:
        results = raw.unpack_result(raw.search(conn, type, block_query))
        if len(results) > 1:
            raise StoreException("More than one record with id {x}".format(x=self.id))
        # NOTE: only works on ES > 1.x
        if results and results[0].get("last_updated")[0] == now:
            return
        time.sleep(0.5)
def iterate(conn, type, q, page_size=1000, limit=None, method="POST", keyword_subfield="exact"):
    """
    Generator yielding every result of ``q``, fetched page by page.

    :param conn: ES connection passed through to raw.search
    :param type: index type to query
    :param q: query dict; copied, so the caller's dict is not modified
    :param page_size: number of records fetched per request
    :param limit: maximum number of records to yield, or None for all
    :param method: HTTP method for the search request
    :param keyword_subfield: keyword subfield name used to build the id sort
    """
    query = q.copy()
    query["size"] = page_size
    query["from"] = 0
    if "sort" not in query:
        # to ensure complete coverage on a changing index, sort by id is our best bet
        query["sort"] = [{"id." + keyword_subfield: {"order": "asc"}}]

    emitted = 0
    # stop as soon as the limit is reached or a page comes back empty
    while limit is None or emitted < int(limit):
        page = raw.unpack_result(raw.search(conn, type=type, query=query, method=method))
        if len(page) == 0:
            return
        for record in page:
            if limit is not None and emitted >= int(limit):
                return
            emitted += 1
            yield record
        query["from"] += page_size
def iterate(conn, type, q, page_size=1000, limit=None, method="POST"):
    """
    Generator yielding every result of ``q``, one page at a time.

    :param conn: ES connection passed through to raw.search
    :param type: index type to query
    :param q: query dict; copied, so the caller's dict is not modified
    :param page_size: number of records fetched per request
    :param limit: maximum number of records to yield, or None for all
    :param method: HTTP method for the search request
    """
    query = q.copy()
    query.update({"size": page_size, "from": 0})
    # to ensure complete coverage on a changing index, sort by id is our best bet
    query.setdefault("sort", [{"id": {"order": "asc"}}])

    returned = 0
    while True:
        # apply the limit before fetching another page
        if limit is not None and returned >= int(limit):
            break
        results = raw.unpack_result(raw.search(conn, type=type, query=query, method=method))
        if len(results) == 0:
            break
        for record in results:
            # apply the limit (again) mid-page
            if limit is not None and returned >= int(limit):
                break
            returned += 1
            yield record
        query["from"] += page_size
def save(self, conn=None, makeid=True, created=True, updated=True, blocking=False, type=None):
    """
    Store this object's data in the index.

    :param conn: ES connection; defaults to self._get_connection()
    :param makeid: if True, generate and set an id when the data has none
    :param created: if True, set created_date when not already present
    :param updated: if True, set last_updated to the current time
    :param blocking: if True, poll the index after the store until the new
        last_updated value is visible (i.e. the index has refreshed)
    :param type: index type to write to; defaults to self._get_write_type()
    :raises StoreException: on blocking save without updated, or duplicate ids
    """
    if conn is None:
        conn = self._get_connection()
    if type is None:
        type = self._get_write_type()
    if blocking and not updated:
        raise StoreException(
            "Unable to do blocking save on record where last_updated is not set"
        )
    now = util.now()
    if blocking:
        # we need the new last_updated time to be later than the old one,
        # otherwise we cannot detect when the index has refreshed
        if now == self.last_updated:
            time.sleep(
                1)  # timestamp granularity is seconds, so just sleep for 1
            now = util.now()  # update the new timestamp

    # the main body of the save
    if makeid:
        if "id" not in self.data:
            self.id = self.makeid()
    if created:
        if 'created_date' not in self.data:
            self.data['created_date'] = now
    if updated:
        self.data['last_updated'] = now

    raw.store(conn, type, self.data, self.id)

    if blocking:
        # poll every 0.5s (no timeout) until the new last_updated is visible
        q = {
            "query": {
                "term": {
                    "id.exact": self.id
                }
            },
            "fields": ["last_updated"]
        }
        while True:
            res = raw.search(conn, type, q)
            j = raw.unpack_result(res)
            if len(j) == 0:
                # record not visible in the index yet
                time.sleep(0.5)
                continue
            if len(j) > 1:
                raise StoreException(
                    "More than one record with id {x}".format(x=self.id))
            if j[0].get("last_updated"
                        )[0] == now:  # NOTE: only works on ES > 1.x
                break
            else:
                # still the previous version - keep waiting for the refresh
                time.sleep(0.5)
                continue
def query(cls, q='', terms=None, should_terms=None, facets=None, conn=None, types=None, **kwargs):
    '''Perform a query on backend.

    :param q: maps to query_string parameter if string, or query dict if dict.
        NOTE(review): when a dict is passed it is modified in place - confirm
        callers do not rely on their dict staying untouched.
    :param terms: dictionary of terms to filter on. values should be lists.
        (scalar values are wrapped into lists in place, mutating the caller's dict)
    :param should_terms: dictionary of terms appended as "terms" clauses.
        NOTE(review): despite the name, these are added to the "must" clause,
        not "should" - confirm intended.
    :param facets: dict of facets to return from the query.
    :param conn: ES connection; defaults to cls.__conn__.
    :param types: read type(s), resolved via cls.get_read_types.
    :param kwargs: any keyword args as per
        http://www.elasticsearch.org/guide/reference/api/search/uri-request.html
    '''
    if conn is None:
        conn = cls.__conn__
    types = cls.get_read_types(types)

    # normalise q into a {"query": {"bool": {"must": [...]}}} structure
    if isinstance(q, dict):
        query = q
        if 'bool' not in query['query']:
            boolean = {'bool': {'must': []}}
            boolean['bool']['must'].append(query['query'])
            query['query'] = boolean
        if 'must' not in query['query']['bool']:
            query['query']['bool']['must'] = []
    elif q:
        query = {
            'query': {
                'bool': {
                    'must': [{
                        'query_string': {
                            'query': q
                        }
                    }]
                }
            }
        }
    else:
        query = {'query': {'bool': {'must': [{'match_all': {}}]}}}

    if facets:
        if 'facets' not in query:
            query['facets'] = {}
        for k, v in facets.items():
            query['facets'][k] = {"terms": v}

    if terms:
        # build a fresh bool clause from the term filters; this REPLACES
        # query['query'] below, folding the original q back in where possible
        boolean = {'must': []}
        for term in terms:
            if not isinstance(terms[term], list):
                terms[term] = [terms[term]]
            for val in terms[term]:
                obj = {'term': {}}
                obj['term'][term] = val
                boolean['must'].append(obj)
        if q and not isinstance(q, dict):
            boolean['must'].append({'query_string': {'query': q}})
        elif q and 'query' in q:
            boolean['must'].append(query['query'])
        query['query'] = {'bool': boolean}

    for k, v in kwargs.items():
        # '_from' is accepted because 'from' is a python reserved word
        if k == '_from':
            query['from'] = v
        else:
            query[k] = v

    if should_terms is not None and len(should_terms) > 0:
        for s in should_terms:
            if not isinstance(should_terms[s], list):
                should_terms[s] = [should_terms[s]]
            query["query"]["bool"]["must"].append(
                {"terms": {
                    s: should_terms[s]
                }})

    r = raw.search(conn, types, query)
    return r.json()
def query(cls, q='', terms=None, should_terms=None, facets=None, conn=None, types=None, **kwargs):
    """
    Perform a query on backend.

    :param q: maps to query_string parameter if string, or query dict if dict.
        NOTE(review): a dict argument is modified in place - confirm callers
        do not rely on their dict staying untouched.
    :param terms: dictionary of terms to filter on. values should be lists.
        (scalar values are wrapped into lists in place, mutating the caller's dict)
    :param should_terms: dictionary of terms appended as "terms" clauses.
        NOTE(review): despite the name, these are appended to "must", not
        "should" - confirm intended.
    :param facets: dict of facets to return from the query.
    :param conn: ES connection; defaults to cls.__conn__.
    :param types: read type(s), resolved via cls.get_read_types.
    :param kwargs: any keyword args as per
        http://www.elasticsearch.org/guide/reference/api/search/uri-request.html
    """
    if conn is None:
        conn = cls.__conn__
    types = cls.get_read_types(types)

    # normalise q into a {"query": {"bool": {"must": [...]}}} structure
    if isinstance(q, dict):
        query = q
        if 'bool' not in query['query']:
            boolean = {'bool': {'must': []}}
            boolean['bool']['must'].append(query['query'])
            query['query'] = boolean
        if 'must' not in query['query']['bool']:
            query['query']['bool']['must'] = []
    elif q:
        query = {
            'query': {
                'bool': {
                    'must': [
                        {'query_string': {'query': q}}
                    ]
                }
            }
        }
    else:
        query = {
            'query': {
                'bool': {
                    'must': [
                        {'match_all': {}}
                    ]
                }
            }
        }

    if facets:
        if 'facets' not in query:
            query['facets'] = {}
        for k, v in facets.items():
            query['facets'][k] = {"terms": v}

    if terms:
        # build a fresh bool clause from the term filters; this REPLACES
        # query['query'] below, folding the original q back in where possible
        boolean = {'must': []}
        for term in terms:
            if not isinstance(terms[term], list):
                terms[term] = [terms[term]]
            for val in terms[term]:
                obj = {'term': {}}
                obj['term'][term] = val
                boolean['must'].append(obj)
        if q and not isinstance(q, dict):
            boolean['must'].append({'query_string': {'query': q}})
        elif q and 'query' in q:
            boolean['must'].append(query['query'])
        query['query'] = {'bool': boolean}

    for k, v in kwargs.items():
        # '_from' is accepted because 'from' is a python reserved word
        if k == '_from':
            query['from'] = v
        else:
            query[k] = v

    if should_terms is not None and len(should_terms) > 0:
        for s in should_terms:
            if not isinstance(should_terms[s], list):
                should_terms[s] = [should_terms[s]]
            query["query"]["bool"]["must"].append({"terms": {s: should_terms[s]}})

    r = raw.search(conn, types, query)
    return r.json()
from esprit import tasks, raw

'''Delete all rejected applications in DOAJ'''

# Connection to the ES index
conn = raw.make_connection(None, 'localhost', 9200, 'doaj')

# Match every suggestion (application) whose status is exactly "rejected"
rej_query = {
    "query": {
        "term": {
            "admin.application_status.exact": "rejected"
        }
    }
}

json_writer = tasks.JSONListWriter('rejected_applications.json')

# Dump all rejected suggestions to file (a copy of the query is passed so the
# dump cannot modify rej_query before the count/delete below)
tasks.dump(conn, 'suggestion', q=rej_query.copy(), out=json_writer)
json_writer.close()

# Ask how many rejected applications will be deleted.
n_deleted = raw.search(conn, 'suggestion', rej_query).json()['hits']['total']

# Delete all rejected suggestions.
raw.delete_by_query(conn, 'suggestion', rej_query)

print "\n{0} suggestions archived to file 'rejected_applications.json' and deleted from index.".format(n_deleted)