Example #1
0
 def _es_field_block(self, conn, type, now, max_wait=False):
     q = {
         "query": {
             "term": {"id.exact": self.id}
         },
         "fields": ["last_updated"]
     }
     waited = 0.0
     while True:
         if max_wait is not False and waited >= max_wait:
             break
         res = raw.search(conn, type, q)
         j = raw.unpack_result(res)
         if len(j) == 0:
             time.sleep(0.5)
             waited += 0.5
             continue
         if len(j) > 1:
             raise StoreException("More than one record with id {x}".format(x=self.id))
         if j[0].get("last_updated")[0] == now:  # NOTE: only works on ES > 1.x
             break
         else:
             time.sleep(0.5)
             waited += 0.5
             continue
Example #2
0
def compare_index_counts(conns, types, q=None):
    """ Compare two or more indexes by doc counts of given types.

    For every type, each connection is searched with ``q`` (forced to
    ``size: 0``; a match_all query is used when ``q`` is None) and the
    ``hits.total`` value of each response is collected and printed.

    :param conns: iterable of connections; each must expose ``.index``
    :param types: iterable of type names to compare
    :param q: optional query dict; it is copied, the caller's dict is not modified
    :return: True if, for every type, all collected counts are equal; False otherwise.
        A type with no collected counts (or a single one) is treated as equal.
    """
    if q is None:
        q = {"query": {"match_all": {}}, "size": 0}
    else:
        # only the counts are needed, so never ask for documents
        q = q.copy()
        q["size"] = 0

    all_equal = True

    for t in types:
        print("\ntype: %s" % (t,))
        counts = []
        for c in conns:
            resp = raw.search(connection=c, type=t, query=q)
            try:
                count = resp.json()["hits"]["total"]
            except KeyError:
                # response without a hit count: show it and leave it out of the comparison
                print(resp.json())
                continue
            counts.append(count)
            print("index {0}: {1}".format(c.index, count))

        # Compare every count with the first one. Folding with `x == y` would
        # compare a boolean with the third count and give wrong answers for
        # three or more indexes.
        if not all(count == counts[0] for count in counts):
            all_equal = False

    return all_equal
Example #3
0
 def _es_field_block(self, conn, type, now, max_wait=False):
     q = {
         "query": {
             "term": {
                 "id.exact": self.id
             }
         },
         "fields": ["last_updated"]
     }
     waited = 0.0
     while True:
         if max_wait is not False and waited >= max_wait:
             break
         res = raw.search(conn, type, q)
         j = raw.unpack_result(res)
         if len(j) == 0:
             time.sleep(0.5)
             waited += 0.5
             continue
         if len(j) > 1:
             raise StoreException(
                 "More than one record with id {x}".format(x=self.id))
         if j[0].get(
                 "last_updated")[0] == now:  # NOTE: only works on ES > 1.x
             break
         else:
             time.sleep(0.5)
             waited += 0.5
             continue
Example #4
0
    def save(self, conn=None, makeid=True, created=True, updated=True, blocking=False, type=None):
        """Store this record, optionally waiting until the new version is searchable.

        :param conn: connection to store through; ``self.__conn__`` is used when None
        :param makeid: if true and ``self.data`` has no "id", set ``self.id`` from ``self.makeid()``
        :param created: if true and ``self.data`` has no "created_date", set it to the save timestamp
        :param updated: if true, set "last_updated" in ``self.data`` to the save timestamp
        :param blocking: if true, poll the index after storing until the record is
            returned with the new "last_updated" value (there is no time limit)
        :param type: passed to ``self.get_write_type`` to resolve the type written to
        :raises StoreException: if ``blocking`` is requested while ``updated`` is false,
            or if the blocking lookup finds more than one record with this id
        """
        conn = self.__conn__ if conn is None else conn
        type = self.get_write_type(type)

        if blocking and not updated:
            raise StoreException("Unable to do blocking save on record where last_updated is not set")

        stamp = util.now()
        if blocking:
            # the new last_updated must differ from the value already on the record,
            # otherwise the polling below cannot tell the old version from the new one
            if stamp == self.last_updated:
                time.sleep(1)   # timestamp granularity is seconds, so just sleep for 1
            stamp = util.now()  # refresh the timestamp after the possible sleep

        # the main body of the save
        if makeid and "id" not in self.data:
            self.id = self.makeid()
        if created and 'created_date' not in self.data:
            self.data['created_date'] = stamp
        if updated:
            self.data['last_updated'] = stamp

        raw.store(conn, type, self.data, self.id)

        if not blocking:
            return

        lookup = {
            "query": {"term": {"id.exact": self.id}},
            "fields": ["last_updated"]
        }
        while True:
            hits = raw.unpack_result(raw.search(conn, type, lookup))
            if len(hits) > 1:
                raise StoreException("More than one record with id {x}".format(x=self.id))
            # NOTE: only works on ES > 1.x
            if len(hits) == 1 and hits[0].get("last_updated")[0] == stamp:
                break
            # not indexed yet, or still showing another timestamp: wait and retry
            time.sleep(0.5)
Example #5
0
def iterate(conn, type, q, page_size=1000, limit=None, method="POST", keyword_subfield="exact"):
    """Yield search results for ``q`` one at a time, fetching them page by page.

    A shallow copy of ``q`` is paged with ``size``/``from``; the caller's
    top-level dict is left untouched.  If the query has no "sort", results
    are sorted ascending on ``"id." + keyword_subfield``.

    :param conn: connection handed to ``raw.search``
    :param type: index type handed to ``raw.search``
    :param q: query dict
    :param page_size: number of results requested per search
    :param limit: maximum number of results to yield (converted with ``int``); None for no limit
    :param method: HTTP method handed to ``raw.search``
    :param keyword_subfield: sub-field of "id" used for the default sort
    """
    query = q.copy()
    query["size"] = page_size
    query["from"] = 0
    if "sort" not in query:
        # to ensure complete coverage on a changing index, sort by id is our best bet
        query["sort"] = [{"id." + keyword_subfield: {"order": "asc"}}]

    cap = None if limit is None else int(limit)
    yielded = 0
    while cap is None or yielded < cap:
        page = raw.unpack_result(raw.search(conn, type=type, query=query, method=method))
        if len(page) == 0:
            break

        for record in page:
            # the limit may be reached part-way through a page
            if cap is not None and yielded >= cap:
                break
            yielded += 1
            yield record

        query["from"] += page_size
Example #6
0
def iterate(conn, type, q, page_size=1000, limit=None, method="POST"):
    """Generate the results of ``q`` lazily, requesting ``page_size`` results per search.

    Works on a shallow copy of ``q`` whose ``size`` and ``from`` are set for
    paging.  When the query carries no "sort", an ascending sort on "id" is added.

    :param conn: connection handed to ``raw.search``
    :param type: index type handed to ``raw.search``
    :param q: query dict
    :param page_size: number of results requested per search
    :param limit: maximum number of results to yield (converted with ``int``); None for no limit
    :param method: HTTP method handed to ``raw.search``
    """
    paged = q.copy()
    paged["size"] = page_size
    paged["from"] = 0
    # to ensure complete coverage on a changing index, sort by id is our best bet
    paged.setdefault("sort", [{"id": {"order": "asc"}}])

    emitted = 0

    def limit_reached():
        return limit is not None and emitted >= int(limit)

    while not limit_reached():
        response = raw.search(conn, type=type, query=paged, method=method)
        batch = raw.unpack_result(response)
        if len(batch) == 0:
            break

        for item in batch:
            # re-check per item so a page never pushes us past the limit
            if limit_reached():
                break
            emitted += 1
            yield item

        paged["from"] += page_size
Example #7
0
    def save(self,
             conn=None,
             makeid=True,
             created=True,
             updated=True,
             blocking=False,
             type=None):
        """Store this record, optionally blocking until the new version can be searched.

        :param conn: connection to store through; ``self._get_connection()`` is used when None
        :param makeid: if true and ``self.data`` has no "id", set ``self.id`` from ``self.makeid()``
        :param created: if true and ``self.data`` has no "created_date", set it to the save timestamp
        :param updated: if true, set "last_updated" in ``self.data`` to the save timestamp
        :param blocking: if true, poll the index after storing until the record is
            returned with the new "last_updated" value (there is no time limit)
        :param type: type to write to; ``self._get_write_type()`` is used when None
        :raises StoreException: if ``blocking`` is requested while ``updated`` is false,
            or if the blocking lookup finds more than one record with this id
        """
        if conn is None:
            conn = self._get_connection()
        if type is None:
            type = self._get_write_type()

        if blocking and not updated:
            raise StoreException(
                "Unable to do blocking save on record where last_updated is not set"
            )

        save_time = util.now()
        if blocking:
            # the new last_updated must differ from the value already on the record,
            # otherwise the polling below cannot tell the old version from the new one
            if save_time == self.last_updated:
                # timestamp granularity is seconds, so just sleep for 1
                time.sleep(1)
            save_time = util.now()  # refresh the timestamp after the possible sleep

        # the main body of the save
        if makeid and "id" not in self.data:
            self.id = self.makeid()
        if created and 'created_date' not in self.data:
            self.data['created_date'] = save_time
        if updated:
            self.data['last_updated'] = save_time

        raw.store(conn, type, self.data, self.id)

        if not blocking:
            return

        id_query = {
            "query": {"term": {"id.exact": self.id}},
            "fields": ["last_updated"]
        }
        indexed = False
        while not indexed:
            found = raw.unpack_result(raw.search(conn, type, id_query))
            if len(found) > 1:
                raise StoreException(
                    "More than one record with id {x}".format(x=self.id))
            if len(found) == 1:
                # NOTE: only works on ES > 1.x
                indexed = found[0].get("last_updated")[0] == save_time
            if not indexed:
                # not searchable yet, or still showing another timestamp
                time.sleep(0.5)
Example #8
0
    def query(cls,
              q='',
              terms=None,
              should_terms=None,
              facets=None,
              conn=None,
              types=None,
              **kwargs):
        '''Perform a query on backend.

        The caller's ``q``, ``terms`` and ``should_terms`` objects are never
        modified: a dict ``q`` is deep-copied before being reshaped, and
        scalar term values are wrapped in lists locally.

        :param q: maps to query_string parameter if string, or query dict if dict.
            A dict must contain a "query" key; it is wrapped in a bool/must
            clause if it is not already a bool query.
        :param terms: dictionary of terms to filter on. values should be lists
            (a single non-list value is treated as a one-element list).
            Each value becomes a ``term`` clause under ``must``.
        :param should_terms: dictionary of field to value(s); each field is
            added as a ``terms`` clause to the query's ``must`` list.
        :param facets: dict of facets to return from the query.
        :param conn: connection to search through; defaults to ``cls.__conn__``.
        :param types: passed through ``cls.get_read_types`` to resolve the types searched.
        :param kwargs: any keyword args as per
            http://www.elasticsearch.org/guide/reference/api/search/uri-request.html
            (``_from`` is sent as ``from``; every other key is set on the query body as-is).
        :return: the result of ``.json()`` on the search response.
        '''
        import copy  # local import: only needed to protect the caller's query dict

        if conn is None:
            conn = cls.__conn__

        types = cls.get_read_types(types)

        if isinstance(q, dict):
            # work on a private copy so reshaping below never leaks into the caller's dict
            query = copy.deepcopy(q)
            if 'bool' not in query['query']:
                query['query'] = {'bool': {'must': [query['query']]}}
            if 'must' not in query['query']['bool']:
                query['query']['bool']['must'] = []
        elif q:
            query = {
                'query': {
                    'bool': {
                        'must': [{
                            'query_string': {
                                'query': q
                            }
                        }]
                    }
                }
            }
        else:
            query = {'query': {'bool': {'must': [{'match_all': {}}]}}}

        if facets:
            if 'facets' not in query:
                query['facets'] = {}
            for k, v in facets.items():
                query['facets'][k] = {"terms": v}

        if terms:
            must = []
            for field in terms:
                values = terms[field]
                if not isinstance(values, list):
                    values = [values]
                for val in values:
                    must.append({'term': {field: val}})
            # re-attach the original query next to the term filters
            if q and not isinstance(q, dict):
                must.append({'query_string': {'query': q}})
            elif q and 'query' in q:
                must.append(query['query'])
            query['query'] = {'bool': {'must': must}}

        for k, v in kwargs.items():
            if k == '_from':
                query['from'] = v
            else:
                query[k] = v

        if should_terms is not None and len(should_terms) > 0:
            for s in should_terms:
                values = should_terms[s]
                if not isinstance(values, list):
                    values = [values]
                query["query"]["bool"]["must"].append(
                    {"terms": {
                        s: values
                    }})

        r = raw.search(conn, types, query)
        return r.json()
Example #9
0
    def query(cls, q='', terms=None, should_terms=None, facets=None, conn=None, types=None, **kwargs):
        """ Perform a query on backend.

        The caller's ``q``, ``terms`` and ``should_terms`` objects are never
        modified: a dict ``q`` is deep-copied before being reshaped, and
        scalar term values are wrapped in lists locally.

        :param q: maps to query_string parameter if string, or query dict if dict.
            A dict must contain a "query" key; it is wrapped in a bool/must
            clause if it is not already a bool query.
        :param terms: dictionary of terms to filter on. values should be lists
            (a single non-list value is treated as a one-element list).
            Each value becomes a ``term`` clause under ``must``.
        :param should_terms: dictionary of field to value(s); each field is
            added as a ``terms`` clause to the query's ``must`` list.
        :param facets: dict of facets to return from the query.
        :param conn: connection to search through; defaults to ``cls.__conn__``.
        :param types: passed through ``cls.get_read_types`` to resolve the types searched.
        :param kwargs: any keyword args as per
            http://www.elasticsearch.org/guide/reference/api/search/uri-request.html
            (``_from`` is sent as ``from``; every other key is set on the query body as-is).
        :return: the result of ``.json()`` on the search response.
        """
        import copy  # local import: only needed to protect the caller's query dict

        if conn is None:
            conn = cls.__conn__

        types = cls.get_read_types(types)

        if isinstance(q, dict):
            # work on a private copy so reshaping below never leaks into the caller's dict
            query = copy.deepcopy(q)
            if 'bool' not in query['query']:
                query['query'] = {'bool': {'must': [query['query']]}}
            if 'must' not in query['query']['bool']:
                query['query']['bool']['must'] = []
        elif q:
            query = {
                'query': {
                    'bool': {
                        'must': [
                            {'query_string': {'query': q}}
                        ]
                    }
                }
            }
        else:
            query = {
                'query': {
                    'bool': {
                        'must': [
                            {'match_all': {}}
                        ]
                    }
                }
            }

        if facets:
            if 'facets' not in query:
                query['facets'] = {}
            for k, v in facets.items():
                query['facets'][k] = {"terms": v}

        if terms:
            must = []
            for field in terms:
                values = terms[field]
                if not isinstance(values, list):
                    values = [values]
                for val in values:
                    must.append({'term': {field: val}})
            # re-attach the original query next to the term filters
            if q and not isinstance(q, dict):
                must.append({'query_string': {'query': q}})
            elif q and 'query' in q:
                must.append(query['query'])
            query['query'] = {'bool': {'must': must}}

        for k, v in kwargs.items():
            if k == '_from':
                query['from'] = v
            else:
                query[k] = v

        if should_terms is not None and len(should_terms) > 0:
            for s in should_terms:
                values = should_terms[s]
                if not isinstance(values, list):
                    values = [values]
                query["query"]["bool"]["must"].append({"terms": {s: values}})

        r = raw.search(conn, types, query)
        return r.json()
Example #10
0
from esprit import tasks, raw

'''Delete all rejected applications in DOAJ'''

# Connection to the ES index
conn = raw.make_connection(None, 'localhost', 9200, 'doaj')

# Term query selecting every suggestion whose application status is "rejected"
rej_query = {
            "query" : {
                "term" : {
                    "admin.application_status.exact" : "rejected"
                    }
                }
            }

json_writer = tasks.JSONListWriter('rejected_applications.json')

# Dump all rejected suggestions to file. The writer is closed even when the
# dump fails; a failed dump still propagates, so nothing is deleted without
# a completed archive.
try:
    tasks.dump(conn, 'suggestion', q=rej_query.copy(), out=json_writer)
finally:
    json_writer.close()

# Ask how many rejected applications will be deleted.
n_deleted = raw.search(conn, 'suggestion', rej_query).json()['hits']['total']

# Delete all rejected suggestions.
raw.delete_by_query(conn, 'suggestion', rej_query)

# single-argument print call: same output under Python 2 and Python 3
print("\n{0} suggestions archived to file 'rejected_applications.json' and deleted from index.".format(n_deleted))