Example #1
class ESPipeline(object):
    def __init__(self, *args, **kwargs):
        self.client = ElasticSearch('http://localhost:9200/')

    def process_item(self, item, spider):
        self.client.index('wiki', 'page', dict(item))
        return item
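For context: a Scrapy item pipeline like this only runs once it is registered in the project settings. A minimal sketch, assuming the class lives in a hypothetical myproject/pipelines.py:

# settings.py of the Scrapy project (the module path is an assumption)
ITEM_PIPELINES = {
    'myproject.pipelines.ESPipeline': 300,
}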
Example #2
    def set_in_index(self, documentList):
        """
        Store the list of documents in the Elasticsearch index via HTTP APIs

        @type  documentList: List
        @param documentList: List of image layer JSON documents
        """
        #Get the Elasticsearch address from the config file
        cfg = config.load()

        #Store the document list in Elasticsearch
        es = ElasticSearch(cfg.search_options.get("address"))
        try:
            es.bulk_index(cfg.search_options.get("index"),
                          cfg.search_options.get("type"),
                          documentList,
                          id_field='id')
        except InvalidJsonResponseError:
            logger.debug("InvalidJsonResponseError!")
        except Timeout:
            logger.debug("Timeout!")
        except ConnectionError:
            logger.debug("ConnectionError!")
        except ElasticHttpNotFoundError:
            logger.debug("ElasticHttpNotFoundError!")
        except ElasticHttpError:
            logger.debug("ElasticHttpError!")
Example #3
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and  '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type)
        hits = res['hits']
        print 'Document found: %d' % hits['total']
        return hits['hits']
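A hypothetical invocation of the function above (the field name and terms are made up); it returns the raw hit dicts, so the document ids come back as '_id':

# assumes an Elasticsearch instance is reachable at the configured address
hits = search('text', ['elasticsearch', 'crawler'])
for hit in hits:
    print hit['_id']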
Example #4
class ESLayers(object):
    """Implementation of Elastic Search as layers backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def _transform(self, layer, version, layer_name):
        """Add some meta data fields which are ES specific"""
        layer = dict(layer)  # copy
        label = layer['label']
        del layer['label']
        return {
            'id': '%s/%s/%s' % (version, layer_name, label),
            'version': version,
            'name': layer_name,
            'label': label,
            'layer': layer
        }

    def bulk_put(self, layers, version, layer_name, root_label):
        """Store all layer objects"""
        self.es.bulk_index(
            settings.ELASTIC_SEARCH_INDEX, 'layer',
            map(lambda l: self._transform(l, version, layer_name), layers))

    def get(self, name, label, version):
        """Find the layer that matches these parameters"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'layer',
                                 version + '/' + name + '/' + label)

            return result['_source']['layer']
        except ElasticHttpNotFoundError:
            return None
Example #5
    def __init__(self,
                 es_url='http://localhost:9200/',
                 batch_size=10,
                 **kwargs):
        """
        Do what is necessary to create/open the index.
        """
        self.batch_size = batch_size
        self.batch_count = 0
        self.es_url = es_url
        self.fast = kwargs.get('fast', False)
        if kwargs.get('noisy', False):
            from logging import getLogger, StreamHandler, DEBUG
            import sys
            logger = getLogger('pyelasticsearch')
            logger.setLevel(DEBUG)
            logger.addHandler(StreamHandler(sys.stdout))

        self.es = ElasticSearch(self.es_url)
        try:
            self.es.count('*')
        except ConnectionError:
            print "Error connecting to ElasticSearch server!"
            raise
        self.urls = defaultdict(set)  # track urls to be deleted before committing new content
        self.batches = defaultdict(list)  # site: [list of docs]
Example #6
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']
        
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields" : [field],
                    "query": ' and  '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #7
class ESLayers(object):
    """Implementation of Elastic Search as layers backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def _transform(self, layer, version, layer_name):
        """Add some meta data fields which are ES specific"""
        layer = dict(layer)     # copy
        label = layer['label']
        del layer['label']
        return {
            'id': '%s/%s/%s' % (version, layer_name, label),
            'version': version,
            'name': layer_name,
            'label': label,
            'layer': layer
        }

    def bulk_put(self, layers, version, layer_name, root_label):
        """Store all layer objects"""
        self.es.bulk_index(
            settings.ELASTIC_SEARCH_INDEX, 'layer',
            map(lambda l: self._transform(l, version, layer_name),
                layers))

    def get(self, name, label, version):
        """Find the layer that matches these parameters"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'layer',
                                 version + '/' + name + '/' + label)

            return result['_source']['layer']
        except ElasticHttpNotFoundError:
            return None
Example #8
class ESDiffs(object):
    """Implementation of Elastic Search as diff backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    @staticmethod
    def to_id(label, old, new):
        return "%s/%s/%s" % (label, old, new)

    def put(self, label, old_version, new_version, diff):
        """Store a diff between two versions of a regulation node"""
        struct = {
            'label': label,
            'old_version': old_version,
            'new_version': new_version,
            'diff': diff
        }
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'diff', struct,
                      id=self.to_id(label, old_version, new_version))

    def get(self, label, old_version, new_version):
        """Find the associated diff"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'diff',
                                 self.to_id(label, old_version, new_version))
            return result['_source']['diff']
        except ElasticHttpNotFoundError:
            return None
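A round trip through this backend might look like the following sketch (the label, versions, and diff payload are illustrative):

backend = ESDiffs()
backend.put('1005-2', '2011-1', '2012-3', {'some': 'diff'})
# returns the stored diff dict, or None if nothing is indexed under that id
diff = backend.get('1005-2', '2011-1', '2012-3')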
Example #9
def get_image(url):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX']
                        if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE']
                        if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')

        hits = res['hits']['hits']
        if (len(hits) > 0):
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"
    return [None, None]
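Because the function returns a [name, bytes] pair, a caller can write the decoded thumbnail straight to disk; a sketch with a made-up URL:

img_name, img = get_image('http://example.com/somepage.html')
if img is not None:
    with open(img_name, 'wb') as out:
        out.write(img)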
Example #10
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and  '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #11
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match": "100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX']
                        if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE']
                        if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)

        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #12
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)

            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_part': part}}
        else:
            query = {'match_all': {}}
        query = {'fields': ['effective_on', 'fr_url', 'publication_date'],
                 'query': query}
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
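listing() can return all notices or be narrowed to a single CFR part; a hypothetical call (the part number is made up):

for fields in ESNotices().listing(part='1026'):
    print fields['document_number']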
Example #13
    def set_in_index(self, documentList):
        """
        Store the list of documents in the Elasticsearch index via HTTP APIs

        @type  documentList: List
        @param documentList: List of image layer JSON documents
        """
        #Get the Elasticsearch address from the config file
        cfg = config.load()

        #Store the document list in Elasticsearch
        es = ElasticSearch(cfg.search_options.get("address"))
        try:
            es.bulk_index(cfg.search_options.get("index"), cfg.search_options.get("type"), documentList, id_field='id')
        except InvalidJsonResponseError:
            logger.debug("InvalidJsonResponseError!")
        except Timeout:
            logger.debug("Timeout!")
        except ConnectionError:
            logger.debug("ConnectionError!")
        except ElasticHttpNotFoundError:
            logger.debug("ElasticHttpNotFoundError!")
        except ElasticHttpError:
            logger.debug("ElasticHttpError!")
Example #14
def get_image(url):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query, 
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')

        hits = res['hits']['hits']
        if (len(hits) > 0):
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"
    return [None, None]
Example #15
def get_documents(terms, term_field, fields=["text"], es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch('http://localhost:9200/')

    results = {}

    for term in terms:
        query = {
            "query": {
                "term": {
                    term_field: term
                }
            },
            "fields": fields
        }

        res = es.search(query,
                        index=es_index,
                        doc_type=es_doc_type)

        if res['hits']['hits']:
            hits = res['hits']['hits'][0]

            if hits.get('fields') is not None:
                hits = hits['fields']
                record = {}
                for field in fields:
                    if hits.get(field) is not None:
                        record[field] = hits[field][0]
                results[term] = record

    return results
Example #16
    def _get(self):
        """Build and run the ES query
        """
        opts = self.opts

        es = ElasticSearch(opts.url)
        query = {'sort': {'@timestamp': 'desc'},
                 'size': 1}

        if opts.query:
            query['query'] = {
                'filtered': {
                    'query': {
                        'query_string': {
                            'query': opts.query
                        }
                    }
                }
            }

        # ElasticSearch allows us to pass an array of indices. However,
        # it will throw an exception if any of these don't exist. This
        # isn't the right behavior, because there may not actually be
        # a logstash index from X days ago. Instead, we need to iterate
        # through the daily log indexes in reverse order until we get a
        # non-error response.
        result = None
        for index in self._indexes():
            try:
                result = es.search(query, index=index)
                break
            except ElasticHttpNotFoundError:
                pass
Example #17
class ElasticSearchBackend(BaseBackend):
    
    def __init__(self, es_url='http://localhost:9200/', batch_size=10, **kwargs):
        """
        Do what is necessary to create/open the index.
        """
        self.batch_size = batch_size
        self.batch_count = 0
        self.es_url = es_url
        self.fast = kwargs.get('fast', False)
        if kwargs.get('noisy', False):
            from logging import getLogger, StreamHandler, DEBUG
            import sys
            logger = getLogger('pyelasticsearch')
            logger.setLevel(DEBUG)
            logger.addHandler(StreamHandler(sys.stdout))
            
        self.es = ElasticSearch(self.es_url)
        try:
            self.es.count('*')
        except ConnectionError:
            print "Error connecting to ElasticSearch server!"
            raise
        self.urls = defaultdict(set)  # track urls to be deleted before committing new content
        self.batches = defaultdict(list)  # site: [list of docs]
    
    def create_index(self, name):
        name = name.lower()
        try:
            self.es.create_index(name)
            self.update_mapping(name)
        except Exception, e:
            print e
            return
Example #18
def main():
    """
    Method to kick things off
    """

    # Setup workers
    pool = Pool(processes=CPU_COUNT)

    # Prepare URLs
    urls = []
    for url in CRAWL_URLS:
        urls.append(str(BASE_URL + url))

    if USE_ES:
        # Create connection
        es = ElasticSearch(ES_URL)

        try:
            # Delete the existing index
            es.delete_index(ES_INDEX)
        except:
            # In case the index does not exist
            pass

        # Create the index to use
        es.create_index(ES_INDEX)

    else:
        # Setup the database tables, connect
        init_db()

    # Scrape and store async
    pool.map(scrape, urls)
Example #19
def main():
    # Train the Naive Bayes classifier
    f = open('./data_set/naivebayes_trained_model.pickle')
    NBClassifier = pickle.load(f)

    # Elasticsearch: call es_indexer to create the 'sentiment_analysis' index
    # and store the contents of the tweet file in that index
    es = ElasticSearch('http://localhost:9200/')
    es_indexer()

    # Index each classified tweet into Elasticsearch
    i = 0
    for each in tweet_data():
        i += 1
        testTweet = each
        processedTestTweet = process_tweet(testTweet)
        sentiment = NBClassifier.classify(
            extract_features(build_feature_vector(processedTestTweet)))

        es.index("sentiment_analysis", "document", {
            "text": testTweet,
            "sentiment": sentiment
        }, id=i)
    print "Indexing completed."

    es.refresh(index="sentiment_analysis")
    print "Index refreshed."

    f.close()
Example #20
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": queryStr,
                        "operator": "and"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query, index='memex', doc_type='page', size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #21
def get_documents(urls):
    host =  environ['ELASTICSEARCH_SERVER'] if environ.get('ELASTICSEARCH_SERVER') else 'http://localhost:9200'
    es = ElasticSearch(host)
        
    if len(urls) > 0:
        results = {}

        for url in urls:
            query = {
                "query": {
                    "term": {
                        "url": url
                    }
                },
                "fields": ["text"]
            }
        
            res = es.search(query, 
                            index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 
                            doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
            hits = res['hits']
            try:
                results[url] = hits['hits'][0]['fields']['text'][0]
            except KeyError, e:
                print url, e, " not found in database"
            except IndexError, e:
                print url, e, " not found in database"

        return results
Example #22
def get_image(url, output_path=""):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if output_path:
        output_path = output_path+'/'

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail"]
        }
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']
        if (len(hits) > 0):
            img = base64.b64decode(hits['hits'][0]['fields']['thumbnail'][0])
            with open(output_path+urllib2.quote(url).replace("/", "%2F")+'.png','wb') as f:
                f.write(img)
        else:
            print "No thumbnail found"
Example #23
def range(field, from_val, to_val, ret_fields=[], epoch=None, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if epoch:
        from_val = datetime.utcfromtimestamp(long(from_val)).strftime('%Y-%m-%dT%H:%M:%S')
        to_val = datetime.utcfromtimestamp(long(to_val)).strftime('%Y-%m-%dT%H:%M:%S')

    query = {
        "query": {
            "range": {
                field: {
                    "from": from_val,
                    "to": to_val
                }
            }
        },
        "fields": ret_fields
    }

    res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
    hits = res['hits']['hits']

    results = []
    for hit in hits:
        results.append(hit['fields'])

    return results
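When epoch is truthy, the bounds are converted from Unix seconds into the ISO timestamps Elasticsearch expects. An illustrative call (the field name and timestamps are made up); note that the helper shadows the builtin range within its module:

docs = range('retrieved', 1420070400, 1422748800, ret_fields=['url'], epoch=True)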
Example #24
def search_posts_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')

    #for result in es.search("_type:post", index=token.lower())['hits']['hits']:
    #    print result["_source"]

    print es.search("id:sdifhsdihf", index="caacedeose0cban4zbmltsbcyxgzbzfrvq7uiqksk1uxep0njzgza7jtxei59ekp1izcjbg9czbum5qm0ojjuekaa3vwnn8tnxezcplgyaa2esvpi1dzcycai6xyvfwbrzco8quwns9orejsbecktw738yglnevljlqeascfgdfc0xdrjc1s0n40uun4ypytklsjarzand9gtfazdzd")
Example #25
def analyze_post(token, text):
    response = {
        'post_now': False,
        'hours_to_wait': 1,
        'total_score': 0,
        'time_score': 0,
        'text_score': 0,
        'hint': "Building index",
    }

    try:
        data = Newsfeed.filter_only_posts_by_people(token)

    except Exception, e:
        es = ElasticSearch('http://localhost:9200/')

        try:
            es.create_index(token.lower())
            Newsfeed.newsfeed(token, [], 0, None, 1)

            t = threading.Thread(target=Newsfeed.newsfeed, args=(token, [], 0, None, 1500))
            t.setDaemon(True)
            t.start()

        except Exception, e:
            print e.message
Example #26
def get_context(terms):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(terms) > 0:

        query = {
            "query": {
                "match": {
                    "text": {
                        "query": ' and '.join(terms),
                        "operator": "and"
                    }
                }
            },
            "highlight": {
                "fields": {
                    "text": {
                        "fragment_size": 100, "number_of_fragments": 1
                    }
                }
            }
        }
        print query
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']
        print 'Document found: %d' % hits['total']
        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'])
        return highlights
Example #27
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query" : {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match":"100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query, 
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)

        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #28
def _check_es_health(product, env):
    if product is not None:
        hosts = [_url_for_host(env)]
    else:
        logging.info(
            "No product specified ; Checking health of all Elasticsearch hosts for env '%s'\n"
            % env)
        all_hosts = set(product_host.values())
        hosts = []
        for host in all_hosts:
            hosts.append(_url_for_host(env, host))

    es = ElasticSearch(hosts, port=port)

    # Add check on elasticsearch health
    health = es.health()

    if health['status'] == 'red':
        logging.error(
            "Elasticsearch status is red. Search will hang. Exiting\n")
        sys.exit(-1)
    elif health['status'] == 'yellow':
        logging.warning(
            'Elasticsearch status is yellow. Search quality will be degraded\n'
        )
Example #29
class ESDiffs(object):
    """Implementation of Elastic Search as diff backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    @staticmethod
    def to_id(label, old, new):
        return "%s/%s/%s" % (label, old, new)

    def put(self, label, old_version, new_version, diff):
        """Store a diff between two versions of a regulation node"""
        struct = {
            'label': label,
            'old_version': old_version,
            'new_version': new_version,
            'diff': diff
        }
        self.es.index(settings.ELASTIC_SEARCH_INDEX,
                      'diff',
                      struct,
                      id=self.to_id(label, old_version, new_version))

    def get(self, label, old_version, new_version):
        """Find the associated diff"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'diff',
                                 self.to_id(label, old_version, new_version))
            return result['_source']['diff']
        except ElasticHttpNotFoundError:
            return None
Example #30
def search(request, doc_type, search_args):
    """Search elastic search for any matches in the node's text"""
    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': search_args.page * search_args.page_size,
        'size': search_args.page_size,
    }
    text_match = {'match': {'text': search_args.q, 'doc_type': doc_type}}
    if search_args.version or search_args.regulation:
        term = {}
        if search_args.version:
            term['version'] = search_args.version
        if search_args.regulation:
            term['regulation'] = search_args.regulation
        if search_args.is_root is not None:
            term['is_root'] = search_args.is_root
        if search_args.is_subpart is not None:
            term['is_subpart'] = search_args.is_subpart
        query['query'] = {'filtered': {
            'query': text_match,
            'filter': {'term': term}
        }}
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields'] for h in
                                      results['hits']['hits']])
    })
Example #31
def get_image(url, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)

        hits = res['hits']['hits']
        if (len(hits) > 0):
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"
    return [None, None]
Example #32
    def search(q):
        """
        Implement search method with ElasticSearch
        """

        # Create connection
        es = ElasticSearch(ES_URL)

        # Get results from index
        results = es.search(
            {
                "query": {
                    "query_string": {
                        "query": q
                    }
                }
            },
            index=[ES_INDEX],
            doc_type=['watch']
        )

        return {
            'count': results['hits']['total'],
            'results': [
                hh.get('_source') for hh in results['hits']['hits']
            ]
        }
Example #33
def get_context(terms, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if len(terms) > 0:
        query = {
            "query": {
                "match": {
                    "text": {
                        "query": ' and '.join(terms),
                        "operator": "and"
                    }
                }
            },
            "highlight": {
                "fields": {
                    "text": {
                        "fragment_size": 100, "number_of_fragments": 1
                    }
                }
            }
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']

        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'][0])
        return highlights
Example #34
def get_elasticsearch_connection():
    es_conn = env.get_service(label='elasticsearch-swarm-1.7.1')
    if es_conn:
        es = ElasticSearch(es_conn.get_url(url='uri'))
    else:
        es = ElasticSearch('http://localhost:9200')
    return es
Example #35
def get_available_domains(es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    query = {
        "query": {
            "match_all": {}
        },
    }
    res = es.search(query,
                    index='config',
                    doc_type='domains',
                    size=100)

    hits = res['hits']['hits']

    res = []
    for hit in hits:
        res.append(hit['_source'])

    for i in range(0,len(res)):
        res[i]['timestamp'] = long(convert_to_epoch(datetime.strptime(res[i]['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')))
        print datetime.utcfromtimestamp(res[i]['timestamp'])
    return res
Example #36
class ItvacaturesParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "it-vacatures"
        # bind Elasticsearch to es
        self.es = ElasticSearch("http://localhost:9200/")

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parseWerkgever(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        p = re.compile(r"<.*?>")
        werkgever = p.sub("", str(infoTwee))
        return werkgever

    def parseLocatie(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        locatieEen = infoTwee.find_next()
        p = re.compile(r"<.*?>")
        locatieTwee = p.sub("", str(locatieEen))
        p = re.compile(r"Locatie")
        locatie = p.sub("", str(locatieTwee))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "job-description"})
        p = re.compile(r"<.*?>")
        inhoud = p.sub("", str(body))
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse
        titel = self.parseTitel(soup)
        try:
            werkgever = self.parseWerkgever(soup)
        except:
            werkgever = "-"
        try:
            locatie = self.parseLocatie(soup)
        except:
            locatie = "-"
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r"(?s)/\*.*\*/", "", websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # generate id (string)
        id = self.website + "-" + re.sub(r"\W+", "", titel)

        # build the document to be sent to the Elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website, datum, werkgever, locatie, "-", inhoud)

        # index (store) the vacancy in the ES database
        self.es.index("vacature-index", "vacature", document, id=document["id"])
        print "Es: " + titel
Example #37
    def __init__(self, config=None, es_instance=None):
        if es_instance:
            self.es = es_instance
        else:
            self.config = config
            self.excludes_fields = self.config['excludes_fields']
            self.es = ElasticSearch('http://{host}:{port}/'.format(
                host=self.config['host'], port=self.config['port']))
Example #38
def _query_applications(indices):
    hosts = [_url_for_host(env)]

    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices, query_params={'ignore_unavailable':'true'})

    applications = map((lambda result: result['key']), es_results['aggregations']['applications']['buckets'])
    return applications
Example #39
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]

    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)

    applications = map((lambda result: result['term']), es_results['facets']['applications']['terms'])
    return applications
Example #40
class IitjobsParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "iitjobs"
        # bind Elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        titel = titel.strip()
        return titel

    def parseWerkgever(self, soup):
        body = soup.find(
            "span",
            {"id": "ctl00_middleContent_idShowJobDetails_lblCompanyName"})
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(body))
        werkgever = werkgever.strip()
        return werkgever

    def parseLocatie(self, soup):
        body = soup.find(
            "span",
            {"id": "ctl00_middleContent_idShowJobDetails_lblCountryID"})
        p = re.compile(r'<.*?>')
        locatie = p.sub('', str(body))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "divJobDescrip"})
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        inhoud = inhoud.strip()
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse
        titel = self.parseTitel(soup)
        werkgever = self.parseWerkgever(soup)
        locatie = self.parseLocatie(soup)
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # generate id for website (string)
        id = self.website + "-" + re.sub(r'\W+', '', titel)

        # build the document to be sent to the Elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)
        # index (store) the vacancy in the ES database
        self.es.index('vacature-index',
                      'vacature',
                      document,
                      id=document['id'])
        print('Es: ' + titel)
Example #41
def query(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    query = {"query": {"bool": {}}}

    # Build the query
    dict_value = dict(request.POST)
    # take the (last) submitted value of the 'query' parameter
    key = dict_value['query'][-1]

    value = ast.literal_eval(key)
    AndQueries = []
    OrQueries = []

    for index, key in enumerate(value['exact_query']):
        if key['condition'] == 'is equal to':
            query_values = {"term": {key['column']: key['value']}}
        if key['condition'] == 'is less than':
            query_values = {"range": {key['column']: {"lt": key['value']}}}
        if key['condition'] == 'is greater than':
            query_values = {"range": {key['column']: {"gt": key['value']}}}
        if key['condition'] == 'is less than or equal to':
            query_values = {"range": {key['column']: {"lte": key['value']}}}
        if key['condition'] == 'is greater than or equal to':
            query_values = {"range": {key['column']: {"gte": key['value']}}}
        if key['condition'] == 'is not equal to':
            query_values = {
                "must_not": {
                    "term": {
                        key['column']: key['value']
                    }
                }
            }

        if key['operation'] == 'and':
            AndQueries.append(query_values)
        if key['operation'] == 'or':
            OrQueries.append(query_values)
        if key['operation'] == '':
            if index < (len(value['exact_query']) - 1):
                next_value = value['exact_query'][index + 1]
                if next_value['operation'] == 'and':
                    AndQueries.append(query_values)
                if next_value['operation'] == 'or':
                    OrQueries.append(query_values)
            else:
                query['query']['bool']['must'] = query_values

    if len(AndQueries) != 0:
        query['query']['bool']['must'] = AndQueries
    if len(OrQueries) != 0:
        query['query']['bool']['should'] = OrQueries

    results = es.search(query, index=dict_value['index'][0], size=10000)
    return HttpResponse(json.dumps({
        'success': "Added successfully",
        'results': results
    }),
                        content_type="application/json")
Example #42
    def get_posts_elasticsearch(token):
        es = ElasticSearch('http://localhost:9200/')

        r = []

        for result in es.search("_type:post", index=token.lower(), size=1000)['hits']['hits']:
            r.append(result["_source"])

        return r
Example #43
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    es.bulk([es.index_op(doc) for doc in entries],
            index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex', 
            doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
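Each entry becomes one action in a single bulk request via index_op. A hypothetical call with made-up documents:

add_document([
    {'url': 'http://example.com/a', 'text': 'first page'},
    {'url': 'http://example.com/b', 'text': 'second page'},
])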
Example #44
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]

    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)

    applications = map((lambda result: result['term']),
                       es_results['facets']['applications']['terms'])
    return applications
Example #45
def search(request, doc_type):
    """Search elastic search for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    is_root = request.GET.get('is_root')
    is_subpart = request.GET.get('is_subpart')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0

    if not term:
        return user_error('No query term')
    if not validate_boolean(is_root):
        return user_error('Parameter "is_root" must be "true" or "false"')
    if not validate_boolean(is_subpart):
        return user_error('Parameter "is_subpart" must be "true" or "false"')

    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': page * PAGE_SIZE,
        'size': PAGE_SIZE,
    }
    text_match = {'match': {'text': term, 'doc_type': doc_type}}
    if version or regulation:
        term = {}
        if version:
            term['version'] = version
        if regulation:
            term['regulation'] = regulation
        if is_root:
            term['is_root'] = is_root
        if is_subpart:
            term['is_subpart'] = is_subpart
        query['query'] = {
            'filtered': {
                'query': text_match,
                'filter': {
                    'term': term
                }
            }
        }
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results(
            [h['fields'] for h in results['hits']['hits']])
    })
Example #46
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    es.bulk([es.index_op(doc) for doc in entries],
            index='memex',
            doc_type='page')
Example #47
    def query(self, query_dict, maxhits=10000):
        from pyelasticsearch import ElasticSearch
        if self.es_connection is None:
            es_server = self.server.rstrip('/') + ':9200'
            self.es_connection = ElasticSearch(es_server)
        results = self.es_connection.search(query_dict, index='encoded',
                                            doc_type=self.search_name,
                                            size=maxhits)
        return results
Example #48
def cli(index_name, delete_index, mapping_file, settings_file, doc_type, host,
        docs_per_chunk, bytes_per_chunk, parallel, quiet, parser, config_file,
        user, passwd):

    with open(config_file, "rb") as f:
        con = json.loads(f.read())
    host = con['es_config']['host']
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if con['db']['type'] == "oracle":
        db = import_module('cx_Oracle')
        collection = db.connect(user, passwd, con['db']['con_str'])
    else:
        db = import_module('MySQLdb')
        collection = db.connect(con['db']['con_str'][0],
                                user,
                                passwd,
                                con['db']['con_str'][1],
                                charset=con['db']['con_str'][2])

    if delete_index:  # delete the index
        try:
            stamp = 0
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        if settings_file:
            with open(settings_file, 'r') as f:
                settings_json = json.loads(f.read())
            es.create_index(index_name, settings=settings_json)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except Exception:
        echo('Index ' + index_name + ' already exists', quiet)

    echo('Using document type: ' + doc_type, quiet)

    es.put_mapping(index_name, doc_type, con['mapping'])

    parser_fun = None
    if parser is not None:
        # load the parser function
        parser_fun = import_module(PARSER_PATH + '.' + parser)

    documents = documents_from_file(es, collection, quiet, parser_fun, con)

    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
    print "end:" + time.strftime(
        ISOTIMEFORMAT, time.localtime()) + '/n all records import complete.'
Example #49
class ElasticSearchTestCase(unittest.TestCase):
    def setUp(self):
        self.conn = ElasticSearch('http://localhost:9200/')

    def tearDown(self):
        self.conn.delete_index("test-index")

    def assertResultContains(self, result, expected):
        for (key, value) in expected.items():
            self.assertEquals(value, result[key])
Example #50
def delete(config, tree_names, all, force):
    """Delete indices and their catalog entries.
    
    This deletes the indices that have the format version of the copy of DXR
    this runs under.
    
    """
    es = ElasticSearch(config.es_hosts)
    if all:
        echo('Deleting catalog...')
        es.delete_index(config.es_catalog_index)
        # TODO: Delete tree indices as well.
    else:
        for tree_name in tree_names:
            frozen_id = '%s/%s' % (FORMAT, tree_name)
            try:
                frozen = es.get(config.es_catalog_index, TREE, frozen_id)
            except ElasticHttpNotFoundError:
                raise ClickException('No tree "%s" in catalog.' % tree_name)
            # Delete the index first. That way, if that fails, we can still
            # try again; we won't have lost the catalog entry. Refresh is
            # infrequent enough that we wouldn't avoid a race around a
            # catalogued but deleted instance the other way around.
            try:
                es.delete_index(frozen['_source']['es_alias'])
            except ElasticHttpNotFoundError:
                # It's already gone. Fine. Just remove the catalog entry.
                pass
            es.delete(config.es_catalog_index, TREE, frozen_id)
Example #51
def IndexData(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    for file in fileHolder:
        index = file['segment_name'].lower()
        rawfiles = file['rawfiles']
        data_for_es = file['dataFrames']
        try:
            es.delete_index(index.replace(" ", ""))
        except:
            pass
    es.create_index(index.replace(" ", ""))

    ## Loop over the dataframe and index the records into Elasticsearch
    docs = json.loads(data_for_es.to_json(orient='records'))
    es.bulk((es.index_op(doc) for doc in docs),
            index=index.replace(" ", ""),
            doc_type=index)

    ## Create the segment template
    file_names = []
    for file in rawfiles:
        file_names.append(file.name)

    segment = Segments(name=index,
                       files_added=",".join(file_names),
                       es_index=index.replace(" ", ""))
    segment.save()

    segment = Segments.objects.get(name=index)

    return render(request, 'analyse.html', {'segment': segment})
Example #52
class IctergezochtParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "ictergezocht"
        # bind Elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseWerkgever(self, soup):
        info = soup.find(class_="highlight")
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(info))
        return werkgever

    def parseLocatie(self, soup):
        infoTwee = soup.find(class_="bf")
        locatieEen = infoTwee.find_next()
        locatieTwee = locatieEen.find_next()
        locatieDrie = locatieTwee.find_next()
        locatieVier = locatieDrie.find_next()
        p = re.compile(r'<.*?>')
        locatieVijf = p.sub('', str(locatieVier))
        p = re.compile(r'Locatie')
        locatie = p.sub('', str(locatieVijf))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find(class_="vacancybody")
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        return inhoud

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        titel = self.parseTitel(soup)
        if titel.startswith("Vacature"):
            # parse
            werkgever = self.parseWerkgever(soup)
            locatie = self.parseLocatie(soup)
            inhoud = self.parseInhoud(soup)
            websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
            datum = time.strftime("%d-%m-%Y")
            # generate id website (string)
            id = self.website + "-" + re.sub(r'\W+', '', titel)

            # build the document
            document = self.makeDocument(id, titel, websiteUrl, self.website, datum, werkgever, locatie, "-", inhoud)
            # index (store) the vacancy in the ES database
            self.es.index('vacature-index', 'vacature', document, id=document['id'])
            print "Es: " + titel
Example #53
def update_document(url, doc):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    try:
        es.update(index='memex', doc_type='page', id=url, script=doc)
    except:
        print "Unexpected error:", sys.exc_info()[0]
Example #54
def search(elastic_config, fqdn):
    pattern = elastic_config.index_pattern
    lookback = elastic_config.lookback
    indices = common.get_indexes(lookback, pattern)
    hosts = elastic_config.hosts
    port = elastic_config.port
    username = elastic_config.username
    password = elastic_config.password
    environment = elastic_config.environment
    es = ElasticSearch(hosts, port=port, username=username, password=password)
    #try:
    doc = es.search(common.build_query(fqdn, environment), index=indices)
    return doc, fqdn
Example #55
def update_process_datetime(doc_id, timestamp):
    '''Update the last_update_date for the document id passed into the function.
    The document id will be the name of another index in the cluster.
    '''
    connection_string = 'http://localhost:9200'
    process_index = 'openfdametadata'
    _type = 'last_run'
    _map = {
        _type: {
            'properties': {
                'last_update_date': {
                    'type': 'date',
                    'format': 'dateOptionalTime'
                }
            }
        }
    }

    es = ElasticSearch(connection_string)
    try:
        es.create_index(process_index)
        logging.info('Creating index %s', process_index)
    except exceptions.IndexAlreadyExistsError as e:
        logging.info('%s already exists', process_index)

    try:
        es.put_mapping(process_index, doc_type=_type, mapping=_map)
        logging.info('Successfully created mapping')
    except:
        logging.fatal('Could not create the mapping')

    new_doc = {}
    new_doc['last_update_date'] = timestamp
    es.index(process_index,
             doc_type=_type,
             id=doc_id,
             doc=new_doc,
             overwrite_existing=True)
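An illustrative call (the doc_id and timestamp are made up; the timestamp just has to parse under the dateOptionalTime format declared in the mapping):

update_process_datetime('drugevent', '2015-06-01')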
Example #56
    def send(self, messages):
        if self.type == '@type':
            self.type = messages[0].get('@type')
            logger.debug('Type is \'@type\' - setting it to %r', self.type)

        es = ElasticSearch('http://%s:%s' % (self.host, self.port))

        now = datetime.utcnow()
        index = now.strftime('logstash-%Y.%m.%d')

        result = es.bulk_index(index=index, doc_type=self.type, docs=messages)
        logger.debug('Elasticsearch bulk_index run returned with:\n\n%s\n',
                     pformat(result))
        return True
Example #57
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)

            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)  # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query,
                                index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree',
                                size=100)
        return sorted((res['fields']['version'], res['fields']['label_string'])
                      for res in result['hits']['hits'])