Example #1
def search_posts_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')

    #for result in es.search("_type:post", index=token.lower())['hits']['hits']:
    #    print result["_source"]

    print es.search("id:sdifhsdihf", index="caacedeose0cban4zbmltsbcyxgzbzfrvq7uiqksk1uxep0njzgza7jtxei59ekp1izcjbg9czbum5qm0ojjuekaa3vwnn8tnxezcplgyaa2esvpi1dzcycai6xyvfwbrzco8quwns9orejsbecktw738yglnevljlqeascfgdfc0xdrjc1s0n40uun4ypytklsjarzand9gtfazdzd")
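The commented-out loop is the usual way to walk the hits; a minimal runnable sketch of that pattern (assuming the same local node and pyelasticsearch client; the function name is illustrative, not from the original example):

def iter_posts(token):
    # sketch of the commented-out loop above, not part of the original example
    es = ElasticSearch('http://localhost:9200/')
    for result in es.search("_type:post", index=token.lower())['hits']['hits']:
        print result["_source"]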
Example #2
class GetSuggestions():
    '''
    Used to search elastic for prepids similar to the one given
    '''

    def __init__(self, typeof):
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 20
        self.announced = (typeof == 'announced')
        self.growing = (typeof == 'growing')
        self.historical = (typeof == 'historical')
        self.performance = (typeof == 'performance')

    def get(self, query):
        searchable = query.replace('-', '\-')
        if '-' in query:
            search = ('prepid:%s' % searchable)
            search_stats = ('pdmv_request_name:%s' % searchable)
        else:
            search = ('prepid:*%s*' % searchable)
            search_stats = ('pdmv_request_name:*%s*' % searchable)

        ext0 = []
        ext1 = []
        ext2 = []

        if (self.historical or self.growing or self.announced
            or self.performance):
            # campaigns are expected in all modes
            ext0 = [s['_id'] for s in
                    self.es.search(search, index='campaigns',
                                   size=self.overflow)['hits']['hits']]

            # extended search for historical
            if self.historical:
                ext1 = [s['_id'] for s in
                        self.es.search(search, index='requests',
                                       size=self.overflow)['hits']['hits']]

                ext2 = [s['_id'] for s in
                        self.es.search(search_stats, index='stats',
                                       size=self.overflow)['hits']['hits']]

            # extended search for growing
            if self.growing:
                ext1 = [s['_id'] for s in
                        self.es.search(search, index="chained_campaigns",
                                       size=self.overflow)['hits']['hits']]

                ext2 = [s['_id'] for s in
                        self.es.search(search, index="chained_requests",
                                       size=self.overflow)['hits']['hits']]

        # order of ext does matter because of the typeahead in bootstrap
        return json.dumps({"results": ext0 + ext1 + ext2})
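The escaping above handles only '-'. Lucene's query-string syntax reserves several more characters; a hedged helper (name and coverage are illustrative, not from the source project) could escape them all before building prepid queries:

import re

def escape_lucene(text):
    # illustrative: backslash-escape Lucene query-string special characters
    return re.sub(r'([+\-&|!(){}\[\]^"~*?:\\/])', r'\\\1', text)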
Example #3
def get_image(url, output_path=""):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if output_path:
        output_path = output_path+'/'

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail"]
        }
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']['hits']
        if len(hits) > 0:
            img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
            with open(output_path+urllib2.quote(url).replace("/", "%2F")+'.png','wb') as f:
                f.write(img)
        else:
            print "No thumbnail found"
Example #4
class ENC_Collection(object):
    def __init__(self, connection, supplied_name, frame="object"):
        if supplied_name.endswith("s"):
            self.name = supplied_name.replace("_", "-")
            self.search_name = supplied_name.rstrip("s").replace("-", "_")
            self.schema_name = self.search_name + ".json"
        elif supplied_name.endswith(".json"):
            self.name = supplied_name.replace("_", "-").rstrip(".json")
            self.search_name = supplied_name.replace("-", "_").rstrip(".json")
            self.schema_name = supplied_name
        else:
            self.name = supplied_name.replace("_", "-") + "s"
            self.search_name = supplied_name.replace("-", "_")
            self.schema_name = supplied_name.replace("-", "_") + ".json"
        schema_uri = "/profiles/" + self.schema_name
        self.connection = connection
        self.server = connection.server
        self.schema = get_ENCODE(schema_uri, connection)
        self.frame = frame
        search_string = "/search/?format=json&limit=all&type=%s&frame=%s" % (self.search_name, frame)
        collection = get_ENCODE(search_string, connection)
        self.items = collection["@graph"]
        self.es_connection = None

    def query(self, query_dict, maxhits=10000):
        from pyelasticsearch import ElasticSearch

        if self.es_connection is None:
            es_server = self.server.rstrip("/") + ":9200"
            self.es_connection = ElasticSearch(es_server)
        results = self.es_connection.search(query_dict, index="encoded", doc_type=self.search_name, size=maxhits)
        return results
Example #5
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and  '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
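Several of these examples repeat the same three environ.get fallbacks. A small consolidating helper, sketched under the same defaults (the function name is illustrative, not from the source projects):

from os import environ

def es_settings():
    # illustrative helper collapsing the repeated environment fallbacks
    return (environ.get('ELASTICSEARCH_SERVER', 'http://localhost:9200/'),
            environ.get('ELASTICSEARCH_INDEX', 'memex'),
            environ.get('ELASTICSEARCH_DOC_TYPE', 'page'))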
Example #6
def get_image(url):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX']
                        if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE']
                        if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')

        hits = res['hits']['hits']
        if (len(hits) > 0):
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"
    return [None, None]
Example #7
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match": "100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX']
                        if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE']
                        if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)

        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #8
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query" : {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match":"100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query, 
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)

        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #9
def get_image(url):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query, 
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')

        hits = res['hits']['hits']
        if (len(hits) > 0):
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"
    return [None, None]
Example #10
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']
        
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields" : [field],
                    "query": ' and  '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #11
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)

            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_part': part}}
        else:
            query = {'match_all': {}}
        query = {'fields': ['effective_on', 'fr_url', 'publication_date'],
                 'query': query}
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
Example #12
class ENC_Collection:
    def __init__(self, connection, supplied_name, frame='object'):
        if supplied_name.endswith('s'):
            self.name = supplied_name.replace('_', '-')
            # drop the trailing 's' (str.rstrip would strip a character set)
            self.search_name = supplied_name[:-1].replace('-', '_')
            self.schema_name = self.search_name + '.json'
        elif supplied_name.endswith('.json'):
            # slice off the extension; str.rstrip('.json') strips characters,
            # not a suffix
            self.name = supplied_name.replace('_', '-')[:-len('.json')]
            self.search_name = supplied_name.replace('-', '_')[:-len('.json')]
            self.schema_name = supplied_name
        else:
            self.name = supplied_name.replace('_', '-') + 's'
            self.search_name = supplied_name.replace('-', '_')
            self.schema_name = supplied_name.replace('-', '_') + '.json'
        schema_uri = '/profiles/' + self.schema_name
        self.connection = connection
        self.server = connection.server
        self.schema = get_ENCODE(schema_uri, connection)
        self.frame = frame
        search_string = '/search/?format=json&limit=all&type=%s&frame=%s' % (
            self.search_name, frame)
        collection = get_ENCODE(search_string, connection)
        self.items = collection['@graph']
        self.es_connection = None

    def query(self, query_dict, maxhits=10000):
        from pyelasticsearch import ElasticSearch
        if self.es_connection is None:
            es_server = self.server.rstrip('/') + ':9200'
            self.es_connection = ElasticSearch(es_server)
        results = self.es_connection.search(query_dict,
                                            index='encoded',
                                            doc_type=self.search_name,
                                            size=maxhits)
        return results
Example #13
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and  '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type)
        hits = res['hits']
        print 'Documents found: %d' % hits['total']
        return hits['hits']
Example #14
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": queryStr,
                        "operator": "and"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query, index='memex', doc_type='page', size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
Example #15
def get_context(terms):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(terms) > 0:

        query = {
            "query": { 
                "match": {
                    "text": {
                        "query": ' and  '.join(terms[0:]),
                        "operator" : "and"
                    }
                }
             },
            "highlight" : {
                "fields" : {
                    "text": {
                        "fragment_size" : 100, "number_of_fragments" : 1
                    }
                }
            }
        }
        print query
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']
        print 'Documents found: %d' % hits['total']
        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'])
        return highlights
Example #16
def get_available_domains(es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")
        
    query = {
        "query": {
            "match_all": {}
        },
    }
    res = es.search(query,
                    index='config',
                    doc_type='domains',
                    size=100)

    hits = res['hits']['hits']

    res = []
    for hit in hits:
        res.append(hit['_source'])

    for i in range(0, len(res)):
        res[i]['timestamp'] = long(convert_to_epoch(
            datetime.strptime(res[i]['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')))
        print datetime.utcfromtimestamp(res[i]['timestamp'])
    return res
Example #17
def get_context(terms, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if len(terms) > 0:
        query = {
            "query": { 
                "match": {
                    "text": {
                        "query": ' and  '.join(terms[0:]),
                        "operator" : "and"
                    }
                }
             },
            "highlight" : {
                "fields" : {
                    "text": {
                        "fragment_size" : 100, "number_of_fragments" : 1
                    }
                }
            }
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']

        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'][0])
        return highlights
Example #18
    def _get(self):
        """Build and run the ES query
        """
        opts = self.opts

        es = ElasticSearch(opts.url)
        query = {'sort': {'@timestamp': 'desc'},
                 'size': 1}

        if opts.query:
            query['query'] = {
                'filtered': {
                    'query': {
                        'query_string': {
                            'query': opts.query
                        }
                    }
                }
            }

        # ElasticSearch allows us to pass an array of indices. However,
        # it will throw an exception if any of these don't exist. This
        # isn't the right behavior, because there may not actually be
        # a logstash index from X days ago. Instead, we need to iterate
        # through the daily log indexes in reverse order until we get a
        # non-error response.
        result = None
        for index in self._indexes():
            try:
                result = es.search(query, index=index)
                break
            except ElasticHttpNotFoundError:
                pass
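The loop above calls a self._indexes() helper that the excerpt does not show. A plausible sketch, assuming daily logstash-YYYY.MM.DD index names and an opts.days lookback (both assumptions, not confirmed by the source):

    def _indexes(self):
        # assumed shape of the helper: newest daily logstash index first;
        # the real implementation is not shown in this excerpt
        import datetime
        today = datetime.date.today()
        for days_ago in range(getattr(self.opts, 'days', 7)):
            day = today - datetime.timedelta(days=days_ago)
            yield 'logstash-%s' % day.strftime('%Y.%m.%d')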
Example #19
def get_documents(urls):
    host =  environ['ELASTICSEARCH_SERVER'] if environ.get('ELASTICSEARCH_SERVER') else 'http://localhost:9200'
    es = ElasticSearch(host)
        
    if len(urls) > 0:
        results = {}

        for url in urls:
            query = {
                "query": {
                    "term": {
                        "url": url
                    }
                },
                "fields": ["text"]
            }
        
            res = es.search(query, 
                            index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 
                            doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
            hits = res['hits']
            try:
                results[url] = hits['hits'][0]['fields']['text'][0]
            except KeyError, e:
                print url, e, " not found in database"
            except IndexError, e:
                print url, e, " not found in database"

        return results
Example #20
def range(field, from_val, to_val, ret_fields=[], epoch=None, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if epoch:
        from_val = datetime.utcfromtimestamp(
            long(from_val)).strftime('%Y-%m-%dT%H:%M:%S')
        to_val = datetime.utcfromtimestamp(
            long(to_val)).strftime('%Y-%m-%dT%H:%M:%S')

    query = {
        "query": {
            "range": {
                field: {
                    "from": from_val,
                    "to": to_val
                }
            }
        },
        "fields": ret_fields
    }

    res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
    hits = res['hits']['hits']

    results=[]
    for hit in hits:
        results.append(hit['fields'])

    return results
Example #21
def get_image(url, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)

        hits = res['hits']['hits']
        if (len(hits) > 0):
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"
    return [None, None]
Example #22
def get_documents(terms, term_field, fields=["text"], es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch('http://localhost:9200/')

    results = {}

    if len(terms) > 0:
        for term in terms:
            query = {
                "query": {
                    "term": {
                        term_field: term
                    }
                },
                "fields": fields
            }

            res = es.search(query,
                            index=es_index,
                            doc_type=es_doc_type)

            if res['hits']['hits']:
                hits = res['hits']['hits'][0]

                if hits.get('fields') is not None:
                    hits = hits['fields']
                    record = {}
                    for field in fields:
                        if hits.get(field) is not None:
                            record[field] = hits[field][0]
                    results[term] = record

    return results
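A hypothetical call (the URLs are placeholders, not from the source): fetch the text field for a couple of documents keyed by url:

docs = get_documents(['http://example.com/a', 'http://example.com/b'],
                     'url', fields=['text'])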
Example #23
class ENC_Collection(object):
    def __init__(self, connection, supplied_name, frame='object'):
        if supplied_name.endswith('s'):
            self.name = supplied_name.replace('_', '-')
            # drop the trailing 's' (str.rstrip would strip a character set)
            self.search_name = supplied_name[:-1].replace('-', '_')
            self.schema_name = self.search_name + '.json'
        elif supplied_name.endswith('.json'):
            # slice off the extension; str.rstrip('.json') strips characters,
            # not a suffix
            self.name = supplied_name.replace('_', '-')[:-len('.json')]
            self.search_name = supplied_name.replace('-', '_')[:-len('.json')]
            self.schema_name = supplied_name
        else:
            self.name = supplied_name.replace('_', '-') + 's'
            self.search_name = supplied_name.replace('-', '_')
            self.schema_name = supplied_name.replace('-', '_') + '.json'
        schema_uri = '/profiles/' + self.schema_name
        self.connection = connection
        self.server = connection.server
        self.schema = get_ENCODE(schema_uri, connection)
        self.frame = frame
        search_string = '/search/?format=json&limit=all&\
                        type=%s&frame=%s' % (self.search_name, frame)
        collection = get_ENCODE(search_string, connection)
        self.items = collection['@graph']
        self.es_connection = None

    def query(self, query_dict, maxhits=10000):
        from pyelasticsearch import ElasticSearch
        if self.es_connection is None:
            es_server = self.server.rstrip('/') + ':9200'
            self.es_connection = ElasticSearch(es_server)
        results = self.es_connection.search(query_dict, index='encoded',
                                            doc_type=self.search_name,
                                            size=maxhits)
        return results
Example #24
    def search(q):
        """
        Implement search method with ElasticSearch
        """

        # Create connection
        es = ElasticSearch(ES_URL)

        # Get results from index
        results = es.search(
            {
                "query": {
                    "query_string": {
                        "query": q
                    }
                }
            },
            index=[ES_INDEX],
            doc_type=['watch']
        )

        return {
            'count': results['hits']['total'],
            'results': [
                hh.get('_source') for hh in results['hits']['hits']
            ]
        }
Example #25
def search(request, doc_type, search_args):
    """Search elastic search for any matches in the node's text"""
    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': search_args.page * search_args.page_size,
        'size': search_args.page_size,
    }
    text_match = {'match': {'text': search_args.q, 'doc_type': doc_type}}
    if search_args.version or search_args.regulation:
        term = {}
        if search_args.version:
            term['version'] = search_args.version
        if search_args.regulation:
            term['regulation'] = search_args.regulation
        if search_args.is_root is not None:
            term['is_root'] = search_args.is_root
        if search_args.is_subpart is not None:
            term['is_subpart'] = search_args.is_subpart
        query['query'] = {'filtered': {
            'query': text_match,
            'filter': {'term': term}
        }}
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields'] for h in
                                      results['hits']['hits']])
    })
Example #26
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]

    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)

    applications = map((lambda result: result['term']), es_results['facets']['applications']['terms'])
    return applications
Example #27
def _query_applications(indices):
    hosts = [_url_for_host(env)]

    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices, query_params={'ignore_unavailable':'true'})

    applications = map((lambda result: result['key']), es_results['aggregations']['applications']['buckets'])
    return applications
Example #28
def query(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    query = {"query": {"bool": {}}}

    #Building the query
    dict_value = dict(request.POST)
    for key in dict_value['query']:
        key = key

    value = ast.literal_eval(key)
    AndQueries = []
    OrQueries = []

    for index, key in enumerate(value['exact_query']):
        if key['condition'] == 'is equal to':
            query_values = {"term": {key['column']: key['value']}}
        if key['condition'] == 'is less than':
            query_values = {"range": {key['column']: {"lt": key['value']}}}
        if key['condition'] == 'is greater than':
            query_values = {"range": {key['column']: {"gt": key['value']}}}
        if key['condition'] == 'is less than or equal to':
            query_values = {"range": {key['column']: {"lte": key['value']}}}
        if key['condition'] == 'is greater than or equal to':
            query_values = {"range": {key['column']: {"gte": key['value']}}}
        if key['condition'] == 'is not equal to':
            query_values = {
                "must_not": {
                    "term": {
                        key['column']: key['value']
                    }
                }
            }

        if key['operation'] == 'and':
            AndQueries.append(query_values)
        if key['operation'] == 'or':
            OrQueries.append(query_values)
        if key['operation'] == '':
            if index < (len(value['exact_query']) - 1):
                next_value = value['exact_query'][index + 1]
                if next_value['operation'] == 'and':
                    AndQueries.append(query_values)
                if next_value['operation'] == 'or':
                    OrQueries.append(query_values)
            else:
                query['query']['bool']['must'] = query_values

    if len(AndQueries) != 0:
        query['query']['bool']['must'] = AndQueries
    if len(OrQueries) != 0:
        query['query']['bool']['should'] = OrQueries

    results = es.search(query, index=dict_value['index'][0], size=10000)
    return HttpResponse(json.dumps({
        'success': "Added successfully",
        'results': results
    }),
                        content_type="application/json")
Example #29
def cli(index_name, doc_type, file_name, size):
    """
    Export data from ElasticSearch to CSV file.

    \b
    Help:
        python es2csv.py --help

    \b
    Example:
        python es2csv.py --index-name=index_name --doc-type=typename
            --file-name=/tmp/save_file.csv
    """
    es = ElasticSearch(ES_CONF['host'])
    mapping = es.get_mapping(index=index_name, doc_type=doc_type)
    fieldnames = mapping[index_name]['mappings'][doc_type]['properties'].keys()
    print "Fields Total: %d" % len(fieldnames)

    writer = csv.writer(file(file_name, 'wb'), quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(fieldnames)
    print fieldnames

    data = es.search("*", index=index_name, doc_type=doc_type, size=1)
    total = data['hits']['total']
    print "Total: %d" % total

    size = 1000
    for es_from in range(0, total + 1, size):
        data = es.search("*",
                         index=index_name,
                         doc_type=doc_type,
                         es_from=es_from,
                         size=size)
        data = data['hits']['hits']
        format_data = []
        for row in data:
            for k in fieldnames:
                if k not in row['_source']:
                    row['_source'][k] = ''
            format_data.append([row['_source'][k] for k in fieldnames])

        writer.writerows(format_data)
        print "Saved count %d" % (es_from + size)

    print 'ok'
Example #30
class GetAnnounced():
    '''
    Used to return list of requests with some properties in a given campaign
    '''
    def __init__(self):
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 1000000

    def get(self, campaign):

        # change all to wildcard
        if campaign == 'all':
            campaign = '*'

        # get list of requests - field has to be not analyzed by es
        res = [s['_source'] for s in
               self.es.search(('member_of_campaign:%s' % campaign),
                              index='requests', size=self.overflow)
               ['hits']['hits']]

        # loop over and parse the db data
        for r in res:
            # requests that are done should have completed events value
            if r['status'] == 'done':
                r['total_events'] = r['completed_events']
                try:
                    # requests without output_dataset should have zero events
                    if not len(r['output_dataset']):
                        r['total_events'] = 0
                except KeyError:
                    r['total_events'] = 0
                    pass
            if r['status'] == 'submitted':
                try:
                    if not len(r['reqmgr_name']):
                        r['total_events'] = 0
                except KeyError:
                    r['total_events'] = 0
                    pass

            # requests that are new (-1) should have zero events
            if r['total_events'] == -1:
                r['total_events'] = 0

            if r['time_event'] == -1:
                r['time_event'] = 0

            # remove unnecessary fields to speed up api
            try:
                del r['completed_events']
                del r['reqmgr_name']
                del r['history']
                del r['output_dataset']
            except KeyError:
                print r['prepid']

        return json.dumps({"results": res})
Example #31
def fuzzysearch(name):
    es = ElasticSearch(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    query = {
        "query": {
            "fuzzy": {
                "_all": str(name)
            }
        }
    }
    res = es.search(query, index=settings.HAYSTACK_CONNECTIONS['default']['INDEX_NAME'])
    id_list = get_template_id(res) 
    return (Template.objects.filter(id=id).first() for id in id_list)
Example #32
    def get_posts_elasticsearch(token):
        es = ElasticSearch('http://localhost:9200/')

        r = []

        for result in es.search("_type:post", index=token.lower(), size=1000)['hits']['hits']:
            r.append(result["_source"])

        return r
Example #33
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]

    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)

    applications = map((lambda result: result['term']),
                       es_results['facets']['applications']['terms'])
    return applications
Example #34
def search(request, doc_type):
    """Search elastic search for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    is_root = request.GET.get('is_root')
    is_subpart = request.GET.get('is_subpart')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0

    if not term:
        return user_error('No query term')
    if not validate_boolean(is_root):
        return user_error('Parameter "is_root" must be "true" or "false"')
    if not validate_boolean(is_subpart):
        return user_error('Parameter "is_subpart" must be "true" or "false"')

    query = {
        'fields':
        ['text', 'label', 'version', 'regulation', 'title', 'label_string'],
        'from':
        page * PAGE_SIZE,
        'size':
        PAGE_SIZE,
    }
    text_match = {'match': {'text': term, 'doc_type': doc_type}}
    if version or regulation:
        term = {}
        if version:
            term['version'] = version
        if regulation:
            term['regulation'] = regulation
        if is_root:
            term['is_root'] = is_root
        if is_subpart:
            term['is_subpart'] = is_subpart
        query['query'] = {
            'filtered': {
                'query': text_match,
                'filter': {
                    'term': term
                }
            }
        }
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits':
        results['hits']['total'],
        'results':
        transform_results([h['fields'] for h in results['hits']['hits']])
    })
Example #35
def cli(index_name, doc_type, file_name, size):
    """
    Export data from ElasticSearch to CSV file.

    \b
    Help:
        python es2csv.py --help

    \b
    Example:
        python es2csv.py --index-name=index_name --doc-type=typename
            --file-name=/tmp/save_file.csv
    """
    es = ElasticSearch(ES_CONF['host'])
    mapping = es.get_mapping(index=index_name, doc_type=doc_type)
    fieldnames = mapping[index_name]['mappings'][doc_type]['properties'].keys()
    print "Fields Total: %d" % len(fieldnames)

    writer = csv.writer(file(file_name, 'wb'), quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(fieldnames)
    print fieldnames

    data = es.search("*", index=index_name, doc_type=doc_type, size=1)
    total = data['hits']['total']
    print "Total: %d" % total

    size = 1000
    for es_from in range(0, total+1, size):
        data = es.search("*", index=index_name, doc_type=doc_type,
                         es_from=es_from, size=size)
        data = data['hits']['hits']
        format_data = []
        for row in data:
            for k in fieldnames:
                if k not in row['_source']:
                    row['_source'][k] = ''
            format_data.append([row['_source'][k] for k in fieldnames])

        writer.writerows(format_data)
        print "Saved count %d" % (es_from + size)

    print 'ok'
Example #36
def dump_as_index_file():
    es = ElasticSearch(CONTEXT['datahub-store'])

    total = es.count("owner:public AND display:timeline",
                     index=CONTEXT['datahub-index'], doc_type='_all')

    series = es.search("owner:public AND display:timeline",
                       index=CONTEXT['datahub-index'],
                       size=total['count'], doc_type='_all')

    f = open(CONTEXT['correlation-index-path'], mode='w')

    for serie in series['hits']['hits']:
        f.write("%s;%s;%s;%s\n" % (
            serie['_id'],
            serie['_source']['name'],
            cjson.encode(serie['_source']['data']['series'][0]['data']),
            serie['_source']['category']))
    f.close()
Example #37
def search(elastic_config, fqdn):
    pattern = elastic_config.index_pattern
    lookback = elastic_config.lookback
    indices = common.get_indexes(lookback, pattern)
    hosts = elastic_config.hosts
    port = elastic_config.port
    username = elastic_config.username
    password = elastic_config.password
    environment = elastic_config.environment
    es = ElasticSearch(hosts, port=port, username=username, password=password)
    #try:
    doc = es.search(common.build_query(fqdn, environment), index=indices)
    return doc, fqdn
Example #39
class GetSuggestions():

    def __init__(self, typeof):
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 20
        self.lifetime = (typeof == 'lifetime')
        self.on = (typeof == 'true')

    def get(self, query):

        searchable = query.replace('-', '\-')

        if self.lifetime:

            if '-' in query:
                search_string = ('prepid:%s' % searchable)
                search_stats = ('pdmv_request_name:%s' % searchable)
            else:
                search_string = ('prepid:*%s*' % searchable)
                search_stats = ('pdmv_request_name:*%s*' % searchable)

            campa = [s['_id'] for s in
                     self.es.search(search_string, index='campaigns',
                                    size=self.overflow)['hits']['hits']]

            reque = [s['_id'] for s in
                     self.es.search(search_string, index='requests',
                                    size=self.overflow)['hits']['hits']]

            stats = [s['_id'] for s in
                     self.es.search(search_stats, index='stats',
                                    size=self.overflow)['hits']['hits']]

            return json.dumps({'results': campa + reque + stats})

        else:
            if '-' in query:
                search_string = ('prepid:%s' % searchable)
            else:
                search_string = ('prepid:*%s*' % searchable)

            if self.on:
                return json.dumps(
                    {"results": [s['_id'] for s in
                                 self.es.search(search_string,
                                                index="chained_campaigns",
                                                size=self.overflow)
                                 ['hits']['hits']]
                     + [s['_id'] for s in
                        self.es.search(search_string, index="chained_requests",
                                       size=self.overflow)['hits']['hits']]})
            else:
                return json.dumps(
                    {"results": [s['_id'] for s in
                                 self.es.search(search_string,
                                                index="campaigns",
                                                size=self.overflow)
                                 ['hits']['hits']]})
Example #40
def search(request, doc_type):
    """Search elastic search for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    is_root = request.GET.get('is_root')
    is_subpart = request.GET.get('is_subpart')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0

    if not term:
        return user_error('No query term')
    if not validate_boolean(is_root):
        return user_error('Parameter "is_root" must be "true" or "false"')
    if not validate_boolean(is_subpart):
        return user_error('Parameter "is_subpart" must be "true" or "false"')

    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': page * PAGE_SIZE,
        'size': PAGE_SIZE,
    }
    text_match = {'match': {'text': term, 'doc_type': doc_type}}
    if version or regulation:
        term = {}
        if version:
            term['version'] = version
        if regulation:
            term['regulation'] = regulation
        if is_root:
            term['is_root'] = is_root
        if is_subpart:
            term['is_subpart'] = is_subpart
        query['query'] = {'filtered': {
            'query': text_match,
            'filter': {'term': term}
        }}
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields'] for h in
                                      results['hits']['hits']])
    })
Example #41
class GetCampaign():

    def __init__(self):
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 1000000

    def get(self, campaign):
        if campaign == 'all':
            campaign = '*'
        return json.dumps(
            {"results": [s['_source'] for s in
                         self.es.search(('member_of_campaign:%s' % campaign),
                                        index='requests',
                                        size=self.overflow)['hits']['hits']]})
Example #42
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)

            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)  # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query,
                                index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree',
                                size=100)
        return sorted((res['fields']['version'], res['fields']['label_string'])
                      for res in result['hits']['hits'])
Example #43
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)

            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)    # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query, index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree', size=100)
        return sorted((res['fields']['version'], res['fields']['label_string'])
                      for res in result['hits']['hits'])
Example #44
def search(request):
    """Search elastic search for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0

    if not term:
        return user_error('No query term')

    query = {
        'fields':
        ['text', 'label', 'version', 'regulation', 'title', 'label_string'],
        'from':
        page * PAGE_SIZE,
        'size':
        PAGE_SIZE,
    }
    text_match = {'match': {'text': term}}
    if version or regulation:
        term = {}
        if version:
            term['version'] = version
        if regulation:
            term['regulation'] = regulation
        query['query'] = {
            'filtered': {
                'query': text_match,
                'filter': {
                    'term': term
                }
            }
        }
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits':
        results['hits']['total'],
        'results':
        transform_results([h['fields'] for h in results['hits']['hits']])
    })
Example #45
def search(query):
    pattern = elastic_config.index_pattern
    lookback = elastic_config.lookback
    indices = common.get_indexes(lookback, pattern)
    hosts = elastic_config.hosts
    port = elastic_config.port
    username = elastic_config.username
    password = elastic_config.password
    environment = elastic_config.environment
    es = ElasticSearch(hosts, port=443, username=username, password=password)
    try:
        logging.info("Querying Elasticsearch using {0}".format(query))
        doc = es.search(query, index=indices)
        return doc
    except Exception:
        logging.error(
            "Unexpected error searching for {0}. Passing".format(query))
        pass
Example #46
class SearchModel(object):

    def __init__(self):
        connection_url = settings.HAYSTACK_CONNECTIONS['default']['URL']
        self.index = settings.HAYSTACK_CONNECTIONS['default']['INDEX_NAME']
        self.elastic = ElasticSearch(connection_url)

    def find(self, field=None, term=None):
        search = self.elastic.search('{0}:{1}'.format(field, term),
                                     index=self.index)

        results = None
        hits = search.get('hits', None)

        if hits is not None:
            results = hits.get('hits', None)

        return results
Example #47
def get_documents(urls):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    if len(urls) > 0:
        results = {}

        for url in urls:
            query = {"query": {"term": {"url": url}}, "fields": ["text"]}

            res = es.search(query, index='memex', doc_type='page')
            hits = res['hits']
            try:
                results[url] = hits['hits'][0]['fields']['text'][0]
            except KeyError, e:
                print url, e, " not found in database"
            except IndexError, e:
                print url, e, " not found in database"

        return results
Example #48
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX,
                      'notice',
                      notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)

            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_parts': part}}
        else:
            query = {'match_all': {}}
        query = {
            'fields': ['effective_on', 'fr_url', 'publication_date'],
            'query': query
        }
        notices = []
        results = self.es.search(query,
                                 doc_type='notice',
                                 size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
Example #49
def ElasticSearchJSON(server, query, object_type, hitnum):
    '''
    Run an elasticsearch query and return JSON objects
    server: currently 'http://submit.encodedcc.org:9200'
    query: a dict formatted as specified by elasticsearch.
        the default match_all query is {'query': {'match_all': {}}}
    object_type: the name of the object type, for example 'biosample';
        this can also be a list of object types
    hitnum: the maximum number of returned json objects
        set this as high as you can take it (10000 will do for now)
    '''
    #make instance of elastic search
    connection = ElasticSearch(server)
    # run query on server for index
    results = connection.search(query, index=object_type, size=hitnum)
    # result objects are embedded in a dict of search result metrics
    result_objects = results['hits']['hits']
    # extract the json objects from the results
    json_objects = []
    for result_object in result_objects:
        json_objects.append(result_object[u'_source'])
    return json_objects
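A usage sketch assembled from the docstring's own values (the server URL and the 10000 ceiling come from the docstring; 'biosample' is its example type):

biosamples = ElasticSearchJSON('http://submit.encodedcc.org:9200',
                               {'query': {'match_all': {}}},
                               'biosample', 10000)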
Example #50
def list(config):
    """List the indices the catalog knows about, with metadata.

    Dangling index pointers are conspicuously pointed out.

    """
    secho('Current format: %s' % FORMAT, fg='green')
    echo('Catalog: %s\n' % config.es_catalog_index)

    es = ElasticSearch(config.es_hosts)
    query = {
        'query': {
            'match_all': {}
        },
        'sort': ['name', 'format']
    }
    catalog_docs = sources(es.search(query,
                                     index=config.es_catalog_index,
                                     doc_type=TREE,
                                     size=10000)['hits']['hits'])
    aliases = alias_to_index_map(es, [d['es_alias'] for d in catalog_docs])

    lines = []
    colors = []
    for d in catalog_docs:
        index_missing = d['es_alias'] not in aliases
        colors.append('red' if index_missing else
                      ('green' if d['format'] == FORMAT else None))
        lines.append([d['name'],
                      d['format'],
                      d['es_alias'],
                      'MISSING!' if index_missing else aliases[d['es_alias']],
                      d['generated_date']])
    table = tabulate(lines, headers=['Name', 'Format', 'Alias', 'Index', 'Generated'], tablefmt='simple').splitlines()
    echo(table[0])
    echo(table[1])
    for line, color in izip(table[2:], colors):
        secho(line, fg=color)
Example #51
 auxStartDate = dateBeginDate + datetime.timedelta(days=i)
 auxEndDate = dateBeginDate + datetime.timedelta(days=i + 1)
 #print str(auxStartDate)
 #print str(auxEndDate)
 query = {
     'query': {
         "range": {
             "art_date": {
                 "gte": str(auxStartDate),
                 "lte": str(auxEndDate)
             }
         }
     }
 }
 #print query
 result = es.search(query, size=10000, index=index)
 for r in result['hits']['hits']:
     #print r['_source']['pub_content']
     query2 = {
         'query': {
             "bool": {
                 "must": [{
                     "match_phrase": {
                         "art_date": r['_source']['art_date']
                     }
                 }, {
                     "match": {
                         "art_name_press_source":
                         r['_source']['art_name_press_source']
                     }
                 }],
Example #52
class ElasticSearchProvider(SearchProvider):
    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db
        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )
        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        self.debug = True

    def connect_to_db(self):
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None

        if page_prefix:
            return {
                'prefix': {
                    'page_url': page_prefix
                }
            }
        else:
            return {
                'match_all': {}
            }

    def _assemble_outer_query(self, inner_query, filter_terms):
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{
                        'term': filter_term
                    } for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        filter_terms = []

        if key_id:
            filter_terms.append({'keys.id': key_id})

        if domain:
            filter_terms.append({'domain_id': domain.id})

        return filter_terms
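
    # For reference, with key_id=1 and a domain whose id is 2 (illustrative
    # values), the two helpers above assemble an ES 1.x body like:
    #   {'filtered': {'query': {'match_all': {}},
    #                 'filter': {'and': [{'term': {'keys.id': 1}},
    #                                    {'term': {'domain_id': 2}}]}}}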

    def gen_doc(self, review):
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        action = {'index': {'_type': 'review'}}

        for i in range(0, reviews_count, batch_size):
            body_bits = []

            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)

                action['index']['_id'] = doc['page_id']

                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))

            # Yes, that trailing newline IS necessary
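            # (The _bulk endpoint takes newline-delimited JSON: one action
            # line followed by one document line per operation, and the body
            # must end with a newline or the last operation may be ignored.)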
            body = '\n'.join(body_bits) + '\n'

            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )

        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10, domain=None, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'completed_date': {
                'order': 'desc'
            }
        }, {
            'violation_count': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'violation_count': {
                'order': 'desc'
            }
        }, {
            'completed_date': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    @classmethod
    def get_index_settings(cls):
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    @classmethod
    def get_index_mapping(cls):
        return {
            'review': {
                'properties': {
                    'keys': {
                        'properties': {
                            'id': {
                                'type': 'integer'
                            }
                        }
                    },
                    'uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'completed_date': {
                        'type': 'integer'
                    },
                    'violation_count': {
                        'type': 'float'
                    },
                    'page_id': {
                        'type': 'integer'
                    },
                    'page_uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_url': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_last_review_date': {
                        'type': 'integer'
                    },
                    'domain_id': {
                        'type': 'integer'
                    },
                    'domain_name': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }

    def setup_index(self):
        settings = self.get_index_settings()
        self.syncES.create_index(index=self.index, settings=settings)
        mapping = self.get_index_mapping()
        self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
        logging.info('Index %s created.' % self.index)

    def delete_index(self):
        self.syncES.delete_index(index=self.index)
        logging.info('Index %s deleted.' % self.index)

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        if must_have_domain_name:
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {
                                'field': 'domain_name'
                            }
                        }
                    }
                }
            }
        else:
            inner_query = {
                'match_all': {}
            }

        query = {
            'query': inner_query,
            'sort': [{
                'page_id': {
                    'order': 'desc'
                }
            }]
        }

        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        logging.info('Querying database...')
        self.connect_to_db()

        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]

        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))

            if not replace:
                query = query.filter(Page.id > max_page_id)

            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page.id))).scalar()

        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())

        logging.info('Indexing %d reviews...' % reviews_count)

        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        import sys

        parser = cls.argparser()
        args = parser.parse_args()

        config = {}
        host = None
        port = None
        index = None
        es = None

        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level), format='%(levelname)s - %(message)s')

        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)

        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check its contents')
                sys.exit(1)

        if args.server:
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)

        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index

        from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError
        try:

            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()

            if args.keys or args.all_keys:
                if not config:
                    logging.error('Need a config file to perform such operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace, batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)

        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host or port! Use either --conf or --server')
            sys.exit(1)
Example #53
class ElasticSearch(object):
    conn = None
    url = settings.ELASTICSEARCH_URL
    index_name = settings.ELASTICSEARCH_INDEX_NAME
    stdout = None
    stderr = None

    def __init__(self, index_name=None, stdout=None, stderr=None):
        self.conn = PyElasticSearch()
        if index_name:
            self.index_name = index_name
        if stdout:
            self.stdout = stdout
        if stderr:
            self.stderr = stderr

    def create_index(self, delete=True):
        if delete:
            try:
                self.conn.delete_index(self.index_name)
            except ElasticHttpNotFoundError as e:
                pass
        mappings = dict(
            (k, v) for k, v in get_elasticsearch_properties().items())
        self.conn.create_index(self.index_name,
                               settings={'mappings': mappings})

    def index_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.index_activity(activity)

    def delete_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.delete_activity(activity)

    def index_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = self.get_activity_documents(activity, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(
                        doc, id=doc.pop('id'), parent=doc.pop('_parent', None))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_investor(self, investor):
        for doc_type in DOC_TYPES_INVESTOR:
            docs = self.get_investor_documents(investor, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_activity_documents(self, activity_identifiers=[]):
        activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter(
            fk_status__in=(
                HistoricalActivity.STATUS_ACTIVE,
                HistoricalActivity.STATUS_PENDING,
                HistoricalActivity.STATUS_OVERWRITTEN,
                HistoricalActivity.STATUS_DELETED)).distinct().values_list(
                    'activity_identifier', flat=True).distinct()

        for doc_type in DOC_TYPES_ACTIVITY:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i deals...' %
                (doc_type, len(activity_identifiers)))
            for activity_identifier in activity_identifiers:
                for activity in self.get_activity_versions(
                        activity_identifier):
                    docs.extend(
                        self.get_activity_documents(activity,
                                                    doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                paginator = Paginator(docs, 1000)
                for page in paginator.page_range:
                    try:
                        self.conn.bulk(
                            (self.conn.index_op(doc,
                                                id=doc.pop('id'),
                                                parent=doc.pop(
                                                    '_parent', None))
                             for doc in paginator.page(page)),
                            index=self.index_name,
                            doc_type=doc_type)
                    except BulkError as e:
                        for error in e.errors:
                            msg = '%s: %s on ID %s' % (
                                error['index']['error']['type'],
                                error['index']['error']['reason'],
                                error['index']['_id'])
                            if 'caused_by' in error['index']['error']:
                                msg += ' (%s: %s)' % (error['index']['error']
                                                      ['caused_by']['type'],
                                                      error['index']['error']
                                                      ['caused_by']['reason'])
                            self.stderr and self.stderr.write(msg)
                    self.conn.refresh()

    def index_investor_documents(self):
        investors = Investor.objects.public().order_by(
            'investor_identifier', '-id').distinct('investor_identifier')

        for doc_type in DOC_TYPES_INVESTOR:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i investors...' %
                (doc_type, investors.count()))
            for investor in investors:
                docs.extend(
                    self.get_investor_documents(investor, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    #def index_activity_by_version(self, activity_identifier):
    #    for doc_type in get_elasticsearch_properties().keys():
    #        docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type)
    #        if len(docs) > 0:
    #            try:
    #                self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
    #                    index=self.index_name,
    #                    doc_type=doc_type)
    #            except BulkError as e:
    #                for error in e.errors:
    #                    stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % (
    #                            error['index']['error']['type'],
    #                            error['index']['error']['reason'],
    #                            error['index']['error']['caused_by']['type'],
    #                            error['index']['error']['caused_by']['reason'],
    #                            error['index']['_id']
    #                          ))

    def get_activity_versions(self, activity_identifier):
        versions = []
        # get the newest non-pending, readable historic version:
        try:
            newest = HistoricalActivity.objects.filter(
                activity_identifier=activity_identifier,
                fk_status__in=(
                    HistoricalActivity.STATUS_ACTIVE,
                    HistoricalActivity.STATUS_OVERWRITTEN,
                    HistoricalActivity.STATUS_DELETED)).distinct().latest()
            if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED:
                versions.append(newest)
        except HistoricalActivity.DoesNotExist:
            newest = None

        # get newer pendings
        pendings = HistoricalActivity.objects.filter(
            activity_identifier=activity_identifier,
            fk_status_id=HistoricalActivity.STATUS_PENDING).distinct()
        if newest:
            pendings = pendings.filter(history_date__gt=newest.history_date)
        versions.extend(pendings)

        return versions

    def get_activity_documents(self, activity, doc_type='deal'):
        docs = []
        deal_attrs = {
            'id': activity.id,
            'activity_identifier': activity.activity_identifier,
            'historical_activity_id': activity.id,
            'status': activity.fk_status_id,
        }

        # Todo: Is there a nice way to prevent this extra Activity query?
        # e.g. if we save is_public/deal_scope as ActivityAttributes
        public_activity = Activity.objects.filter(
            activity_identifier=activity.activity_identifier).order_by(
                '-id').first()
        if public_activity:
            deal_attrs.update({
                'is_public':
                public_activity.is_public,
                'deal_scope':
                public_activity.deal_scope,
                'deal_size':
                public_activity.deal_size,
                'current_negotiation_status':
                public_activity.negotiation_status,
                'top_investors':
                public_activity.top_investors,
                'fully_updated_date':
                public_activity.fully_updated_date,
            })
        else:
            # Fixme: This should not happen
            self.stderr and self.stderr.write(
                _('Missing activity for historical activity %i (Activity identifier: #%i)'
                  % (activity.id, activity.activity_identifier)))
        #except Activity.MultipleObjectsReturned:
        #    # Fixme: This should not happen
        #    self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % (
        #        activity.id,
        #        activity.activity_identifier
        #    )))

        for a in activity.attributes.select_related('fk_group').order_by(
                'fk_group__name'):
            # do not include the django object id
            if a.name == 'id':
                continue
            attribute = None
            attribute_key = '%s_attr' % a.name
            if attribute_key in get_elasticsearch_properties(
            )['deal']['properties'].keys():
                attribute = {
                    'value': a.value,
                    'value2': a.value2,
                    'date': a.date,
                    'is_current': a.is_current,
                }
            value = a.value

            # Area field?
            if a.name and 'area' in a.name and a.polygon is not None:
                # Get polygon
                #value = json.loads(a.polygon.json)
                # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work
                #value['type'] = 'multipolygon'
                value = a.polygon.json or ''
            # do not include empty values
            if value is None or value == '':
                continue

            # Doc types: location, data_source or contract
            group_match = a.fk_group and a.fk_group.name or ''
            group_match = re.match(
                r'(?P<doc_type>location|data_source|contract)_(?P<count>\d+)',
                group_match)
            if group_match:
                dt, count = group_match.groupdict()['doc_type'], int(
                    group_match.groupdict()['count'])
                if doc_type == dt:
                    while len(docs) < count:
                        docs.append({
                            '_parent': activity.activity_identifier,
                            'id': a.id,  #'%i_%i' % (a.id, count),
                        })
                    docs[count - 1][a.name] = [
                        value,
                    ]
                # Set doc type counter within deal doc type (for location/data_source/contract)
                elif doc_type == 'deal':
                    # Set counter
                    key = '%s_count' % dt
                    if key not in deal_attrs.keys():
                        deal_attrs[key] = count
                    elif deal_attrs[key] < count:
                        deal_attrs[key] = count

                    # Create list with correct length to ensure formset values have the same index
                    if a.name not in deal_attrs:
                        deal_attrs[a.name] = [''] * count
                        if attribute:
                            deal_attrs[attribute_key] = [''] * count
                    else:
                        while len(deal_attrs[a.name]) < count:
                            deal_attrs[a.name].append('')
                            if attribute:
                                deal_attrs[attribute_key].append('')
                    deal_attrs[a.name][count - 1] = value
                    if attribute:
                        deal_attrs['%s_attr' % a.name][count - 1] = attribute

            # Doc type: deal and not formset
            elif doc_type == 'deal':
                if a.name in deal_attrs:
                    deal_attrs[a.name].append(value)
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name].append(attribute)
                else:
                    deal_attrs[a.name] = [
                        value,
                    ]
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name] = [
                            attribute,
                        ]

        if doc_type == 'deal':
            # Additionally save operational company attributes
            oc = Investor.objects.filter(
                investoractivityinvolvement__fk_activity__activity_identifier=
                activity.activity_identifier)
            if oc.count() > 0:
                oc = oc.first()
                for field in Investor._meta.fields:
                    if isinstance(field, ForeignKey):
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(
                                       oc, '%s_id' % field.name)
                    else:
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(oc, field.name)
            else:
                pass
                #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier)

        # Create single document for each location
        # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now?
        spatial_names = list(get_spatial_properties())
        for i in range(deal_attrs.get('location_count', 0)):
            doc = deal_attrs.copy()
            for name in spatial_names:
                if name not in doc:
                    continue
                if len(deal_attrs[name]) > i:
                    doc[name] = deal_attrs[name][i]
                else:
                    doc[name] = ''
            # Set unique ID for location (deals can have multiple locations)
            doc['id'] = '%s_%i' % (doc['id'], i)
            point_lat = doc.get('point_lat', None)
            point_lon = doc.get('point_lon', None)
            if point_lat and point_lon:
                # Parse values
                try:
                    parsed_lat, parsed_lon = float(point_lat), float(point_lon)
                    doc['geo_point'] = '%s,%s' % (parsed_lat, parsed_lon)
                except ValueError:
                    doc['geo_point'] = '0,0'
            else:
                doc['point_lat'] = '0'
                doc['point_lon'] = '0'
                doc['geo_point'] = '0,0'
            # FIXME: we dont really need 'point_lat' and 'point_lon' here,
            # so we should pop them from doc when adding 'geo_point'
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def get_export_properties(self, doc, doc_type='deal'):
        if doc_type == 'investor':
            return ExportInvestorForm.export(doc)
        elif doc_type == 'involvement':
            return InvestorVentureInvolvementForm.export(doc)
        else:
            properties = {
                'deal_scope_export':
                doc.get('deal_scope', ''),
                'is_public_export':
                doc.get('is_public', False) and str(_('Yes')) or str(_('No')),
                'deal_size_export':
                doc.get('deal_size', ''),
                'current_negotiation_status_export':
                doc.get('current_negotiation_status', ''),
                'top_investors_export':
                doc.get('top_investors', ''),
                'fully_updated_date_export':
                doc.get('fully_updated_date', ''),
            }
            # Doc types: deal, location, contract and data_source
            for form in ChangeDealView.FORMS:
                formset_name = hasattr(form, "form") and form.Meta.name or None
                form = formset_name and form.form or form
                properties.update(form.export(doc, formset=formset_name))
            properties.update(
                ExportInvestorForm.export(doc, prefix='operational_company_'))
            return properties

    def get_investor_documents(self, investor, doc_type='investor'):
        docs = []
        # Doc types: involvement and investor
        if doc_type == 'involvement':
            ivis = InvestorVentureInvolvement.objects.filter(
                Q(fk_venture=investor) | Q(fk_investor=investor))
            for ivi in ivis:
                doc = {}
                for field in ivi._meta.local_fields:
                    if isinstance(field, ForeignKey):
                        doc[field.name] = getattr(ivi, '%s_id' % field.name)
                    else:
                        doc[field.name] = getattr(ivi, field.name)
                docs.append(doc)
        elif doc_type == 'investor':
            doc = {}
            for field in investor._meta.local_fields:
                if isinstance(field, ForeignKey):
                    doc[field.name] = getattr(investor, '%s_id' % field.name)
                else:
                    doc[field.name] = getattr(investor, field.name)
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def refresh_index(self):
        self.conn.refresh(self.index_name)

    def search(self, elasticsearch_query, doc_type='deal', sort=[]):
        """ Executes paginated queries until all results have been retrieved. 
            @return: The full list of hits. """
        start = 0
        size = 10000  # 10000 is the default elasticsearch max_window_size (pagination is cheap, so more is not necessarily better)
        raw_result_list = []

        done = False
        while not done:
            query = {
                'query': elasticsearch_query,
                'from': start,
                'size': size,
            }
            if sort:
                query['sort'] = sort
            query_result = self.conn.search(query,
                                            index=self.index_name,
                                            doc_type=doc_type)
            raw_result_list.extend(query_result['hits']['hits'])
            results_total = query_result['hits']['total']

            if len(raw_result_list) >= results_total:
                done = True
            else:
                start = len(raw_result_list)

        print('\nElasticsearch returned %i documents from a total of %i \n\n' %
              (len(raw_result_list), results_total))
        return raw_result_list

    def delete_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            try:
                if doc_type == 'deal':
                    self.conn.delete(id=activity.activity_identifier,
                                     index=self.index_name,
                                     doc_type=doc_type)
                else:
                    self.conn.delete_by_query(query={
                        "parent_id": {
                            "type": "deal",
                            "id": str(activity.activity_identifier),
                        }
                    },
                                              index=self.index_name,
                                              doc_type=doc_type)
            except ElasticHttpNotFoundError as e:
                pass

    def get_deals_by_activity_identifier(self,
                                         activity_identifier,
                                         doc_type='deal'):
        return self.search({
            "constant_score": {
                "filter": {
                    "term": {
                        "activity_identifier": activity_identifier
                    }
                }
            }
        })
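
The search() method above pages through results with from/size until
hits.total is reached. A minimal standalone sketch of the same pattern; the
server URL, index name, and doc type here are illustrative assumptions:

from pyelasticsearch import ElasticSearch

es = ElasticSearch('http://localhost:9200/')  # illustrative server
elasticsearch_query = {'match_all': {}}       # illustrative query

hits, start, size = [], 0, 10000  # size is capped by index.max_result_window
while True:
    result = es.search({'query': elasticsearch_query,
                        'from': start,
                        'size': size},
                       index='landmatrix', doc_type='deal')  # names assumed
    hits.extend(result['hits']['hits'])
    if len(hits) >= result['hits']['total']:
        break
    start = len(hits)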
Example #54
#!/usr/bin/env python
from pyelasticsearch import ElasticSearch

from settings import HOST, INDEX, DOCTYPE

es = ElasticSearch(HOST)
results = es.search('*:*', index=INDEX, doc_type=DOCTYPE)
hits = results['hits']['hits']
print hits
Example #55
from pyelasticsearch import ElasticSearch
es = ElasticSearch('http://localhost:9200/')
es.search('name:Russell', index='agile_data_science')
Example #56
es.bulk((es.index_op(doc, id=doc.pop('id')) for doc in docs),
        index='test',
        doc_type='test')

es.refresh('test')

res1 = es.get('test', 'test', 1)

# Full-text match; note that Chinese and English are tokenized differently.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html

res8 = es.search(index='test',
                 size=2,
                 query={"query": {
                     "query_string": {
                         "query": "抽"
                     }
                 }})

# Prefix match query; only accepts lowercase terms.
res12 = es.search(index='test', query={"query": {"prefix": {"title": "p"}}})

# search: a bool query with a must match first, then a filter
res2 = es.search(index='test',
                 query={
                     "query": {
                         "bool": {
                             "must": [{
                                 "match": {
                                     "name": 'Jessica'
Example #57
    print 100 * '-'
    print thunder_name

    # _download_link = thunder_song.download_link
    # url = kcloud + _download_link
    # req = urllib2.Request(url)
    # res = urllib2.urlopen(req)
    # res = res.read()
    # res = json.loads(res)
    # download_link = res.get('result')
    # urllib.urlretrieve(download_link,'t_music/'+thunder_name+'.ts')

    es_songs = es.search(index='song',
                         size=3,
                         query={'query': {
                             'match': {
                                 'name': thunder_name
                             }
                         }})

    es_songs = es_songs['hits']['hits']
    es_songs = [item['_source'] for item in es_songs]

    for item in es_songs:

        print item.get('id')
        print item.get('name')
        print item.get('artist')
        o2o_id = item.get('id')
        o2o_name = item.get('name')