def setUpClass(self):
    self.http_url = '%s:%s' % (config.ES_HOST, config.ES_HTTP_PORT)
    self.es_http = rawes.Elastic(url=self.http_url)
    self.custom_json_decoder = test_encoder.DateAwareJsonDecoder().decode
    if not config.HTTP_ONLY:
        self.thrift_url = '%s:%s' % (config.ES_HOST, config.ES_THRIFT_PORT)
        self.es_thrift = rawes.Elastic(url=self.thrift_url)
def test_timeouts(self):
    es_http_short_timeout = rawes.Elastic(url=self.http_url, timeout=0.0001)
    self._test_timeout(es_short_timeout=es_http_short_timeout)
    if not config.HTTP_ONLY:
        es_thrift_short_timeout = rawes.Elastic(url=self.thrift_url, timeout=0.0001)
        self._test_timeout(es_short_timeout=es_thrift_short_timeout)
def test_json_decoder_encoder(self):
    es_http_decoder = rawes.Elastic(url=self.http_url,
                                    json_decoder=self.custom_json_decoder)
    es_http_encoder = rawes.Elastic(url=self.http_url,
                                    json_encoder=test_encoder.encode_custom)
    self._test_custom_encoder(self.es_http, es_encoder=es_http_encoder)
    self._test_custom_decoder(self.es_http, es_decoder=es_http_decoder)
    if not config.HTTP_ONLY:
        self._reset_indices(self.es_thrift)
        self._wait_for_good_health(self.es_thrift)
        es_thrift_decoder = rawes.Elastic(url=self.thrift_url,
                                          json_decoder=self.custom_json_decoder)
        es_thrift_encoder = rawes.Elastic(url=self.thrift_url,
                                          json_encoder=test_encoder.encode_custom)
        self._test_custom_encoder(self.es_thrift, es_encoder=es_thrift_encoder)
        self._test_custom_decoder(self.es_thrift, es_decoder=es_thrift_decoder)
def test_create_transaction(client):
    # Create payload and insert data into both the pycsw database and elasticsearch.
    payload = construct_payload(layers_list=layers_list)
    response = client.post('/catalog/{0}/csw'.format(catalog_slug),
                           payload,
                           content_type='text/xml')
    assert 200 == response.status_code

    # Provisional hack to refresh documents in elasticsearch.
    es_client = rawes.Elastic(registry.REGISTRY_SEARCH_URL)
    es_client.post('/_refresh')

    # Verify records have been added into the pycsw database.
    repository = registry.RegistryRepository()
    records_number = int(repository.query('')[0])
    assert len(layers_list) == records_number

    # Verify records were added into elasticsearch using the search api.
    response = client.get(catalog_search_api)
    assert 200 == response.status_code
    search_response = json.loads(response.content.decode('utf-8'))
    assert len(layers_list) - 1 == search_response['a.matchDocs']
    assert response.get('Content-Type') == 'application/json'

    layer_uuid = 'f28ad41b-b91f-4d5d-a7c3-4b17dfaa5170'
    layer_dic, layer_id, index_name = registry.get_data_from_es(
        es_client, layer_uuid)
    assert 'layer_1 titleterm1' == layer_dic['title']
    assert 'test' == index_name
class TestElasticSearch(unittest.TestCase):
    # By default the server runs at port 9200.
    URL = "http://localhost:9200/"
    ES = rawes.Elastic('localhost:9200')

    def test_elastic_server_running(self):
        response = urllib2.urlopen(self.URL)
        data = json.loads(response.read().strip())
        # Check that the server returns status 200 and ok == true.
        self.assertTrue(data["status"] == 200 and data["ok"] == True)

    def test_elasticsearch_put(self):
        response = self.ES.put('tweets/tweet/1', data={
            'user': '******',
            'post_date': '2012-09-25T01:40:30',
            'message': 'Tweeting about elasticsearch'
        })
        self.assertTrue(response["_type"] == "tweet"
                        and response["ok"] == True
                        and response["_index"] == "tweets")

    def test_elasticsearch_get(self):
        response = self.ES.get('tweets/tweet/_search', data="""
            {
                "query" : {
                    "match_all" : {}
                }
            }
        """)
        self.assertTrue(len(response["hits"]["hits"]) != 0)

    def test_elasticsearch_post(self):
        response = self.ES.post('tweets/tweet/', data={
            'user': '******',
            'post_date': '2012-09-25T09:02:01',
            'message': 'More tweets about elasticsearch'
        })
        self.assertTrue(response["_type"] == "tweet"
                        and response["ok"] == True
                        and response["_index"] == "tweets")
def results():
    page = int(request.args.get('page', 1))
    qstr = request.args.get('q', '')
    size = 10
    es = rawes.Elastic('localhost:9200')
    if qstr == '':
        query = {'match_all': {}}
    else:
        query = {'term': {'_all': qstr}}
    print qstr
    print query
    results = es.get('bookmarks/bookmark/_search', data={
        'version': True,
        'query': query,
        'fields': ['title', 'archive_url'],
        'size': size,
        'from': (page * size) - size,
        'sort': {'_score': 'desc'}
    })
    pagination = Pagination(page, size, results['hits']['total'],
                            results['hits']['hits'])
    #pprint(pagination.items)
    return render_template('srp.jinja', pagination=pagination, qstr=qstr)
def get_es(timeout=30):
    """Get a handle to the configured elasticsearch DB."""
    return rawes.Elastic('%s:%s' % (settings.ELASTICSEARCH_HOST,
                                    settings.ELASTICSEARCH_PORT),
                         timeout=timeout)
def es_connect(url=REGISTRY_SEARCH_URL):
    es = rawes.Elastic(url)
    try:
        version = es.get('')['version']['number']
    except requests.exceptions.ConnectionError:
        return 'Elasticsearch connection error'
    return es, version
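Note the mixed return type above: on success `es_connect` returns an `(es, version)` tuple, but on connection failure it returns a plain error string. A hypothetical caller (not part of the source) therefore has to distinguish the two cases before unpacking, e.g.:

```python
# Sketch only: es_connect returns a string on failure and an
# (Elastic, version) tuple on success, so check before unpacking.
result = es_connect()
if isinstance(result, str):
    print(result)  # 'Elasticsearch connection error'
else:
    es, version = result
    print('connected to Elasticsearch %s' % version)
```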
def __init__(self, connectionName='default', **connection_options):
    self.options = settings.SEARCH_CONNECTION.get(connectionName, {})
    self.conn = rawes.Elastic(
        url=self.options['url'],
        path='',
        timeout=self.options['timeout'],
        connection_type=self.options['connection_type'],
        connection=None)
def get_es(timeout=30):
    """Get a handle to the configured elasticsearch DB.

    Returns a rawes.Elastic instance. We are hoping to deprecate and
    retire this method soonish.
    """
    return rawes.Elastic('%s:%s' % (settings.ELASTICSEARCH_HOST,
                                    settings.ELASTICSEARCH_PORT),
                         timeout=timeout)
def clear_records():
    '''Fixture that clears records from both the database and the search backend.'''
    registry.REGISTRY_INDEX_NAME = 'test'
    yield
    es_client = rawes.Elastic(registry.REGISTRY_SEARCH_URL)
    es_client.delete(registry.REGISTRY_INDEX_NAME)
    context = config.StaticContext()
    delete_records(context,
                   registry.PYCSW['repository']['database'],
                   registry.PYCSW['repository']['table'])
def test_single_transaction(client):
    # Create payload and insert data into both the pycsw database and elasticsearch.
    payload = construct_payload(layers_list=layers_list[:2])
    response = client.post('/catalog/{0}/csw'.format(catalog_slug),
                           payload,
                           content_type='text/xml')
    assert 200 == response.status_code

    # Provisional hack to refresh documents in elasticsearch.
    es_client = rawes.Elastic(registry.REGISTRY_SEARCH_URL)
    es_client.post('/_refresh')

    # Verify records have been added into the pycsw database.
    repository = registry.RegistryRepository()
    records_number = int(repository.query('')[0])
    assert 2 == records_number

    # Verify records were added into elasticsearch using the search api.
    response = client.get(catalog_search_api)
    assert 200 == response.status_code
    search_response = json.loads(response.content.decode('utf-8'))
    assert 2 == search_response['a.matchDocs']
    assert response.get('Content-Type') == 'application/json'

    # Remove records using individual requests.
    # Note: escapeChar needs a double backslash in source so the XML
    # attribute value is a literal backslash.
    request_string = (
        '<csw:Transaction xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'xmlns:ogc="http://www.opengis.net/ogc" '
        'xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" '
        'xmlns:ows="http://www.opengis.net/ows" '
        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
        'xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 '
        'http://schemas.opengis.net/csw/2.0.2/CSW-publication.xsd" service="CSW" version="2.0.2">\n'
        '  <csw:Delete>\n'
        '    <csw:Constraint version="1.1.0">\n'
        '      <ogc:Filter>\n'
        '        <ogc:PropertyIsLike wildCard="%" singleChar="." escapeChar="\\">\n'
        '          <ogc:PropertyName>dc:identifier</ogc:PropertyName>\n'
        '          <ogc:Literal>{0}</ogc:Literal>\n'
        '        </ogc:PropertyIsLike>\n'
        '      </ogc:Filter>\n'
        '    </csw:Constraint>\n'
        '  </csw:Delete>\n'
        '</csw:Transaction>\n')

    response = client.post('/catalog/{0}/csw'.format(catalog_slug),
                           request_string.format(layers_list[0]['identifier']),
                           content_type='text/xml')
    assert 200 == response.status_code

    response = client.post('/catalog/{0}/csw'.format(catalog_slug),
                           request_string.format(layers_list[1]['identifier']),
                           content_type='text/xml')
    assert 200 == response.status_code
def __init__(self, crawl_started_at, index, doctype, url, index_suffix=None):
    super(ElasticSearchExporter, self).__init__(crawl_started_at, index, doctype)
    self.url = url
    self.es = rawes.Elastic(self.url)
    self.index_suffix = index_suffix
def run():
    es = rawes.Elastic(getattr(settings, "ES_HOST", 'thrift://localhost:9500'),
                       timeout=60.0)
    index = getattr(es, settings.ES_INDEX)

    records = {'sec_cftc': {}, 'regulations.gov': {}}
    for doc in Doc.objects(type__in=['notice', 'proposed_rule', 'rule'],
                           agency__in=['SEC', 'CFTC']):
        # First check the annotation...
        if 'fr_data' in doc.annotations and doc.annotations['fr_data']:
            #print 'annotation', doc.source, doc.id, doc.annotations['fr_data']['document_number']
            records[doc.source][doc.annotations['fr_data']['document_number']] = doc
        # ...then fall back to the document details.
        elif 'Federal_Register_Number' in doc.details:
            #print 'detail', doc.source, doc.id, doc.details['Federal_Register_Number']
            frn = doc.details['Federal_Register_Number']
            # Trim leading zeros from the last segment, e.g. '2012-012345' -> '2012-12345'.
            if "-" in frn:
                frnp = frn.split("-")
                frn = "-".join(frnp[:-1] + [frnp[-1].lstrip('0')])
            records[doc.source][frn] = doc

    overlap = records['sec_cftc'].viewkeys() & records['regulations.gov'].viewkeys()
    for frid in overlap:
        winner = records['sec_cftc'][frid]
        loser = records['regulations.gov'][frid]
        winner_dkt = Docket.objects.get(id=winner.docket_id)
        loser_dkt = Docket.objects.get(id=loser.docket_id)
        for w, l in ((winner, loser), (winner_dkt, loser_dkt)):
            replaces = set(w.suppression.get('replaces', []))
            replaces.add(l.id)
            w.suppression['replaces'] = list(replaces)

            replaced_by = set(l.suppression.get('replaced_by', []))
            replaced_by.add(w.id)
            l.suppression['replaced_by'] = list(replaced_by)

            l.save()
            w.save()
        try:
            index.docket.delete(loser_dkt.id)
            index.document.delete(loser.id)
        except:
            pass
        print '%s suppresses %s' % (winner.id, loser.id)
def test_multiindex_search():
    es_client = rawes.Elastic('localhost:9200')
    es_client.put('/index1/data/1', data={'text': 'here is some text'})
    es_client.put('/index2/data/1', data={'text': 'here is more text in another index'})
    # Make sure the docs are visible.
    es_client.post('/_refresh')
    # Search over multiple indexes like so:
    response = es_client.get('/index1,index2/_search?q=*:*')
    assert response['hits']['total'] == 2
    es_client.delete('/index1')
    es_client.delete('/index2')
def __init__(self, config=None):
    sys.path.append("%s/%s" % (os.getcwd(), 'python-dateutil'))
    sys.path.append("%s/%s" % (os.getcwd(), 'requests'))
    sys.path.append("%s/%s" % (os.getcwd(), 'rawes'))
    import rawes
    import requests
    from rawes.elastic_exception import ElasticException
    # pyflakes doesn't like globals()['ElasticException'] = ElasticException, so:
    self.ElasticException = ElasticException

    if config is None:
        import config
    config = Config(config)
    self.config = config

    self.es = rawes.Elastic(config.es_url, except_on_error=True)

    # Make sure the index exists.
    try:
        # To explain the custom mapping:
        # * _source enabled is maybe not really needed, but it's easiest at
        #   least. We just need to be able to reconstruct the original document.
        # * tags are not analyzed so that when we want to get a list of all
        #   tags (a facet search) it returns the original tags, not the
        #   tokenized terms.
        self.es.post(config.es_index, data={
            "mappings": {
                "event": {
                    "_source": {"enabled": True},
                    "properties": {
                        "tags": {
                            "type": "string",
                            "index": "not_analyzed"
                        }
                    }
                }
            }
        })
        print "created new ElasticSearch Index"
    except ElasticException as e:
        if 'IndexAlreadyExistsException' in e.result['error']:
            pass
        elif 'already exists as alias' in e.result['error']:
            pass
        else:
            raise
    except requests.exceptions.ConnectionError as e:
        sys.stderr.write("Could not connect to ElasticSearch: %s" % e)
        sys.exit(2)
def test_reindex_records(client):
    message, status = registry.delete_index(catalog_slug)
    assert 200 == status
    assert 'removed' in message

    # Test empty list of catalogs.
    response = client.get('/catalog')
    assert 404 == response.status_code
    assert 'Empty' in response.content.decode('utf-8')

    registry.create_index(catalog_slug)
    registry.re_index_layers(catalog_slug)

    # Provisional hack to refresh documents in elasticsearch.
    es_client = rawes.Elastic(registry.REGISTRY_SEARCH_URL)
    es_client.post('/_refresh')

    response = client.get(catalog_search_api, default_params)
    assert 200 == response.status_code
    results = json.loads(response.content.decode('utf-8'))
    assert len(layers_list) - 1 == results['a.matchDocs']
def __init__(self, port=None, debug=False):
    self.debug = debug
    self.print_query = False
    self.es = None
    self.mapping = None
    self.keywords = None
    self.host = None
    self.version = None
    self.v5 = None
    if port:
        try:
            self.es = rawes.Elastic(port, headers={'content-type': 'application/json'})
            self.get_mapping()
            self.get_version()
        except ConnectionError as err:
            print("init: cannot connect to", port)
            print(err)
    if not self.es:
        self.debug = True
def __init__(self, port=None, debug=False):
    self.debug = debug
    self.print_query = False
    #if self.debug:
    #    requests_defaults['verbose'] = DebugPrinter()
    self.es = None
    self.mapping = None
    self.keywords = None
    self.host = None
    if port:
        try:
            self.es = rawes.Elastic(port)
            self.get_mapping()
        except ConnectionError as err:
            print("init: cannot connect to", port)
            print(err)
    if not self.es:
        self.debug = True
def test_catalogs(client):
    catalogs = ['catalog_1', 'catalog_2', 'catalog_3']
    for catalog in catalogs:
        response = client.get('/{0}/insert'.format(catalog))
        assert 200 == response.status_code
        # Note: 'succesfully' (sic) matches the server's response text.
        assert 'Catalog {0} created succesfully'.format(
            catalog) == response.content.decode('utf-8')
    time.sleep(5)

    # List indices.
    response = client.get('/catalogs/')
    assert 200 == response.status_code
    results = json.loads(response.content.decode('utf-8'))
    assert len(catalogs) == len(results)

    es_client = rawes.Elastic(registry.REGISTRY_SEARCH_URL)
    for catalog in catalogs:
        es_client.delete(catalog)

    # Test empty list of catalogs.
    response = client.get('/catalogs/')
    assert 200 == response.status_code
    assert 'empty' in response.content.decode('utf-8')
def run(options, args):
    import settings, regs_common

    es = rawes.Elastic(getattr(settings, "ES_HOST", 'thrift://localhost:9500'),
                       timeout=30.0)
    index = getattr(es, settings.ES_INDEX)

    if options.delete:
        index.delete()
        print "Index deleted."

    if options.create:
        mapping_file = os.path.join(
            os.path.abspath(os.path.dirname(regs_common.__file__)),
            "data",
            "es_mapping.json")
        mapping_data = json.load(open(mapping_file))
        index.put(data={'mappings': mapping_data})
        print "Index created."

    stats = es._stats.get()
    print json.dumps(stats, indent=4)

    return stats
def test_load_records(client):
    test_create_catalog(client)
    repository = registry.RegistryRepository()
    repository.catalog = catalog_slug
    payload = construct_payload(layers_list=layers_list)
    xml_records = etree.fromstring(payload)
    context = config.StaticContext()
    registry.load_records(repository, xml_records, context)

    # Provisional hack to refresh documents in elasticsearch.
    es_client = rawes.Elastic(registry.REGISTRY_SEARCH_URL)
    es_client.post('/_refresh')

    records_number = int(repository.query('')[0])
    assert len(layers_list) == records_number

    # Verify records were added into elasticsearch using the search api.
    response = client.get(catalog_search_api)
    assert 200 == response.status_code
    search_response = json.loads(response.content.decode('utf-8'))
    assert len(layers_list) == search_response['a.matchDocs']

    test_clear_records(client)
from flask import Flask, render_template, request
from flask.ext import restful
from flask.ext.restful import abort, reqparse
import rawes
import re

from settings import (ES_URL, ES_INDEXES, ES_DOCUMENT_TYPES_PER_INDEX,
                      ES_DOCUMENT_TYPES, ES_VALIDATION_RESULTS_INDEX)

app = Flask(__name__)
api = restful.Api(app)

es = rawes.Elastic(ES_URL)


def get_alias_from_index(index_name):
    """The indexes are named as `alias_suffix`."""
    return index_name.split("_", 1)[0]


def format_es_single_doc(es_doc):
    return {
        '_type': es_doc['_type'],
        '_index': get_alias_from_index(es_doc['_index']),
        '_id': es_doc['_id'],
        '_source': es_doc['_source']
    }


def format_es_search_results(es_results):
    # Convert index names in the results back to aliases.
    for hit in es_results['hits']['hits']:
        hit['_index'] = get_alias_from_index(hit['_index'])
def test_q_text_fields_boost(client):
    test_clear_records(client)
    test_create_catalog(client)

    layers = [{
        'identifier': 10,
        'title': 'alpha',
        'creator': 'beta',
        'lower_corner_1': -1.0,
        'upper_corner_1': -1.0,
        'lower_corner_2': -1.0,
        'upper_corner_2': -1.0,
        'i': 0,
        'title_alternate': '934',
        'registry_tag': 'notag_1',
        'type': 'ESRI:ArcGIS:ImageServer',
        'source': 'None',
        'modified': datetime(2000, 3, 1, 0, 0, 0, tzinfo=registry.TIMEZONE)
    }, {
        'identifier': 20,
        'title': 'beta',
        'creator': 'alpha',
        'lower_corner_1': -2.0,
        'upper_corner_1': -2.0,
        'lower_corner_2': -2.0,
        'upper_corner_2': -2.0,
        'i': 1,
        'title_alternate': '935',
        'registry_tag': 'notag_2',
        'source': 'None',
        'type': 'ESRI:ArcGIS:ImageServer',
        'modified': datetime(2001, 3, 1, 0, 0, 0, tzinfo=registry.TIMEZONE)
    }]

    payload = construct_payload(layers_list=layers)
    response = client.post('/catalog/{0}/csw'.format(catalog_slug),
                           payload,
                           content_type='text/xml')
    assert 200 == response.status_code

    # Provisional hack to refresh documents in elasticsearch.
    es_client = rawes.Elastic(registry.REGISTRY_SEARCH_URL)
    es_client.post('/_refresh')

    response = client.post('/', payload, content_type='text/xml')
    assert 200 == response.status_code

    try:
        # Boosting title will make doc 10 score higher.
        params = default_params.copy()
        params["q_text"] = "{0}".format("alpha")
        params["q_text_fields"] = "{0},{1}".format("title^999.0",
                                                   "layer_originator^0.1")
        params["d_docs_limit"] = 100
        response = client.get(catalog_search_api, params)
        assert 200 == response.status_code
        results = json.loads(response.content.decode('utf-8'))
        assert 2 == results['a.matchDocs']
        assert layers[0]['title'] == results.get("d.docs", [])[0]['title']
        assert layers[1]['creator'] == results.get("d.docs", [])[1]['layer_originator']

        # Boosting layer_originator will make doc 20 score higher.
        params["q_text_fields"] = "{0},{1}".format("title^0.1",
                                                   "layer_originator^9.0")
        response = client.get(catalog_search_api, params)
        assert 200 == response.status_code
        results = json.loads(response.content.decode('utf-8'))
        assert 2 == results['a.matchDocs']
        assert layers[0]['title'] == results.get("d.docs", [])[1]['title']
        assert layers[1]['creator'] == results.get("d.docs", [])[0]['layer_originator']
    finally:
        test_clear_records(client)
        test_create_catalog(client)
        test_create_transaction(client)
def test_timeouts(self):
    es_http_short_timeout = rawes.Elastic(url=self.http_url, timeout=0.0001)
    es_thrift_short_timeout = rawes.Elastic(url=self.thrift_url, timeout=0.0001)
    self._test_timeout(es_short_timeout=es_http_short_timeout)
    self._test_timeout(es_short_timeout=es_thrift_short_timeout)
def test_except_on_error(self):
    es_http_except_on_error = rawes.Elastic(url=self.http_url, except_on_error=True)
    es_thrift_except_on_error = rawes.Elastic(url=self.thrift_url, except_on_error=True)
    self._test_except_on_error(self.es_http, es_http_except_on_error)
    self._test_except_on_error(self.es_thrift, es_thrift_except_on_error)
import rawes

es = rawes.Elastic('localhost:9200')

query = "python"
es.get('dns/nyc/_search', data={
    "query": {
        "bool": {
            "must": [{"wildcard": {"answer": query}}],
            "must_not": [],
            "should": []
        }
    },
    "from": 0,
    "size": 50,
    "sort": [],
    "facets": {}
})

query = "*google*"
es.get('dns/nyc/_search', data={
    "query": {
        "bool": {
            "must": [{"wildcard": {"answer": query}}],
            "must_not": [],
            "should": []
        }
    },
    "from": 0,
    "size": 50,
    "sort": [],
    "facets": {}
})
def test_empty_constructor(self):
    es = rawes.Elastic()
    self.assertEqual(es.url.scheme, "http")
    self.assertEqual(es.url.hostname, "localhost")
    self.assertEqual(es.url.port, 9200)
Each plan object has two top-level elements: "metadata" and "days".

"metadata" contains the original query options used to generate the plan:
    "calories" (number)
    "cuisine" (string)
    "ingredients" (string, may be a comma-separated list)

"days" contains a list of objects, each of which has "breakfast", "lunch"
and "dinner". Each of these is in turn an array of recipes, because a meal
could consist of more than one recipe.
"""

result_size = 20
es = rawes.Elastic('ec2-54-216-139-182.eu-west-1.compute.amazonaws.com:9200')

coursesList = ['breakfast', 'lunch', 'dinner']
courses = {
    'breakfast': 'breakfast-brunch',
    'lunch': 'appetizers,pasta,salads,sandwiches,soups,bread',
    # removed sauces
    'dinner': 'entrees,grains,hors-d-oeuvres,legumes,pastries,pies-and-tarts,'
              'pies-and-tarts,vegetables,potatoes'
}
calorieMealRatios = {'breakfast': 0.2, 'lunch': 0.3, 'dinner': 0.5}
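To make the documented shape concrete, here is a minimal illustrative plan object matching the structure described in the docstring above. All field values are made-up placeholders, not output from the source:

```python
# Illustrative only: a minimal plan object with the "metadata"/"days"
# structure described above. Values are hypothetical.
example_plan = {
    'metadata': {
        'calories': 2000,                  # number
        'cuisine': 'italian',              # string
        'ingredients': 'tomato,basil',     # comma-separated list
    },
    'days': [
        {
            'breakfast': [{'title': 'frittata'}],
            'lunch': [{'title': 'minestrone'}],
            # a meal may consist of more than one recipe
            'dinner': [{'title': 'pasta'}, {'title': 'salad'}],
        },
    ],
}
```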
def setUpClass(self):
    self.http_url = '%s:%s' % (config.ES_HOST, config.ES_HTTP_PORT)
    self.es_http = rawes.Elastic(url=self.http_url)
    self.thrift_url = '%s:%s' % (config.ES_HOST, config.ES_THRIFT_PORT)
    self.es_thrift = rawes.Elastic(url=self.thrift_url)