def update_process_datetime(doc_id, timestamp):
    ''' Updates the last_update_date for the document id passed into the function.
        The document id will be the name of another index in the cluster.
    '''
    connection_string = 'http://localhost:9200'
    process_index = 'openfdametadata'
    _type = 'last_run'

    _map = {}
    _map[_type] = {}
    _map[_type]['properties'] = {}
    _map[_type]['properties']['last_update_date'] = {}
    _map[_type]['properties']['last_update_date']['type'] = 'date'
    _map[_type]['properties']['last_update_date']['format'] = 'dateOptionalTime'

    es = ElasticSearch(connection_string)
    try:
        es.create_index(process_index)
        logging.info('Creating index %s', process_index)
    except exceptions.IndexAlreadyExistsError:
        logging.info('%s already exists', process_index)

    try:
        es.put_mapping(process_index, doc_type=_type, mapping=_map)
        logging.info('Successfully created mapping')
    except Exception:
        logging.fatal('Could not create the mapping')

    new_doc = {}
    new_doc['last_update_date'] = timestamp
    es.index(process_index,
             doc_type=_type,
             id=doc_id,
             doc=new_doc,
             overwrite_existing=True)

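# A hedged usage sketch (not from the original pipeline): after rebuilding an
# index, record when it was last updated. The index name 'device_event' and
# the date string below are illustrative assumptions; the mapped format
# 'dateOptionalTime' accepts a plain YYYY-MM-DD value.
update_process_datetime('device_event', '2015-06-01')
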
def cli(index_name, delete_index, mapping_file, settings_file, doc_type,
        import_file, delimiter, tab, host, docs_per_chunk, bytes_per_chunk,
        parallel, quiet):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv

    \b
    For a TSV file, note the tab delimiter option
      csv2es --index-name tomatoes --doc-type tomato \
             --import-file tomatoes.tsv --tab

    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv \
             --delimiter '|'
    """
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete', quiet)

    try:
        if settings_file:
            echo('Applying mapping from: ' + settings_file, quiet)
            with open(settings_file) as f:
                settings = json.loads(f.read())
            es.create_index(index_name, settings)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except ElasticHttpError as e:
        if e.error['type'] == 'index_already_exists_exception':
            echo('Index ' + index_name + ' already exists', quiet)
        else:
            raise

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet)

    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)

def cli(index_name, delete_index, mapping_file, settings_file, doc_type, host,
        docs_per_chunk, bytes_per_chunk, parallel, quiet, parser, config_file,
        user, passwd):
    with open(config_file, "rb") as f:
        con = json.loads(f.read())

    host = con['es_config']['host']
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if con['db']['type'] == "oracle":
        db = import_module('cx_Oracle')
        collection = db.connect(user, passwd, con['db']['con_str'])
    else:
        db = import_module('MySQLdb')
        collection = db.connect(con['db']['con_str'][0], user, passwd,
                                con['db']['con_str'][1],
                                charset=con['db']['con_str'][2])

    if delete_index:  # delete the existing index if requested
        try:
            stamp = 0
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete', quiet)

    try:
        if settings_file:
            with open(settings_file, 'r') as f:
                settings_json = json.loads(f.read())
            es.create_index(index_name, settings=settings_json)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except Exception:
        echo('Index ' + index_name + ' already exists', quiet)

    echo('Using document type: ' + doc_type, quiet)
    es.put_mapping(index_name, doc_type, con['mapping'])

    parser_fun = None
    if parser is not None:  # load the parser function module
        parser_fun = import_module(PARSER_PATH + '.' + parser)

    documents = documents_from_file(es, collection, quiet, parser_fun, con)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
    print("end:" + time.strftime(ISOTIMEFORMAT, time.localtime()) +
          '\n all records import complete.')

def cli(index_name, delete_index, mapping_file, doc_type, import_file,
        delimiter, tab, host, docs_per_chunk, bytes_per_chunk, parallel,
        quiet, document_id_in_file):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv

    \b
    For a TSV file, note the tab delimiter option
      csv2es --index-name tomatoes --doc-type tomato --import-file tomatoes.tsv --tab

    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv --delimiter '|'
    """
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete', quiet)

    try:
        es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except IndexAlreadyExistsError:
        echo('Index ' + index_name + ' already exists', quiet)
    except ElasticHttpError as exception:
        echo('Error creating index %s. ElasticHttpError [%s]'
             % (index_name, exception.error), quiet)

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet,
                                    document_id_in_file)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)

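# A hedged illustration (not part of csv2es itself): a minimal mapping file
# that could be passed via --mapping-file for the 'potato' doc type above.
# The field names are assumptions, not taken from the original project.
#
# potatoes.mapping.json:
# {
#     "potato": {
#         "properties": {
#             "variety": {"type": "string", "index": "not_analyzed"},
#             "weight_g": {"type": "integer"}
#         }
#     }
# }
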
def init_schema():
    """Should be called at application startup. Makes sure the mappings and
    index exist."""
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    try:
        es.create_index(settings.ELASTIC_SEARCH_INDEX)
    except IndexAlreadyExistsError:
        pass

    # Does not replace if exact mapping already exists
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'reg_tree', {
        'reg_tree': {'properties': NODE_SEARCH_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'layer', {
        'layer': {'properties': LAYER_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'notice', {
        'notice': {'properties': LAYER_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'diff', {
        'diff': {'properties': DIFF_SCHEMA}
    })

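# Hedged illustration (assumed shape, not the project's actual schema): the
# *_SCHEMA constants used above are plain Elasticsearch property maps, e.g.
#
# NODE_SEARCH_SCHEMA = {
#     'text': {'type': 'string'},
#     'label': {'type': 'string', 'index': 'not_analyzed'},
#     'version': {'type': 'string', 'index': 'not_analyzed'},
# }
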
class ElasticConnector(Connector):
    """
    Class for connectors that operate with an elasticsearch database
    """
    MAX_SIZE = 1000

    def __init__(self, database, host='http://localhost:9200/'):
        self.client = ElasticSearch(host)
        self.index = database
        self.create_index()

    def query_to_id(self, query):
        """
        Returns id representation of a specified query

        This is a temporary method as a replacement of elasticsearch query search
        """
        return "_".join(str(k) + "_" + str(v)
                        for k, v in query.items()).replace("/", "_")

    def create_index(self):
        """
        Creates specified index or catches an exception if it has already
        been created
        """
        try:
            self.client.create_index(self.index)
        except Exception:
            pass

    def set_dynamic_mapping(self, collection):
        """
        Sets dynamic mapping for a specified document type
        """
        self.client.put_mapping(self.index, collection, {'dynamic': True})

    def save_block(self, block):
        """
        Saves operation info in a database
        """
        super().save_block(block)
        collection = block.get_collection()
        dictionary = block.to_dict()
        query = block.get_query()
        self.update_by_query(collection, query, block)

    def update_by_query(self, collection, query, document):
        """
        Sets dynamic mapping for a specified collection, then creates a new id
        for a document depending on query for it. Saves a new object in a
        database as a new one
        """
        try:
            self.set_dynamic_mapping(collection)
            document_id = document.get_id()
            document_body = document.to_dict()
            if "_id" in document_body.keys():
                del document_body['_id']
            self.client.index(self.index, collection, document_body,
                              id=self.query_to_id(query))
        except Exception as e:
            print(e)

    def find_last_block(self):
        """
        Finds last block index as a value field of a document in a status
        collection with specified id
        """
        try:
            document = self.client.get(self.index, 'status',
                                       'height_all_tsx')['_source']
            return document['value']
        except ElasticHttpNotFoundError:
            return 0

    def update_last_block(self, last_block):
        """
        Updates last block index as a value field of a document in a status
        collection with specified id
        """
        self.client.index(self.index, 'status', {'value': last_block},
                          id='height_all_tsx')

    def save_instance(self, instance):
        """
        Saves account or comment object
        """
        self.update_by_query(instance.get_collection(),
                             instance.get_query(), instance)

    def get_instances_to_update(self, collection):
        """
        Finds and returns all dictionaries with objects that should be updated
        """
        hits = self.client.search("need_update:true", index=self.index,
                                  doc_type=collection,
                                  size=self.MAX_SIZE)['hits']['hits']
        return [{**hit['_source'], **{"_id": hit["_id"]}} for hit in hits]

    def update_instances(self, collection, instances):
        """
        Resets need_update flag for all instances in a list by their ids in
        _id field
        """
        for instance in instances:
            self.client.update(self.index, collection, instance["_id"],
                               doc={'need_update': False})

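# A hedged usage sketch (the index name and height value are assumptions; the
# Connector base class comes from the surrounding project, so this only runs
# in that context):
# connector = ElasticConnector('blockchain_data')
# connector.update_last_block(12345)
# print(connector.find_last_block())   # -> 12345 once the document is indexed
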
s = ElasticSearch("http://localhost:9200")

if "init" in sys.argv:
    try:
        s.delete_index("flights")
    except Exception as e:
        print(e)
    try:
        s.create_index("flights")
    except Exception as e:
        print(e)
    else:
        print("Created flights")
    s.put_mapping("flights", "flight", simplejson.loads("""
    {"flight": {"properties": {
        "datum": {"type": "string", "index": "not_analyzed", "omit_norms": true, "index_options": "docs"},
        "type": {"type": "string", "index": "not_analyzed"},
        "duration": {"type": "double"},
        "end": {"properties": {
            "alt": {"type": "integer"},
            "dist": {"type": "float"},
            "speed": {"type": "integer"},
            "time": {"type": "date", "format": "dateOptionalTime"},
            "town": {"type": "string", "analyzer": "keyword"},
            "country": {"type": "string", "analyzer": "keyword"}}},
        "flight": {"type": "string", "store": true, "analyzer": "keyword"},
        "hex": {"type": "string", "store": true, "analyzer": "keyword"},
        "id": {"type": "string", "store": true},
        "radar": {"type": "string", "store": true, "analyzer": "keyword"},
        "reg": {"type": "string", "store": true, "analyzer": "keyword"},
        "route": {"properties": {
            "coordinates": {"type": "double"},
            "type": {"type": "string"}}},
        "start": {"properties": {
            "alt": {"type": "integer"},
            "dist": {"type": "float"},
            "speed": {"type": "integer"},
            "time": {"type": "date", "format": "dateOptionalTime"},
            "town": {"type": "string", "analyzer": "keyword"},
            "country": {"type": "string", "analyzer": "keyword"}}}}}}
    """))


def md(a):
    a["datum"] = a["starttime"][:10]
    return a


def makets(a):
    for f in ("starttime", "endtime"):
        a[f] = maket(a[f])
    return a

class ElasticSearchProvider(SearchProvider):
    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db

        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )

        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        self.debug = True

    def connect_to_db(self):
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None

        if page_prefix:
            return {'prefix': {'page_url': page_prefix}}
        else:
            return {'match_all': {}}

    def _assemble_outer_query(self, inner_query, filter_terms):
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{'term': filter_term} for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        filter_terms = []
        if key_id:
            filter_terms.append({'keys.id': key_id})
        if domain:
            filter_terms.append({'domain_id': domain.id})
        return filter_terms

    def gen_doc(self, review):
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise
        else:
            raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        action = {'index': {'_type': 'review'}}

        for i in range(0, reviews_count, batch_size):
            body_bits = []

            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)
                action['index']['_id'] = doc['page_id']
                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))

            # Yes, that trailing newline IS necessary
            body = '\n'.join(body_bits) + '\n'

            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )

        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10,
                                  domain=None, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })

                    reviews_count = hits.get('total', 0)
                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)
        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{'completed_date': {'order': 'desc'}},
                 {'violation_count': {'order': 'desc'}}]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10,
                                  page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })

                    reviews_count = hits.get('total', 0)
                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)
        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{'violation_count': {'order': 'desc'}},
                 {'completed_date': {'order': 'desc'}}]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    def get_index_settings(cls):
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    def get_index_mapping(cls):
        return {
            'review': {
                'properties': {
                    'keys': {'properties': {'id': {'type': 'integer'}}},
                    'uuid': {'type': 'string', 'index': 'not_analyzed'},
                    'completed_date': {'type': 'integer'},
                    'violation_count': {'type': 'float'},
                    'page_id': {'type': 'integer'},
                    'page_uuid': {'type': 'string', 'index': 'not_analyzed'},
                    'page_url': {'type': 'string', 'index': 'not_analyzed'},
                    'page_last_review_date': {'type': 'integer'},
                    'domain_id': {'type': 'integer'},
                    'domain_name': {'type': 'string', 'index': 'not_analyzed'}
                }
            }
        }

    def setup_index(self):
        try:
            settings = self.get_index_settings()
            self.syncES.create_index(index=self.index, settings=settings)
            mapping = self.get_index_mapping()
            self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
            logging.info('Index %s created.' % self.index)
        except Exception as e:
            raise e

    def delete_index(self):
        try:
            self.syncES.delete_index(index=self.index)
            logging.info('Index %s deleted.' % self.index)
        except Exception as e:
            raise e

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        if must_have_domain_name:
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {'field': 'domain_name'}
                        }
                    }
                }
            }
        else:
            inner_query = {'match_all': {}}

        query = {
            'query': inner_query,
            'sort': [{'page_id': {'order': 'desc'}}]
        }

        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        logging.info('Querying database...')
        self.connect_to_db()

        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]

        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))

            if not replace:
                query = query.filter(Page.id > max_page_id)

            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page))).scalar()

        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())

        logging.info('Indexing %d reviews...' % reviews_count)

        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        import sys

        parser = cls.argparser()
        args = parser.parse_args()

        config = {}
        host = None
        port = None
        index = None
        es = None

        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level),
                            format='%(levelname)s - %(message)s')

        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)

        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check its contents')
                sys.exit(1)

        if args.server:
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)

        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index

        from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError

        try:
            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()

            if args.keys or args.all_keys:
                if config is None:
                    logging.error('Need a config file to perform such operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace, batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)
        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host nor port! Use either -conf or --server')
            sys.exit(1)

"type": "float", "null_value": 0.0 }, "Runtime": { "type": "integer" }, "Type": { "type": "string", "index": "not_analyzed" }, "Rated": { "type": "string", "index": "not_analyzed" }, "imdbID": { "type": "string", "index": "not_analyzed" }, "metadata": { "type": "string", "index": "not_analyzed" }, "queue": { "type": "string", "index": "not_analyzed" } } } } print es.put_mapping("prime", "video", mapping)
input = len(sys.argv)
if input < 2:
    usage()
    sys.exit(1)
else:
    qname = sys.argv[1]

from pyelasticsearch import ElasticSearch
es = ElasticSearch(elasticsearch)

try:
    s = es.status('oplog')
except Exception:
    print("Creating index: oplog")
    try:
        s = es.create_index('oplog')
        print("sleeping for 5 to ensure index exists")
        time.sleep(5)
    except Exception:
        print("ERROR: index creation failed!")
        sys.exit()

print("Creating queue: %s" % qname)
try:
    es.put_mapping('oplog', qname, {"properties": {
        "from": {"type": "string", "null_value": "na"},
        "sent": {"type": "string", "null_value": "na"},
        "submitted": {"type": "date"},
        "subject": {"type": "string", "null_value": "na"},
        "message": {"type": "string", "null_value": "na"}
    }})
    print("Created queue with mapping:")
    print(es.get_mapping('oplog', qname))
except Exception:
    print("ERROR: queue creation failed!")

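# A hedged follow-up sketch (not in the original script): index one message
# into the queue just created; the field values below are illustrative only.
es.index('oplog', qname, {
    "from": "alice@example.com",
    "sent": "na",
    "submitted": "2014-01-01T12:00:00",
    "subject": "hello",
    "message": "first message in the queue",
})
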
"state": {"type" : "string"}, "country": {"type" : "string"}, "name": {"type" : "string"}, "description": {"type" : "string"}, "logo": {"type" : "string"}, "twitter": {"type" : "string"}, "station_site": {"type" : "string"}, "primary_genre": {"type" : "string", "index" : "not_analyzed"}, "frequency": {"type" : "string"}, "shoutcast_url": {"type" : "string"}, "location": {"type" : "geo_point"}, "geojson": {"type" : "string", "index" : "no"}, } } } es.put_mapping(INDEX_NAME, "station", STATION_MAPPING) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36' } failures = 0 pk = 0 while failures < 200: pk += 1 r = requests.get("http://www.iheart.com/a/live/station/%d/" % pk, headers=headers) if r.status_code != 200: if r.status_code > 500: print("[%d] %d" % (r.status_code, pk))
class Elastic(DataLayer):
    """ElasticSearch data layer."""

    serializers = {
        'integer': int,
        'datetime': parse_date
    }

    def init_app(self, app):
        app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/')
        app.config.setdefault('ELASTICSEARCH_INDEX', 'eve')
        self.es = ElasticSearch(app.config['ELASTICSEARCH_URL'])
        self.index = app.config['ELASTICSEARCH_INDEX']

    def _get_field_mapping(self, schema):
        """Get mapping for given field schema."""
        if schema['type'] == 'datetime':
            return {'type': 'date'}
        elif schema['type'] == 'string' and schema.get('unique'):
            return {'type': 'string', 'index': 'not_analyzed'}
        elif schema['type'] == 'string':
            return {'type': 'string'}

    def put_mapping(self, app):
        """Put mapping for elasticsearch for current schema.

        It's not called automatically now, but rather left for user to call
        it whenever it makes sense.
        """
        for resource, resource_config in app.config['DOMAIN'].items():
            properties = {}
            properties[config.DATE_CREATED] = self._get_field_mapping({'type': 'datetime'})
            properties[config.LAST_UPDATED] = self._get_field_mapping({'type': 'datetime'})

            for field, schema in resource_config['schema'].items():
                field_mapping = self._get_field_mapping(schema)
                if field_mapping:
                    properties[field] = field_mapping

            datasource = (resource, )  # TODO: config.SOURCES not available yet (self._datasource_ex(resource))
            mapping = {}
            mapping[datasource[0]] = {'properties': properties}
            self.es.put_mapping(self.index, datasource[0], mapping)

    def find(self, resource, req, sub_resource_lookup):
        """
        TODO: implement sub_resource_lookup
        """
        query = {
            'query': {
                'query_string': {
                    'query': request.args.get('q', '*'),
                    'default_field': request.args.get('df', '_all'),
                    'default_operator': 'AND'
                }
            }
        }

        if not req.sort and self._default_sort(resource):
            req.sort = self._default_sort(resource)

        # skip sorting when there is a query to use score
        if req.sort and 'q' not in request.args:
            query['sort'] = []
            sort = ast.literal_eval(req.sort)
            for (key, sortdir) in sort:
                sort_dict = dict([(key, 'asc' if sortdir > 0 else 'desc')])
                query['sort'].append(sort_dict)

        if req.where:
            where = json.loads(req.where)
            if where:
                query['filter'] = {'term': where}

        if req.max_results:
            query['size'] = req.max_results

        if req.page > 1:
            query['from'] = (req.page - 1) * req.max_results

        source_config = config.SOURCES[resource]
        if 'facets' in source_config:
            query['facets'] = source_config['facets']

        try:
            args = self._es_args(resource)
            args['es_fields'] = self._fields(resource)
            return self._parse_hits(self.es.search(query, **args), resource)
        except es_exceptions.ElasticHttpError:
            return ElasticCursor()

    def find_one(self, resource, **lookup):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)

        if config.ID_FIELD in lookup:
            try:
                hit = self.es.get(id=lookup[config.ID_FIELD], **args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

            if not hit['exists']:
                return

            doc = hit.get('fields', hit.get('_source', {}))
            doc['_id'] = hit.get('_id')
            convert_dates(doc, self._dates(resource))
            return doc
        else:
            query = {
                'query': {
                    'constant_score': {
                        'filter': {'term': lookup}
                    }
                }
            }

            try:
                args['size'] = 1
                docs = self._parse_hits(self.es.search(query, **args), resource)
                return docs.first()
            except es_exceptions.ElasticHttpNotFoundError:
                return None

    def find_list_of_ids(self, resource, ids, client_projection=None):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)
        return self._parse_hits(self.es.multi_get(ids, **args), resource)

    def insert(self, resource, doc_or_docs, **kwargs):
        ids = []
        kwargs.update(self._es_args(resource))
        for doc in doc_or_docs:
            doc.update(self.es.index(doc=doc, id=doc.get('_id'), **kwargs))
            ids.append(doc['_id'])
        self.es.refresh(self.index)
        return ids

    def update(self, resource, id_, updates):
        args = self._es_args(resource, refresh=True)
        return self.es.update(id=id_, doc=updates, **args)

    def replace(self, resource, id_, document):
        args = self._es_args(resource, refresh=True)
        args['overwrite_existing'] = True
        return self.es.index(document=document, id=id_, **args)

    def remove(self, resource, id_=None):
        args = self._es_args(resource, refresh=True)
        if id_:
            return self.es.delete(id=id_, **args)
        else:
            try:
                return self.es.delete_all(**args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

    def _parse_hits(self, hits, resource):
        """Parse hits response into documents."""
        return ElasticCursor(hits, self._dates(resource))

    def _es_args(self, resource, refresh=None):
        """Get index and doctype args."""
        datasource = self._datasource(resource)
        args = {
            'index': self.index,
            'doc_type': datasource[0],
        }
        if refresh:
            args['refresh'] = refresh
        return args

    def _fields(self, resource):
        """Get projection fields for given resource."""
        datasource = self._datasource(resource)
        keys = datasource[2].keys()
        return ','.join(keys)

    def _default_sort(self, resource):
        datasource = self._datasource(resource)
        return datasource[3]

    def _dates(self, resource):
        dates = [config.LAST_UPDATED, config.DATE_CREATED]
        datasource = self._datasource(resource)
        schema = config.DOMAIN[datasource[0]]['schema']
        for field, field_schema in schema.items():
            if field_schema['type'] == 'datetime':
                dates.append(field)
        return dates

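# A hedged usage sketch (assumed wiring, not from the original module): with
# Eve the data layer class is passed at app construction, and the mapping
# call is made manually inside an application context.
# app = Eve(data=Elastic)
# with app.app_context():
#     app.data.put_mapping(app)
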
import os
import sys

from pyelasticsearch import ElasticSearch

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "timesketch.settings")

from django.contrib.auth.models import User
from timesketch.apps.sketch.models import Timeline

user = User.objects.get(id=2)
es_server = sys.argv[1]
es_port = sys.argv[2]
name = sys.argv[3]
index = sys.argv[4]

es = ElasticSearch("http://%s:%s" % (es_server, es_port))

mapping = {
    "plaso_event": {
        u'properties': {
            u'timesketch_label': {"type": "nested"}
        }
    },
}

es.put_mapping(index, "plaso_event", mapping)

timeline = Timeline.objects.create(owner=user, title=name, description=name,
                                   datastore_index=index)
timeline.make_public()

class IbbdElasticSearch:
    """
    ES operations.
    Docs: http://pyelasticsearch.readthedocs.io/en/latest/
    """
    es = None
    config = {}
    mapping_is_set = False  # whether the es mapping has already been set

    def __init__(self, config):
        """
        ES initialization.

        Config keys:
            host: es connection string
            indexName: name of the index
            deleteIndex: whether to delete an existing index; default False
            settings: index settings. See the es docs for the details.
            settingsFile: index settings as a json file. See the es docs.
            mappings: mappings configuration. See the es docs for the details.
            mappingsFile: mappings configuration as a json file. See the es docs.
            idField: id field. Some records carry their own id field.

        Note: at most one of settings and settingsFile may be given,
        and likewise at most one of mappings and mappingsFile.
        """
        self.es = ElasticSearch(config['host'])

        if 'docType' not in config:
            config['docType'] = config['indexName']
        self.config = config

        if 'deleteIndex' in config and config['deleteIndex']:
            try:
                self.es.delete_index(config['indexName'])
                print('delete index ' + config['indexName'] + ' success!')
            except ElasticHttpNotFoundError:
                # if the index does not exist yet, just say so
                print('Index ' + config['indexName']
                      + ' not found, nothing to delete!')
            except Exception:
                raise Exception('Index ' + config['indexName'] + ' delete error!')

        try:
            if 'settings' in config:
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            elif 'settingsFile' in config:
                with open(config['settingsFile'], 'r') as f:
                    config['settings'] = json.loads(f.read())
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            else:
                self.es.create_index(config['indexName'])

            print('create index ' + config['indexName'] + ' success!')
        except Exception:
            raise Exception("create index " + config['indexName'] + ' error!')

    def _putMapping(self, row):
        """
        Set the es mapping.
        A default configuration can be generated from row.
        """
        try:
            if 'mappingsFile' in self.config:
                with open(self.config['mappingsFile'], 'r') as f:
                    self.config['mappings'] = json.loads(f.read())

            if 'mappings' in self.config:
                self.es.put_mapping(self.config['indexName'],
                                    self.config['docType'],
                                    self.config['mappings'])
                print("put mapping " + self.config['indexName'] + ' success!')
        except Exception:
            raise Exception("put mapping " + self.config['indexName'] + ' error!')

    def read(self):
        pass

    def batchRead(self):
        pass

    def write(self, row):
        """Write a single record."""
        return self.batchWrite([row])

    def batchWrite(self, rows):
        """Write multiple records."""
        if not self.mapping_is_set:  # set the mapping once, before the first write
            self.mapping_is_set = True
            self._putMapping(rows[0])

        if 'idField' in self.config:
            docs = (self.es.index_op(doc, id=doc.pop(self.config['idField']))
                    for doc in rows)
        else:
            docs = (self.es.index_op(doc) for doc in rows)

        self.es.bulk(docs,
                     index=self.config['indexName'],
                     doc_type=self.config['docType'])
        return True

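# A hedged usage sketch (the config values below are illustrative assumptions,
# not taken from the original source): build the writer, which creates the
# index on construction, then bulk-write a couple of rows.
if __name__ == '__main__':
    writer = IbbdElasticSearch({
        'host': 'http://localhost:9200/',
        'indexName': 'demo_records',   # hypothetical index name
        'deleteIndex': True,
        'idField': 'id',
    })
    writer.batchWrite([{'id': 1, 'title': 'first'}, {'id': 2, 'title': 'second'}])
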
class LBRest():

    def __init__(self, base=None, idx_exp_url=None, txt_mapping=None, cfg_idx=None):
        """Used to consume the LBG and ES."""
        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            self.idx_exp_host = idx_exp_url.split('/')[2]
            self.idx_exp_index = idx_exp_url.split('/')[3]
            self.idx_exp_type = idx_exp_url.split('/')[4]
            self.es = ElasticSearch("http://" + self.idx_exp_host)
        self.txt_mapping = txt_mapping
        self.cfg_idx = cfg_idx
        self.con_refsd = False

    def get_index(self, bases_list):
        """Get the indexing configuration for the bases."""
        bases_indexes = []
        for base in bases_list:
            idx_exp_url = base['metadata']['idx_exp_url']
            nm_idx = idx_exp_url.split('/')[3]
            url_txt_idx = config.REST_URL + "/_txt_idx/" + nm_idx

            req = None
            try:
                req = requests.get(url_txt_idx)
                req.raise_for_status()
                idx_resp = req.json()
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:
                    # NOTE: For the cases where there is no indexing
                    # configuration set on the "_txt_idx" route! By Questor
                    idx_resp = None
                else:
                    fail_content = None
                    if req is not None:
                        fail_content = req._content
                    else:
                        fail_content = str(e)
                    logger.error("Falha HTTP ao tentar obter configuração de "
                                 "índice textual! URL: %s. FALHA: %s"
                                 % (config.REST_URL, fail_content))
                    return []
            except Exception as e:
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error("Erro ao tentar obter a configuração de índice "
                             "textual! URL: %s. FALHA: %s"
                             % (config.REST_URL, fail_content))
                return []

            bases_indexes.append({"base": base, "index": idx_resp})
        return bases_indexes

    def get_bases(self):
        """Get all bases which have to index registries."""
        # NOTE: The construct below is a bit clumsy. The goal is to check
        # whether the data structure of the "lb_base" table already has the
        # attribute (struct field) and the "txt_mapping" field. If it does
        # not, it tries to fetch the base with all fields. It is a
        # workaround; the right thing is for the data structure in the
        # "lb_base" table to be up to date! By Questor
        bases = []
        req = None
        try:
            params = """{
                "select": [
                    "name",
                    "idx_exp_time",
                    "idx_exp_url",
                    "txt_mapping"
                ],
                "literal": "idx_exp is true",
                "limit": null
            }"""
            req = requests.get(config.REST_URL, params={'$$': params})
            if config.FORCE_INDEX == True:
                data = []
                results = dict({
                    u'metadata': {
                        u'idx_exp_url': u'' + config.ES_URL + '',
                        u'name': u'' + config.NM_BASE + '',
                        u'idx_exp_time': u'' + config.TIME_IDX + ''
                    }
                })
                data.append(results)
                bases = data
            else:
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
        except Exception as e:
            bases = []
            req = None
            try:
                params = """{
                    "literal": "idx_exp is true",
                    "limit": null
                }"""
                req = requests.get(config.REST_URL, params={'$$': params})
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
            except Exception as e:
                # NOTE: The instance variable "self.con_refsd" keeps the
                # warning below from being printed repeatedly and flooding
                # the log! By Questor
                if self.con_refsd:
                    return bases

                # NOTE: I am using '"Connection refused" in str(e)' because
                # "raise_for_status()" above does not return an exception of
                # type "requests.exceptions.HTTPError", so we cannot use the
                # code in "status_code" to handle the error more
                # specifically! By Questor
                if "Connection refused" in str(e) and not self.con_refsd:
                    logger.error('Erro ao obter a lista bases para '
                                 'indexação. URL: %s. FALHA: Servidor indisponivel! '
                                 'HTTPCode: 502 (Connection refused)!'
                                 % (config.REST_URL))
                    self.con_refsd = True
                    return bases

                self.con_refsd = False
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error(
                    ("Erro ao obter a lista bases para indexação. "
                     "URL: %s. FALHA: %s") % (config.REST_URL, fail_content))
        return bases

    def get_passed_registries(self):
        """Return records from the indexing-error log base.
        Only "id_doc_orig" and "dt_last_up_orig".
        """
        # NOTE: Creates the log base if it does not exist! By Questor
        self.create_log_base()

        registries = []
        params = {'$$': """{
            "select":["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
        }""" % self.base}
        url = config.REST_URL + '/log_lbindex/doc'

        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
            1 Erro ao recuperar registros da base %s'. FALHA: %s
            """ % ('log_lbindex', fail_content))

        resp = {}
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp

    def get_registries(self):
        """Return records to be indexed that, under certain criteria, have
        not failed in the past.
        """
        # NOTE: Gets records from the error-log base, i.e. records that
        # failed to index in the past! By Questor
        registries = []
        if config.FORCE_INDEX:
            params = {'$$': '{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {
                '$$': '{"select":["id_doc", "dt_last_up"], \
                "literal":"dt_idx is null", "limit": %d}'
            }
        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT

        url = config.REST_URL + '/' + self.base + '/doc'
        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
            Erro ao recuperar registros da base %s'. FALHA: %s
            """ % (self.base, fail_content))

        ''' TODO: This logic could be more efficient... At first sight there
        are two solutions... 1 - Keep a cache (more complicated); 2 - Fetch
        only the records (id_doc) involved in the current indexing run.
        By Questor '''

        ''' TODO: Should this method "self.get_passed_registries()" always be
        called? Even when the operation is "create"? Check this more
        carefully... By Questor '''

        # NOTE: Gets records from the error-log base, i.e. records that
        # failed to index in the past! By Questor
        passed = self.get_passed_registries()

        _registries = []
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                ''' NOTE: The goal here is to check whether the record is in
                the error log (records we tried to index in the past) and, if
                so, skip it, unless the record's "update" date stored in the
                log base differs from the record's current date; in that case
                LBIndex will try again! By Questor '''

                ''' NOTE: The dict "passed" only holds the value of the
                "dt_last_up_orig" field from the "log_lbindex" base!
                By Questor '''
                dt_last_up = passed[reg['_metadata']['id_doc']]
                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)

        return _registries

    def get_full_reg(self, id, dt_last_up):
        """Get the doc record plus the text extracted from attached files,
        if any.
        """
        # TODO: Log this action every single time? By Questor
        logger.info('Recuperando registro %s da base %s ...'
                    % (str(id), self.base))

        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'
        req = None
        try:
            req = requests.get(url)
            req.raise_for_status()
            response = req.json()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = """
            Erro ao recuperar registro %s na base %s'. FALHA: %s
            """ % (str(id), self.base, fail_content)

            # TODO: Why two calls to the logs? By Questor
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)

        return response

    def es_create_mapping(self):
        """Create a mapping for a base if there is configuration for it."""
        response_0 = None
        response_0_json = None
        index_url = None

        try:
            index_url = ("http://" + self.idx_exp_host + "/"
                         + self.idx_exp_index + "/" + self.idx_exp_type)
            response_0 = requests.get(index_url + "/_mapping")
            response_0.raise_for_status()
            response_0_json = response_0.json()
        except requests.exceptions.HTTPError as e:
            # NOTE: We normally end up in this code block when the index
            # does not exist! By Questor
            self.es_create_index()
        except requests.exceptions.RequestException as e:
            raise Exception("Problem in the mapping provider! " + str(e))
        except Exception as e:
            raise Exception("Mapping operation. Program error! " + str(e))

        if (response_0.status_code == 200 and not response_0_json
                and (self.txt_mapping is not None and self.txt_mapping)):
            response_1 = None
            try:
                response_1 = self.es.put_mapping(
                    index=self.idx_exp_index,
                    doc_type=self.idx_exp_type,
                    mapping=self.txt_mapping)
                if (response_1 is None or
                        response_1.get("acknowledged", None) is None or
                        response_1.get("acknowledged", None) != True):
                    raise Exception("Retorno inesperado do servidor \
ao criar mapping! " + str(response_1))
            except Exception as e:
                raise Exception("Mapping creation error! " + str(e))

    def es_create_index(self):
        """Create an index for the base with the configured settings; when
        there is none, create a generic index.
        """
        response_0 = None
        try:
            cfg_idx_holder = None

            # NOTE: If no indexing configuration is set, the system creates
            # a default one! By Questor
            if self.cfg_idx is not None and self.cfg_idx:
                cfg_idx_holder = self.cfg_idx
            else:
                cfg_idx_holder = {
                    "settings": {
                        "analysis": {
                            "analyzer": {
                                "default": {
                                    "tokenizer": "standard",
                                    "filter": [
                                        "lowercase",
                                        "asciifolding"
                                    ]
                                }
                            }
                        }
                    }
                }

            response_0 = self.es.create_index(index=self.idx_exp_index,
                                              settings=cfg_idx_holder)
            if (response_0 is None or
                    response_0.get("acknowledged", None) is None or
                    response_0.get("acknowledged", None) != True):
                raise Exception("Retorno inesperado do servidor \
ao criar index! " + str(response_0))

            self.es_create_mapping()
        except IndexAlreadyExistsError as e:
            self.es_create_mapping()
        except Exception as e:
            raise Exception("Index creation error! " + str(e))

    def index_member(self, registry, id, dt_last_up):
        """Create the text index for each record."""
        logger.info('Indexando registro %s da base %s na url %s ...'
                    % (str(id), self.base, self.idx_exp_url))
        try:
            # NOTE: Handles and creates the mappings and text indexes!
            # By Questor
            self.es_create_mapping()
            self.es.index(self.idx_exp_index, self.idx_exp_type, registry, id=id)
            return True
        except Exception as e:
            error_msg = ("Erro ao indexar registro %s da base %s na url %s'. "
                         "Mensagem de erro: %s") % (
                str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)

            # TODO: Why two logs? By Questor
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        """Update the text-indexing date of the record."""
        logger.info('Alterando data de indexacao do '
                    'registro %s da base %s ...' % (str(id), self.base))
        params = {'value': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')}
        url = (config.REST_URL + '/' + self.base + '/doc/' + str(id)
               + '/_metadata/dt_idx')

        req = None
        try:
            req = requests.put(url, params=params)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = 'Erro ao alterar data de indexacao do registro %s na '\
                'base %s. FALHA: %s' % (str(id), self.base, fail_content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
            return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """Write errors to LightBase."""
        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}

        req = None
        try:
            req = requests.post(url, data=data)
            req.raise_for_status()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
            0 Erro ao tentar escrever erro no Lightbase. FALHA: %s
            """ % fail_content)

    def get_errors(self):
        """Get all bases which have to index registries."""
        errors = []
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'

        req = None
        try:
            req = requests.get(url, params={'$$': params})
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
            Erro ao tentar recuperar erros de indice. URL: %s. FALHA: %s
            """ % (url, fail_content))
        return errors

    # TODO: This method exists to create/update a default indexing (index)
    # configuration. Right now it is "subverted", since deleting the index
    # is enough for it to be recreated with the indexing set on the
    # "_txt_idx" route! I do not think this method makes much sense here;
    # I suggest removing it! By Questor
    def create_index(self):
        """Create the index with the default mapping options.
        Update the index if it has already been created.
        """
        settings = {
            "settings": {
                "analysis": {
                    "analyzer": {
                        "default": {
                            "tokenizer": "standard",
                            "filter": [
                                "lowercase",
                                "asciifolding"
                            ]
                        }
                    }
                }
            }
        }

        http, space, address, _index, _type = self.idx_exp_url.split('/')
        try:
            result = self.es.create_index(index=_index, settings=settings)
        except IndexAlreadyExistsError as e:
            logger.info("O índice já existe. Tentando atualizar o mapping...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(index=_index, settings=settings)
            logger.info("Mapping atualizado com sucesso. Abrindo o índice...")
            self.es.open_index(index=_index)
            logger.info("Índice reaberto com sucesso!")

    def delete_index(self, registry):
        """Delete records in the index."""
        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True
        except ElasticHttpNotFoundError as e:
            return True
        except Exception as e:
            error_msg = 'Erro ao deletar indice %s da base %s na url %s. '\
                'Mensagem de erro: %s' % \
                (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        """Delete error records on the '_index_error' route."""
        url = (config.REST_URL +
               """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}""")
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deletando registro de erro de indice na url %s' % url)

        req = None
        try:
            req = requests.delete(url)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = """
            Erro ao deletar erro de indice. FALHA: %s
            """ % (fail_content)
            logger.error(error_msg)
            return False

    @staticmethod
    def create_log_base():
        """Create the LBIndex log base if it does not exist."""
        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:
            # NOTE: Create the base, since it does not exist!
            logger.info("Criando base de log do índice...")
            result = log_base.create_base()
            if result is None:
                logger.error("Erro na criação da base de log: \n%s",
                             response.text)
                return False
            else:
                logger.info("Base de log criada com sucesso!")
        return True

"type": "string" } }, "_timestamp": { "enabled": True }, "_parent": { "type": "station" }, "_ttl": { "enabled": True, "default": "7d" } } } es.put_mapping(settings.ES_INDEX, 'play', play_mapping) query = { "query": { "filtered": { "query": { "match_all": {} }, "filter": { "bool": { "must": {}, "should": {}, "must_not": { "missing": { "field": "shoutcast_url", "existence": True,
from pyelasticsearch import ElasticSearch

es = ElasticSearch("http://localhost:9200/")

mapping = {
    "video": {
        "properties": {
            "title": {"type": "string"},
            "year": {"type": "integer"},
            "image_url": {"type": "string", "index": "not_analyzed"},
            "amazon_url": {"type": "string", "index": "not_analyzed"},
            "Genre": {"type": "string", "index": "not_analyzed"},
            "Metascore": {"type": "float", "null_value": 0.0},
            "imdbRating": {"type": "float", "null_value": 0.0},
            "Runtime": {"type": "integer"},
            "Type": {"type": "string", "index": "not_analyzed"},
            "Rated": {"type": "string", "index": "not_analyzed"},
            "imdbID": {"type": "string", "index": "not_analyzed"},
            "metadata": {"type": "string", "index": "not_analyzed"},
            "queue": {"type": "string", "index": "not_analyzed"}
        }
    }
}

print(es.put_mapping("prime", "video", mapping))

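# A hedged follow-up sketch (the document and id below are illustrative, not
# from the original script): index one video into the 'prime' index so the
# mapping above has something to apply to, then refresh.
es.index("prime", "video", {
    "title": "Example Movie",
    "year": 2014,
    "imdbRating": 7.5,
    "Type": "movie",
}, id="tt0000001")
es.refresh("prime")
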
class SearchIndex(object):

    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {
                        "type": "geo_point"
                    },
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
            ], doc_type=doc_type, index=index)

    def search(self, index, question, longitude, latitude, size=10):
        # self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass

        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                }}}
                            ]
                        }
                    },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1, "offset": 0.1}}},
                    ]
                }
            }
        }

        if longitude and longitude is not None:
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude},
                                 "offset": "550m", "scale": "1km"}
                }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude},
                                 "offset": "500m", "scale": "2km"}
                }},
            ]

        results = self.es.search(query, index=index, size=size)
        self.es.refresh()
        return results

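# A hedged usage sketch (the Django model name 'Place', index name, and
# coordinates are illustrative assumptions): build the index from the model
# and run a geo-weighted query.
# index = SearchIndex(Place)
# results = index.search("coffee", longitude=-0.12, latitude=51.5, size=5)
# for hit in results['hits']['hits']:
#     print(hit['_source']['name'])
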
"itunesTrackId": {"type": "long"}, "song_spot": {"type": "string"} }, "_timestamp": { "enabled": True }, "_parent": { "type": "station" }, "_ttl": { "enabled": True, "default" : "7d" } } } es.put_mapping(settings.ES_INDEX, 'play', play_mapping) query = { "query":{ "filtered":{ "query":{ "match_all":{} }, "filter":{ "bool":{ "must":{}, "should":{}, "must_not":{ "missing":{ "field": "shoutcast_url", "existence": True,