def delete(config, tree_names, all, force):
    """Delete indices and their catalog entries.

    This deletes the indices that have the format version of the copy of DXR
    this runs under.
    """
    es = ElasticSearch(config.es_hosts)
    if all:
        echo('Deleting catalog...')
        es.delete_index(config.es_catalog_index)
        # TODO: Delete tree indices as well.
    else:
        for tree_name in tree_names:
            frozen_id = '%s/%s' % (FORMAT, tree_name)
            try:
                frozen = es.get(config.es_catalog_index, TREE, frozen_id)
            except ElasticHttpNotFoundError:
                raise ClickException('No tree "%s" in catalog.' % tree_name)
            # Delete the index first. That way, if that fails, we can still
            # try again; we won't have lost the catalog entry. Refresh is
            # infrequent enough that we wouldn't avoid a race around a
            # catalogued but deleted instance the other way around.
            try:
                es.delete_index(frozen['_source']['es_alias'])
            except ElasticHttpNotFoundError:
                # It's already gone. Fine. Just remove the catalog entry.
                pass
            es.delete(config.es_catalog_index, TREE, frozen_id)
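
# --- Hedged usage sketch (not part of the original source) -----------------
# Minimal wiring for the `delete` command above, assuming pyelasticsearch and
# click (as the ElasticHttpNotFoundError/ClickException names suggest).
# `FORMAT`, `TREE`, and the config attributes below are placeholders that
# mirror what the function expects; the real DXR constants may differ.
from click import ClickException, echo
from pyelasticsearch import ElasticSearch, ElasticHttpNotFoundError

FORMAT = '17'   # hypothetical index-format version
TREE = 'tree'   # hypothetical catalog doc type


class _Config(object):
    es_hosts = ['http://localhost:9200/']
    es_catalog_index = 'dxr_catalog'

# Example invocation (commented out so this sketch stays import-safe):
# delete(_Config(), ['mozilla-central'], all=False, force=True)
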
def updateWithPhone(messageid, phone, lat, lon):
    es_client = ElasticSearch("http://ec2-54-219-169-37.us-west-1.compute.amazonaws.com:9200")
    producer1 = mark4deletionByLatLon(es_client, 'messages', 'myMessages', lat, lon)
    markid = producer1.marking()
    # If the rider/driver is still available, remove the corresponding
    # initiating driver/rider and confirm.
    if markid != 1:
        producer2 = updateByMessageid(es_client, 'messages', 'myMessages', messageid)
        success = producer2.updating()
        if success == 1:
            message = "You have been selected by other driver/rider."
        else:
            es_client.delete('messages', 'myMessages', markid)
            message = "Both driver and rider have been confirmed! Please contact " + phone
    else:
        message = "The driver/rider isn't available anymore. Please select again!"
    return render_template("confirm.html", message=message)
        'query': {
            "bool": {
                "must": [{
                    "match_phrase": {
                        "art_date": r['_source']['art_date']
                    }
                }, {
                    "match": {
                        "art_name_press_source": r['_source']['art_name_press_source']
                    }
                }],
                "must_not": {
                    "match": {
                        "_id": r['_id']
                    }
                }
            }
        }
    }
    partialResult = es.search(query2, size=paginationSize, index=index)
    # print partialResult['hits']['total']
    for pr in partialResult['hits']['hits']:
        try:
            es.delete(index, index, pr['_id'])
        except:
            pass
    # es.delete(index, index, r['_id'])
    # print result['hits']['total']
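
# --- Hedged usage sketch (not part of the original source) -----------------
# One way the fragment above could be driven: iterate over an initial result
# set and, for each hit `r`, build `query2` as shown above and delete the
# matching duplicates. `remove_duplicates_of` is a hypothetical wrapper around
# that fragment; the index name and page size are assumptions.
from pyelasticsearch import ElasticSearch

es = ElasticSearch('http://localhost:9200/')
index = 'articles'        # assumed index name
paginationSize = 100      # assumed page size


def remove_duplicates_of(r):
    # Body would be the query2 construction and delete loop shown above.
    pass


result = es.search({'query': {'match_all': {}}}, index=index, size=paginationSize)
for r in result['hits']['hits']:
    remove_duplicates_of(r)
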
class ElasticSearch(object):
    conn = None
    url = settings.ELASTICSEARCH_URL
    index_name = settings.ELASTICSEARCH_INDEX_NAME
    stdout = None
    stderr = None

    def __init__(self, index_name=None, stdout=None, stderr=None):
        self.conn = PyElasticSearch()
        if index_name:
            self.index_name = index_name
        if stdout:
            self.stdout = stdout
        if stderr:
            self.stderr = stderr

    def create_index(self, delete=True):
        if delete:
            try:
                self.conn.delete_index(self.index_name)
            except ElasticHttpNotFoundError:
                pass
        mappings = dict(
            (k, v) for k, v in get_elasticsearch_properties().items())
        self.conn.create_index(self.index_name, settings={'mappings': mappings})

    def index_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.index_activity(activity)

    def delete_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.delete_activity(activity)

    def index_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = self.get_activity_documents(activity, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk(
                        (self.conn.index_op(doc,
                                            id=doc.pop('id'),
                                            parent=doc.pop('_parent', None))
                         for doc in docs),
                        index=self.index_name,
                        doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_investor(self, investor):
        for doc_type in DOC_TYPES_INVESTOR:
            docs = self.get_investor_documents(investor, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk(
                        (self.conn.index_op(doc, id=doc.pop('id'))
                         for doc in docs),
                        index=self.index_name,
                        doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_activity_documents(self, activity_identifiers=[]):
        activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter(
            fk_status__in=(
                HistoricalActivity.STATUS_ACTIVE,
                HistoricalActivity.STATUS_PENDING,
                HistoricalActivity.STATUS_OVERWRITTEN,
                HistoricalActivity.STATUS_DELETED)).distinct().values_list(
            'activity_identifier', flat=True).distinct()
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i deals...' % (doc_type, len(activity_identifiers)))
            for activity_identifier in activity_identifiers:
                for activity in self.get_activity_versions(activity_identifier):
                    docs.extend(
                        self.get_activity_documents(activity, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' % (len(docs), doc_type))
            if len(docs) > 0:
                paginator = Paginator(docs, 1000)
                for page in paginator.page_range:
                    try:
                        self.conn.bulk(
                            (self.conn.index_op(doc,
                                                id=doc.pop('id'),
                                                parent=doc.pop('_parent', None))
                             for doc in paginator.page(page)),
                            index=self.index_name,
                            doc_type=doc_type)
                    except BulkError as e:
                        for error in e.errors:
                            msg = '%s: %s on ID %s' % (
                                error['index']['error']['type'],
                                error['index']['error']['reason'],
                                error['index']['_id'])
                            if 'caused_by' in error['index']['error']:
                                msg += ' (%s: %s)' % (
                                    error['index']['error']['caused_by']['type'],
                                    error['index']['error']['caused_by']['reason'])
                            self.stderr and self.stderr.write(msg)
        self.conn.refresh()

    def index_investor_documents(self):
        investors = Investor.objects.public().order_by(
            'investor_identifier', '-id').distinct('investor_identifier')
        for doc_type in DOC_TYPES_INVESTOR:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i investors...' % (doc_type, investors.count()))
            for investor in investors:
                docs.extend(
                    self.get_investor_documents(investor, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' % (len(docs), doc_type))
            if len(docs) > 0:
                try:
                    self.conn.bulk(
                        (self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
                        index=self.index_name,
                        doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    #def index_activity_by_version(self, activity_identifier):
    #    for doc_type in get_elasticsearch_properties().keys():
    #        docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type)
    #        if len(docs) > 0:
    #            try:
    #                self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
    #                               index=self.index_name,
    #                               doc_type=doc_type)
    #            except BulkError as e:
    #                for error in e.errors:
    #                    stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % (
    #                        error['index']['error']['type'],
    #                        error['index']['error']['reason'],
    #                        error['index']['error']['caused_by']['type'],
    #                        error['index']['error']['caused_by']['reason'],
    #                        error['index']['_id']
    #                    ))

    def get_activity_versions(self, activity_identifier):
        versions = []
        # Get the newest non-pending, readable historic version:
        try:
            newest = HistoricalActivity.objects.filter(
                activity_identifier=activity_identifier,
                fk_status__in=(
                    HistoricalActivity.STATUS_ACTIVE,
                    HistoricalActivity.STATUS_OVERWRITTEN,
                    HistoricalActivity.STATUS_DELETED)).distinct().latest()
            if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED:
                versions.append(newest)
        except HistoricalActivity.DoesNotExist:
            newest = None
        # Get newer pendings
        pendings = HistoricalActivity.objects.filter(
            activity_identifier=activity_identifier,
            fk_status_id=HistoricalActivity.STATUS_PENDING).distinct()
        if newest:
            pendings = pendings.filter(history_date__gt=newest.history_date)
        versions.extend(pendings)
        return versions

    def get_activity_documents(self, activity, doc_type='deal'):
        docs = []
        deal_attrs = {
            'id': activity.id,
            'activity_identifier': activity.activity_identifier,
            'historical_activity_id': activity.id,
            'status': activity.fk_status_id,
        }

        # Todo: Is there a nice way to prevent this extra Activity query?
        # e.g. if we save is_public/deal_scope as ActivityAttributes
        public_activity = Activity.objects.filter(
            activity_identifier=activity.activity_identifier).order_by(
            '-id').first()
        if public_activity:
            deal_attrs.update({
                'is_public': public_activity.is_public,
                'deal_scope': public_activity.deal_scope,
                'deal_size': public_activity.deal_size,
                'current_negotiation_status': public_activity.negotiation_status,
                'top_investors': public_activity.top_investors,
                'fully_updated_date': public_activity.fully_updated_date,
            })
        else:
            # Fixme: This should not happen
            self.stderr and self.stderr.write(
                _('Missing activity for historical activity %i (Activity identifier: #%i)' % (
                    activity.id, activity.activity_identifier)))
        #except Activity.MultipleObjectsReturned:
        #    # Fixme: This should not happen
        #    self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % (
        #        activity.id,
        #        activity.activity_identifier
        #    )))

        for a in activity.attributes.select_related('fk_group__name').order_by(
                'fk_group__name'):
            # Do not include the django object id
            if a.name == 'id':
                continue
            attribute = None
            attribute_key = '%s_attr' % a.name
            if attribute_key in get_elasticsearch_properties()['deal']['properties'].keys():
                attribute = {
                    'value': a.value,
                    'value2': a.value2,
                    'date': a.date,
                    'is_current': a.is_current,
                }
            value = a.value
            # Area field?
            if a.name and 'area' in a.name and a.polygon is not None:
                # Get polygon
                #value = json.loads(a.polygon.json)
                # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work
                #value['type'] = 'multipolygon'
                value = a.polygon.json or ''
            # Do not include empty values
            if value is None or value == '':
                continue

            # Doc types: location, data_source or contract
            group_match = a.fk_group and a.fk_group.name or ''
            group_match = re.match(
                '(?P<doc_type>location|data_source|contract)_(?P<count>\d+)',
                group_match)
            if group_match:
                dt, count = group_match.groupdict()['doc_type'], int(
                    group_match.groupdict()['count'])
                if doc_type == dt:
                    while len(docs) < count:
                        docs.append({
                            '_parent': activity.activity_identifier,
                            'id': a.id,  #'%i_%i' % (a.id, count),
                        })
                    docs[count - 1][a.name] = [value, ]
                # Set doc type counter within deal doc type
                # (for location/data_source/contract)
                elif doc_type == 'deal':
                    # Set counter
                    key = '%s_count' % dt
                    if key not in deal_attrs.keys():
                        deal_attrs[key] = count
                    elif deal_attrs[key] < count:
                        deal_attrs[key] = count
                    # Create list with correct length to ensure formset values
                    # have the same index
                    if not a.name in deal_attrs:
                        deal_attrs[a.name] = [''] * count
                        if attribute:
                            deal_attrs[attribute_key] = [''] * count
                    else:
                        while len(deal_attrs[a.name]) < count:
                            deal_attrs[a.name].append('')
                            if attribute:
                                deal_attrs[attribute_key].append('')
                    deal_attrs[a.name][count - 1] = value
                    if attribute:
                        deal_attrs['%s_attr' % a.name][count - 1] = attribute
            # Doc type: deal and not formset
            elif doc_type == 'deal':
                if a.name in deal_attrs:
                    deal_attrs[a.name].append(value)
                    if '%s_attr' % a.name in get_elasticsearch_properties()['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name].append(attribute)
                else:
                    deal_attrs[a.name] = [value, ]
                    if '%s_attr' % a.name in get_elasticsearch_properties()['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name] = [attribute, ]

        if doc_type == 'deal':
            # Additionally save operational company attributes
            oc = Investor.objects.filter(
                investoractivityinvolvement__fk_activity__activity_identifier=
                activity.activity_identifier)
            if oc.count() > 0:
                oc = oc.first()
                for field in Investor._meta.fields:
                    if isinstance(field, ForeignKey):
                        deal_attrs['operational_company_%s' % field.name] = getattr(
                            oc, '%s_id' % field.name)
                    else:
                        deal_attrs['operational_company_%s' % field.name] = getattr(
                            oc, field.name)
            else:
                pass
                #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier)

        # Create single document for each location
        # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now?
        spatial_names = list(get_spatial_properties())
        for i in range(deal_attrs.get('location_count', 0)):
            doc = deal_attrs.copy()
            for name in spatial_names:
                if not name in doc:
                    continue
                if len(deal_attrs[name]) > i:
                    doc[name] = deal_attrs[name][i]
                else:
                    doc[name] = ''
            # Set unique ID for location (deals can have multiple locations)
            doc['id'] = '%s_%i' % (doc['id'], i)
            point_lat = doc.get('point_lat', None)
            point_lon = doc.get('point_lon', None)
            if point_lat and point_lon:
                # Parse values
                try:
                    parsed_lat, parsed_lon = float(point_lat), float(point_lon)
                    doc['geo_point'] = '%s,%s' % (point_lat, point_lon)
                except ValueError:
                    doc['geo_point'] = '0,0'
            else:
                doc['point_lat'] = '0'
                doc['point_lon'] = '0'
                doc['geo_point'] = '0,0'
            # FIXME: we don't really need 'point_lat' and 'point_lon' here,
            # so we should pop them from doc when adding 'geo_point'
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))
        return docs

    def get_export_properties(self, doc, doc_type='deal'):
        if doc_type == 'investor':
            return ExportInvestorForm.export(doc)
        elif doc_type == 'involvement':
            return InvestorVentureInvolvementForm.export(doc)
        else:
            properties = {
                'deal_scope_export': doc.get('deal_scope', ''),
                'is_public_export': doc.get('is_public', False) and str(_('Yes')) or str(_('No')),
                'deal_size_export': doc.get('deal_size', ''),
                'current_negotiation_status_export': doc.get('current_negotiation_status', ''),
                'top_investors_export': doc.get('top_investors', ''),
                'fully_updated_date_export': doc.get('fully_updated_date', ''),
            }
            # Doc types: deal, location, contract and data_source
            for form in ChangeDealView.FORMS:
                formset_name = hasattr(form, "form") and form.Meta.name or None
                form = formset_name and form.form or form
                properties.update(form.export(doc, formset=formset_name))
            properties.update(
                ExportInvestorForm.export(doc, prefix='operational_company_'))
            return properties

    def get_investor_documents(self, investor, doc_type='investor'):
        docs = []
        # Doc types: involvement and investor
        if doc_type == 'involvement':
            ivis = InvestorVentureInvolvement.objects.filter(
                Q(fk_venture=investor) | Q(fk_investor=investor))
            for ivi in ivis:
                doc = {}
                for field in ivi._meta.local_fields:
                    if isinstance(field, ForeignKey):
                        doc[field.name] = getattr(ivi, '%s_id' % field.name)
                    else:
                        doc[field.name] = getattr(ivi, field.name)
                docs.append(doc)
        elif doc_type == 'investor':
            doc = {}
            for field in investor._meta.local_fields:
                if isinstance(field, ForeignKey):
                    doc[field.name] = getattr(investor, '%s_id' % field.name)
                else:
                    doc[field.name] = getattr(investor, field.name)
            docs.append(doc)
        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))
        return docs

    def refresh_index(self):
        self.conn.refresh(self.index_name)

    def search(self, elasticsearch_query, doc_type='deal', sort=[]):
        """Execute paginated queries until all results have been retrieved.

        @return: The full list of hits.
        """
        start = 0
        # 10000 is the default elasticsearch max_window_size (pagination is
        # cheap, so more is not necessarily better)
        size = 10000
        raw_result_list = []
        done = False
        while not done:
            query = {
                'query': elasticsearch_query,
                'from': start,
                'size': size,
            }
            if sort:
                query['sort'] = sort
            query_result = self.conn.search(query,
                                            index=self.index_name,
                                            doc_type=doc_type)
            raw_result_list.extend(query_result['hits']['hits'])
            results_total = query_result['hits']['total']
            if len(raw_result_list) >= results_total:
                done = True
            else:
                start = len(raw_result_list)

        print('\nElasticsearch returned %i documents from a total of %i \n\n' % (
            len(raw_result_list), query_result['hits']['total']))
        return raw_result_list

    def delete_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            try:
                if doc_type == 'deal':
                    self.conn.delete(id=activity.activity_identifier,
                                     index=self.index_name,
                                     doc_type=doc_type)
                else:
                    self.conn.delete_by_query(
                        query={
                            "parent_id": {
                                "type": "deal",
                                "id": str(activity.activity_identifier),
                            }
                        },
                        index=self.index_name,
                        doc_type=doc_type)
            except ElasticHttpNotFoundError:
                pass

    def get_deals_by_activity_identifier(self, activity_identifier, doc_type='deal'):
        return self.search({
            "constant_score": {
                "filter": {
                    "term": {
                        "activity_identifier": activity_identifier
                    }
                }
            }
        })
class LBRest():

    def __init__(self, base=None, idx_exp_url=None):
        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es = ElasticSearch('/'.join([http, space, address]))

    def get_bases(self):
        """Get all bases which have registries to index."""
        bases = []
        params = """{
            "select": [
                "name",
                "idx_exp_time",
                "idx_exp_url"
            ],
            "literal": "idx_exp is true",
            "limit": null
        }"""
        req = requests.get(config.REST_URL, params={'$$': params})
        try:
            req.raise_for_status()
            response = req.json()
            bases = response["results"]
        except:
            logger.error("""
                Erro ao tentar recuperar bases. url: %s. Reposta: %s
            """ % (config.REST_URL, req._content))
        return bases

    def get_passed_registries(self):
        """Read the indexing log base."""
        # Create the log base if it does not exist
        self.create_log_base()
        registries = []
        params = {'$$': """{
            "select":["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
        }""" % self.base}
        url = config.REST_URL + '/log_lbindex/doc'
        req = requests.get(url, params=params)
        try:
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except:
            logger.error("""
                Erro ao recuperar registros da base %s'. Resposta: %s
            """ % ('log_lbindex', req._content))
        resp = {}
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp
        #return {reg['id_doc_orig']: reg['dt_last_up_orig'] for reg in registries}

    def get_registries(self):
        """List all registries to be indexed."""
        registries = []
        if config.FORCE_INDEX:
            params = {'$$': '{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {'$$': '{"select":["id_doc", "dt_last_up"],"literal":"dt_idx is null", "limit": %d}'}
        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT
        url = config.REST_URL + '/' + self.base + '/doc'
        req = requests.get(url, params=params)
        try:
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except:
            logger.error("""
                Erro ao recuperar registros da base %s'. Resposta: %s
            """ % (self.base, req._content))
            # Example of a failure response:
            # Erro ao recuperar registros da base docs_pro'. Resposta: {"status": 500,
            # "request": {"path": "/api/docs_pro/doc", "client_addr": "10.72.246.21",
            # "user_agent": "python-requests/2.3.0 CPython/2.6.6 Linux/2.6.32-504.el6.x86_64",
            # "method": "GET"}, "error_message": "SearchError: (OperationalError) could not
            # connect to server: No route to host\n\tIs the server running on host \"10.72.247.144\"
            # and accepting\n\tTCP/IP connections on port 5432?\n None None", "type": "Exception"}

        passed = self.get_passed_registries()
        _registries = []
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                dt_last_up = passed[reg['_metadata']['id_doc']]
                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)
        return _registries

    def get_full_reg(self, id, dt_last_up):
        logger.info('Recuperando registro %s da base %s ...' % (str(id), self.base))
        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'
        req = requests.get(url)
        try:
            req.raise_for_status()
            response = req.json()
        except:
            error_msg = """
                Erro ao recuperar registro %s na base %s'. Resposta: %s
            """ % (str(id), self.base, req._content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return response

    def index_member(self, registry, id, dt_last_up):
        logger.info('Indexando registro %s da base %s na url %s ...'
                    % (str(id), self.base, self.idx_exp_url))
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.index(_index, _type, registry, id=id)
            return True
        except Exception as e:
            error_msg = """
                Erro ao indexar registro %s da base %s na url %s'. Mensagem de erro: %s
            """ % (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        logger.info('Alterando data de indexacao do registro %s da base %s ...'
                    % (str(id), self.base))
        params = {'value': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')}
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/_metadata/dt_idx'
        req = requests.put(url, params=params)
        try:
            req.raise_for_status()
            return True
        except:
            error_msg = """
                Erro ao alterar data de indexacao do registro %s na base %s'. Resposta: %s
            """ % (str(id), self.base, req._content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
            return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """Write errors to LightBase."""
        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}
        req = requests.post(url, data=data)
        try:
            req.raise_for_status()
        except:
            logger.error("""
                Erro ao tentar escrever erro no Lightbase. Reposta: %s
            """ % req._content)

    def get_errors(self):
        """Get all index errors recorded for this base."""
        errors = []
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'
        req = requests.get(url, params={'$$': params})
        try:
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except:
            logger.error("""
                Erro ao tentar recuperar erros de indice. url: %s. Reposta: %s
            """ % (url, req._content))
        return errors

    def create_index(self):
        """Create the index with the default mapping options, or update the
        index if it already exists.
        """
        settings = {
            "settings": {
                # "number_of_shards": "5",
                # "number_of_replicas": "1",
                "analysis.analyzer.default.filter.0": "lowercase",
                "analysis.analyzer.default.filter.1": "asciifolding",
                "analysis.analyzer.default.tokenizer": "standard",
                "analysis.analyzer.default.type": "custom",
                "analysis.filter.pt_stemmer.type": "stemmer",
                "analysis.filter.pt_stemmer.name": "portuguese"
            },
            "mappings": {
                "document": {
                    "_timestamp": {
                        "enabled": "true"
                    }
                }
            }
        }
        http, space, address, _index, _type = self.idx_exp_url.split('/')
        try:
            result = self.es.create_index(
                index=_index,
                settings=settings
            )
        except IndexAlreadyExistsError as e:
            logger.info("O índice já existe. Tentando atualizar o mapping...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(
                index=_index,
                settings=settings
            )
            logger.info("Mapping atualizado com sucesso. Abrindo o índice...")
            self.es.open_index(index=_index)
            logger.info("Índice reaberto com sucesso!")

    def delete_index(self, registry):
        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True
        except ElasticHttpNotFoundError as e:
            return True
        except Exception as e:
            error_msg = """
                Erro ao deletar indice %s da base %s na url %s'. Mensagem de erro: %s
            """ % (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        url = config.REST_URL + """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}"""
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deletando registro de erro de indice na url %s' % url)
        req = requests.delete(url)
        try:
            req.raise_for_status()
            return True
        except:
            error_msg = """
                Erro ao deletar erro de indice. Resposta: %s
            """ % (req._content)
            logger.error(error_msg)
            return False

    @staticmethod
    def create_log_base():
        """Create the index log base if it does not exist."""
        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:
            # Create the base since it does not exist
            logger.info("Criando base de log do índice...")
            result = log_base.create_base()
            if result is None:
                logger.error("Erro na criação da base de log: \n%s", response.text)
                return False
            else:
                logger.info("Base de log criada com sucesso!")
        return True
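
# --- Hedged usage sketch (not part of the original source) -----------------
# A plausible indexing pass over one base using the client above: fetch the
# pending registries, index each full document, and stamp dt_idx on success.
# The idx_exp_url format is "http://<host>:<port>/<index>/<type>", as the
# split('/') calls assume; base and URL values are illustrative.
lbrest = LBRest(base='docs_pro',
                idx_exp_url='http://localhost:9200/docs_pro/doc')
for reg in lbrest.get_registries():
    id_doc = reg['_metadata']['id_doc']
    dt_last_up = reg['_metadata']['dt_last_up']
    full_reg = lbrest.get_full_reg(id_doc, dt_last_up)
    if full_reg and lbrest.index_member(full_reg, id_doc, dt_last_up):
        lbrest.update_dt_index(id_doc, dt_last_up)
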
class Elastic(DataLayer):
    """ElasticSearch data layer."""

    serializers = {
        'integer': int,
        'datetime': parse_date
    }

    def init_app(self, app):
        app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/')
        app.config.setdefault('ELASTICSEARCH_INDEX', 'eve')
        self.es = ElasticSearch(app.config['ELASTICSEARCH_URL'])
        self.index = app.config['ELASTICSEARCH_INDEX']

    def _get_field_mapping(self, schema):
        """Get mapping for given field schema."""
        if schema['type'] == 'datetime':
            return {'type': 'date'}
        elif schema['type'] == 'string' and schema.get('unique'):
            return {'type': 'string', 'index': 'not_analyzed'}
        elif schema['type'] == 'string':
            return {'type': 'string'}

    def put_mapping(self, app):
        """Put elasticsearch mapping for the current schema.

        It's not called automatically for now, but rather left for the user to
        call whenever it makes sense.
        """
        for resource, resource_config in app.config['DOMAIN'].items():
            properties = {}
            properties[config.DATE_CREATED] = self._get_field_mapping({'type': 'datetime'})
            properties[config.LAST_UPDATED] = self._get_field_mapping({'type': 'datetime'})
            for field, schema in resource_config['schema'].items():
                field_mapping = self._get_field_mapping(schema)
                if field_mapping:
                    properties[field] = field_mapping
            # TODO: config.SOURCES not available yet (self._datasource_ex(resource))
            datasource = (resource, )
            mapping = {}
            mapping[datasource[0]] = {'properties': properties}
            self.es.put_mapping(self.index, datasource[0], mapping)

    def find(self, resource, req, sub_resource_lookup):
        """
        TODO: implement sub_resource_lookup
        """
        query = {
            'query': {
                'query_string': {
                    'query': request.args.get('q', '*'),
                    'default_field': request.args.get('df', '_all'),
                    'default_operator': 'AND'
                }
            }
        }
        if not req.sort and self._default_sort(resource):
            req.sort = self._default_sort(resource)
        # Skip sorting when there is a query, to use the score
        if req.sort and 'q' not in request.args:
            query['sort'] = []
            sort = ast.literal_eval(req.sort)
            for (key, sortdir) in sort:
                sort_dict = dict([(key, 'asc' if sortdir > 0 else 'desc')])
                query['sort'].append(sort_dict)
        if req.where:
            where = json.loads(req.where)
            if where:
                query['filter'] = {
                    'term': where
                }
        if req.max_results:
            query['size'] = req.max_results
        if req.page > 1:
            query['from'] = (req.page - 1) * req.max_results
        source_config = config.SOURCES[resource]
        if 'facets' in source_config:
            query['facets'] = source_config['facets']
        try:
            args = self._es_args(resource)
            args['es_fields'] = self._fields(resource)
            return self._parse_hits(self.es.search(query, **args), resource)
        except es_exceptions.ElasticHttpError:
            return ElasticCursor()

    def find_one(self, resource, **lookup):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)
        if config.ID_FIELD in lookup:
            try:
                hit = self.es.get(id=lookup[config.ID_FIELD], **args)
            except es_exceptions.ElasticHttpNotFoundError:
                return
            if not hit['exists']:
                return
            doc = hit.get('fields', hit.get('_source', {}))
            doc['_id'] = hit.get('_id')
            convert_dates(doc, self._dates(resource))
            return doc
        else:
            query = {
                'query': {
                    'constant_score': {
                        'filter': {
                            'term': lookup
                        }
                    }
                }
            }
            try:
                args['size'] = 1
                docs = self._parse_hits(self.es.search(query, **args), resource)
                return docs.first()
            except es_exceptions.ElasticHttpNotFoundError:
                return None

    def find_list_of_ids(self, resource, ids, client_projection=None):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)
        return self._parse_hits(self.es.multi_get(ids, **args), resource)

    def insert(self, resource, doc_or_docs, **kwargs):
        ids = []
        kwargs.update(self._es_args(resource))
        for doc in doc_or_docs:
            doc.update(self.es.index(doc=doc, id=doc.get('_id'), **kwargs))
            ids.append(doc['_id'])
        self.es.refresh(self.index)
        return ids

    def update(self, resource, id_, updates):
        args = self._es_args(resource, refresh=True)
        return self.es.update(id=id_, doc=updates, **args)

    def replace(self, resource, id_, document):
        args = self._es_args(resource, refresh=True)
        args['overwrite_existing'] = True
        return self.es.index(document=document, id=id_, **args)

    def remove(self, resource, id_=None):
        args = self._es_args(resource, refresh=True)
        if id_:
            return self.es.delete(id=id_, **args)
        else:
            try:
                return self.es.delete_all(**args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

    def _parse_hits(self, hits, resource):
        """Parse hits response into documents."""
        return ElasticCursor(hits, self._dates(resource))

    def _es_args(self, resource, refresh=None):
        """Get index and doctype args."""
        datasource = self._datasource(resource)
        args = {
            'index': self.index,
            'doc_type': datasource[0],
        }
        if refresh:
            args['refresh'] = refresh
        return args

    def _fields(self, resource):
        """Get projection fields for given resource."""
        datasource = self._datasource(resource)
        keys = datasource[2].keys()
        return ','.join(keys)

    def _default_sort(self, resource):
        datasource = self._datasource(resource)
        return datasource[3]

    def _dates(self, resource):
        dates = [config.LAST_UPDATED, config.DATE_CREATED]
        datasource = self._datasource(resource)
        schema = config.DOMAIN[datasource[0]]['schema']
        for field, field_schema in schema.items():
            if field_schema['type'] == 'datetime':
                dates.append(field)
        return dates
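
# --- Hedged usage sketch (not part of the original source) -----------------
# How a data layer like this is typically wired into an Eve application: pass
# the class as Eve's `data` argument and push the mappings once at startup.
# This is a sketch under the assumption that an empty DOMAIN is enough for a
# smoke test; a real app would define resources in its settings.
from eve import Eve

app = Eve(data=Elastic, settings={'DOMAIN': {}})
with app.app_context():
    app.data.put_mapping(app)   # create date/string mappings per resource

if __name__ == '__main__':
    app.run()
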
class LBRest():

    def __init__(self, base=None, idx_exp_url=None, txt_mapping=None, cfg_idx=None):
        """Client used to consume the LBG and ES services."""
        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            self.idx_exp_host = idx_exp_url.split('/')[2]
            self.idx_exp_index = idx_exp_url.split('/')[3]
            self.idx_exp_type = idx_exp_url.split('/')[4]
            self.es = ElasticSearch("http://" + self.idx_exp_host)
        self.txt_mapping = txt_mapping
        self.cfg_idx = cfg_idx
        self.con_refsd = False

    def get_index(self, bases_list):
        """Get the indexing configuration for the bases."""
        bases_indexes = []
        for base in bases_list:
            idx_exp_url = base['metadata']['idx_exp_url']
            nm_idx = idx_exp_url.split('/')[3]
            url_txt_idx = config.REST_URL + "/_txt_idx/" + nm_idx
            req = None
            try:
                req = requests.get(url_txt_idx)
                req.raise_for_status()
                idx_resp = req.json()
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:
                    # NOTE: For the cases where no indexing configuration is
                    # set on the "_txt_idx" route! By Questor
                    idx_resp = None
                else:
                    fail_content = None
                    if req is not None:
                        fail_content = req._content
                    else:
                        fail_content = str(e)
                    logger.error("Falha HTTP ao tentar obter configuração de "
                                 "índice textual! URL: %s. FALHA: %s" % (
                                     config.REST_URL, fail_content))
                    return []
            except Exception as e:
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error("Erro ao tentar obter a configuração de índice "
                             "textual! URL: %s. FALHA: %s" % (
                                 config.REST_URL, fail_content))
                return []
            bases_indexes.append({"base": base, "index": idx_resp})
        return bases_indexes

    def get_bases(self):
        """Get all bases which have registries to index."""
        # NOTE: The construction below is a bit clumsy. The goal is to check
        # whether the data structure of the "lb_base" table already has the
        # (struct field) attribute and the "txt_mapping" field. If it does
        # not, try to fetch the base with all fields. This is a workaround;
        # the right fix is for the data structure in the "lb_base" table to
        # be up to date! By Questor
        bases = []
        req = None
        try:
            params = """{
                "select": [
                    "name",
                    "idx_exp_time",
                    "idx_exp_url",
                    "txt_mapping"
                ],
                "literal": "idx_exp is true",
                "limit": null
            }"""
            req = requests.get(config.REST_URL, params={'$$': params})
            if config.FORCE_INDEX == True:
                data = []
                results = dict({
                    u'metadata': {
                        u'idx_exp_url': u'' + config.ES_URL + '',
                        u'name': u'' + config.NM_BASE + '',
                        u'idx_exp_time': u'' + config.TIME_IDX + ''
                    }
                })
                data.append(results)
                bases = data
            else:
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
        except Exception as e:
            bases = []
            req = None
            try:
                params = """{
                    "literal": "idx_exp is true",
                    "limit": null
                }"""
                req = requests.get(config.REST_URL, params={'$$': params})
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
            except Exception as e:
                # NOTE: The instance variable "self.con_refsd" keeps the
                # warning below from being logged repeatedly and flooding the
                # log! By Questor
                if self.con_refsd:
                    return bases
                # NOTE: Using '"Connection refused" in str(e)' because
                # "raise_for_status()" above does not raise a
                # "requests.exceptions.HTTPError", so we cannot use the
                # "status_code" to handle the error more specifically!
                # By Questor
                if "Connection refused" in str(e) and not self.con_refsd:
                    logger.error('Erro ao obter a lista bases para '
                                 'indexação. URL: %s. FALHA: Servidor indisponivel! '
                                 'HTTPCode: 502 (Connection refused)!'
                                 % (config.REST_URL))
                    self.con_refsd = True
                    return bases
                self.con_refsd = False
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error(
                    ("Erro ao obter a lista bases para indexação. "
                     "URL: %s. FALHA: %s") % (
                        config.REST_URL, fail_content))
        return bases

    def get_passed_registries(self):
        """Return registries from the indexing-error log base.

        Only "id_doc_orig" and "dt_last_up_orig".
        """
        # NOTE: Create the log base if it does not exist! By Questor
        self.create_log_base()
        registries = []
        params = {'$$': """{
            "select":["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
        }""" % self.base}
        url = config.REST_URL + '/log_lbindex/doc'
        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                1 Erro ao recuperar registros da base %s'. FALHA: %s
            """ % ('log_lbindex', fail_content))
        resp = {}
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp

    def get_registries(self):
        """Return registries to be indexed that, under certain criteria,
        have not already failed in the past.
        """
        registries = []
        if config.FORCE_INDEX:
            params = {'$$': '{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {
                '$$': '{"select":["id_doc", "dt_last_up"], '
                      '"literal":"dt_idx is null", "limit": %d}'
            }
        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT
        url = config.REST_URL + '/' + self.base + '/doc'
        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                Erro ao recuperar registros da base %s'. FALHA: %s
            """ % (self.base, fail_content))

        # TODO: This logic could be more efficient... Two possible solutions:
        # 1 - Keep a cache (more complicated); 2 - Fetch only the registries
        # (id_doc) involved in the current indexing run. By Questor

        # TODO: Should "self.get_passed_registries()" be called every time?
        # Even when the operation is "create"? Needs a closer look...
        # By Questor

        # NOTE: Fetch registries from the error log base, i.e. registries
        # that failed in the past! By Questor
        passed = self.get_passed_registries()
        _registries = []
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                # NOTE: The goal here is to check whether the registry is in
                # the error log (registries we tried to index in the past)
                # and, if so, skip it unless the "update" date recorded in the
                # log base differs from the registry's current date; in that
                # case LBIndex will try again! By Questor

                # NOTE: The "passed" dict only holds the value of the
                # "dt_last_up_orig" field from the "log_lbindex" base!
                # By Questor
                dt_last_up = passed[reg['_metadata']['id_doc']]
                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)
        return _registries

    def get_full_reg(self, id, dt_last_up):
        """Get the doc registry plus the texts extracted from attached files,
        if any.
        """
        # TODO: Log this action every single time? By Questor
        logger.info('Recuperando registro %s da base %s ...'
                    % (str(id), self.base))
        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'
        req = None
        try:
            req = requests.get(url)
            req.raise_for_status()
            response = req.json()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = """
                Erro ao recuperar registro %s na base %s'. FALHA: %s
            """ % (str(id), self.base, fail_content)
            # TODO: Why two log calls? By Questor
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return response

    def es_create_mapping(self):
        """Create a mapping for a base if there is configuration for it."""
        response_0 = None
        response_0_json = None
        index_url = None
        try:
            index_url = ("http://" + self.idx_exp_host + "/" +
                         self.idx_exp_index + "/" + self.idx_exp_type)
            response_0 = requests.get(index_url + "/_mapping")
            response_0.raise_for_status()
            response_0_json = response_0.json()
        except requests.exceptions.HTTPError as e:
            # NOTE: We normally land in this code block when the index does
            # not exist! By Questor
            self.es_create_index()
        except requests.exceptions.RequestException as e:
            raise Exception("Problem in the mapping provider! " + str(e))
        except Exception as e:
            raise Exception("Mapping operation. Program error! " + str(e))

        if (response_0.status_code == 200 and not response_0_json and
                (self.txt_mapping is not None and self.txt_mapping)):
            response_1 = None
            try:
                response_1 = self.es.put_mapping(
                    index=self.idx_exp_index,
                    doc_type=self.idx_exp_type,
                    mapping=self.txt_mapping)
                if (response_1 is None or
                        response_1.get("acknowledged", None) is None or
                        response_1.get("acknowledged", None) != True):
                    raise Exception("Retorno inesperado do servidor "
                                    "ao criar mapping! " + str(response_1))
            except Exception as e:
                raise Exception("Mapping creation error! " + str(e))

    def es_create_index(self):
        """Create an index for the base with the configured settings; if none
        are set, create a generic index.
        """
        response_0 = None
        try:
            cfg_idx_holder = None
            # NOTE: If no indexing configuration is set, the system creates a
            # default one! By Questor
            if self.cfg_idx is not None and self.cfg_idx:
                cfg_idx_holder = self.cfg_idx
            else:
                cfg_idx_holder = {
                    "settings": {
                        "analysis": {
                            "analyzer": {
                                "default": {
                                    "tokenizer": "standard",
                                    "filter": [
                                        "lowercase",
                                        "asciifolding"
                                    ]
                                }
                            }
                        }
                    }
                }
            response_0 = self.es.create_index(index=self.idx_exp_index,
                                              settings=cfg_idx_holder)
            if (response_0 is None or
                    response_0.get("acknowledged", None) is None or
                    response_0.get("acknowledged", None) != True):
                raise Exception("Retorno inesperado do servidor "
                                "ao criar index! " + str(response_0))
            self.es_create_mapping()
        except IndexAlreadyExistsError as e:
            self.es_create_mapping()
        except Exception as e:
            raise Exception("Index creation error! " + str(e))

    def index_member(self, registry, id, dt_last_up):
        """Create the textual index for each registry."""
        logger.info(
            'Indexando registro %s da base %s na url %s ...' % (
                str(id), self.base, self.idx_exp_url))
        try:
            # NOTE: Handles and creates the mappings and textual indexes!
            # By Questor
            self.es_create_mapping()
            self.es.index(self.idx_exp_index, self.idx_exp_type, registry, id=id)
            return True
        except Exception as e:
            error_msg = ("Erro ao indexar registro %s da base %s na url %s'. "
                         "Mensagem de erro: %s") % (
                str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            # TODO: Why two logs? By Questor
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        """Update the registry's textual-indexing date."""
        logger.info('Alterando data de indexacao do '
                    'registro %s da base %s ...' % (str(id), self.base))
        params = {'value': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')}
        url = (config.REST_URL + '/' + self.base + '/doc/' + str(id) +
               '/_metadata/dt_idx')
        req = None
        try:
            req = requests.put(url, params=params)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = 'Erro ao alterar data de indexacao do registro %s na '\
                'base %s. FALHA: %s' % (str(id), self.base, fail_content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
            return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """Write errors to LightBase."""
        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}
        req = None
        try:
            req = requests.post(url, data=data)
            req.raise_for_status()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                0 Erro ao tentar escrever erro no Lightbase. FALHA: %s
            """ % fail_content)

    def get_errors(self):
        """Get all index errors recorded for this base."""
        errors = []
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'
        req = None
        try:
            req = requests.get(url, params={'$$': params})
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                Erro ao tentar recuperar erros de indice. URL: %s. FALHA: %s
            """ % (url, fail_content))
        return errors

    # TODO: This method exists to create/update a default indexing (index)
    # configuration! Right now it has drifted from that purpose, since just
    # deleting the index is enough for it to be recreated with the indexing
    # set on the "_txt_idx" route! This method does not make much sense here;
    # I suggest removing it! By Questor
    def create_index(self):
        """Create the index with the default mapping options, or update the
        index if it already exists.
        """
        settings = {
            "settings": {
                "analysis": {
                    "analyzer": {
                        "default": {
                            "tokenizer": "standard",
                            "filter": [
                                "lowercase",
                                "asciifolding"
                            ]
                        }
                    }
                }
            }
        }
        http, space, address, _index, _type = self.idx_exp_url.split('/')
        try:
            result = self.es.create_index(
                index=_index,
                settings=settings
            )
        except IndexAlreadyExistsError as e:
            logger.info("O índice já existe. Tentando atualizar o mapping...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(
                index=_index,
                settings=settings
            )
            logger.info("Mapping atualizado com sucesso. Abrindo o índice...")
            self.es.open_index(index=_index)
            logger.info("Índice reaberto com sucesso!")

    def delete_index(self, registry):
        """Delete registries from the index."""
        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True
        except ElasticHttpNotFoundError as e:
            return True
        except Exception as e:
            error_msg = 'Erro ao deletar indice %s da base %s na url %s. '\
                'Mensagem de erro: %s' % \
                (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        """Delete error registries on the '_index_error' route."""
        url = (config.REST_URL +
               """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}""")
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deletando registro de erro de indice na url %s' % url)
        req = None
        try:
            req = requests.delete(url)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = """
                Erro ao deletar erro de indice. FALHA: %s
            """ % (fail_content)
            logger.error(error_msg)
            return False

    @staticmethod
    def create_log_base():
        """Create the LBIndex log base if it does not exist."""
        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:
            # NOTE: Create the base since it does not exist!
            logger.info("Criando base de log do índice...")
            result = log_base.create_base()
            if result is None:
                logger.error("Erro na criação da base de log: \n%s", response.text)
                return False
            else:
                logger.info("Base de log criada com sucesso!")
        return True
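
# --- Hedged usage sketch (not part of the original source) -----------------
# This variant of the client can also receive a textual mapping and index
# settings. A minimal setup sketch: create the index and mapping up front,
# then look up per-base indexing configuration via the "_txt_idx" route.
# The mapping shape, base name, and URL below are illustrative assumptions.
txt_mapping = {'properties': {'nm_base': {'type': 'string'}}}   # assumed shape
lbrest = LBRest(base='docs_pro',
                idx_exp_url='http://localhost:9200/docs_pro/doc',
                txt_mapping=txt_mapping,
                cfg_idx=None)
lbrest.es_create_mapping()                        # creates the index first if missing
bases_indexes = lbrest.get_index(lbrest.get_bases())
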