def monitor(self, deploy):
    """Report whether the deploy's indexer is free of a recovery error.

    Opens a Thrift indexer client against the deploy's worker (LAN DNS name
    and base port) and asks it for its current status.

    Returns ``True`` for every status except ``IndexerStatus.error``, for
    which it returns ``False``. Exceptions raised while building the client
    or querying it are not handled here and propagate to the caller.
    """
    worker_host = deploy.worker.lan_dns
    port = int(deploy.base_port)
    # Third argument is presumably a timeout in milliseconds -- TODO confirm.
    client = rpc.getThriftIndexerClient(worker_host, port, 10000)
    current_status = client.getStatus()

    # Complain only when the indexer explicitly reports an error.
    return current_status != IndexerStatus.error
def monitor(self, deploy):
    """Check that the deploy's indexer answers a Thrift ping.

    Returns ``True`` when the ping succeeds. On any exception the failure is
    logged with its traceback, ``self.err_msg`` is set from
    ``self.describe_error()`` and ``False`` is returned.
    """
    # Touch the relation before entering the try block so that a broken
    # index foreign key fails loudly instead of being reported as a
    # failed ping.
    deploy.index

    try:
        worker_host = deploy.worker.lan_dns
        port = int(deploy.base_port)
        rpc.getThriftIndexerClient(worker_host, port, 5000).ping()
    except Exception:
        self.logger.exception("Failed to ping deploy %s for index %s", deploy.id, deploy.index.code)
        self.err_msg = self.describe_error()
        return False
    else:
        return True
def _handle_recovering(self, deploy): logger.debug('Contacting %s (%s) on %d of %s to check if it finished recovering', deploy.index.name, deploy.index.code, deploy.base_port, deploy.worker.wan_dns) try: indexer = rpc.getThriftIndexerClient(deploy.worker.lan_dns, int(deploy.base_port), timeout_ms) indexer_status = indexer.getStatus() if indexer_status == IndexerStatus.started: logger.info('Requesting full recovery for deploy for %s (%s) on %d.', deploy.index.name, deploy.index.code, deploy.base_port) indexer.startFullRecovery() return DeployManager.INDEX_RECOVERING elif indexer_status == IndexerStatus.recovering: logger.info("Index %s is in state %s. Waiting untill it's ready", deploy.index.code, indexer_status) return DeployManager.INDEX_RECOVERING elif indexer_status == IndexerStatus.ready: deploy.update_status(Deploy.States.controllable) if deploy.index.status == Index.States.waking_up: deploy.index.update_status(Index.States.live) mail.report_new_deploy(deploy) logger.info('Deploy for %s (%s) on %d of %s reports it has finished recovering. New state moved to %s.', deploy.index.name, deploy.index.code, deploy.base_port, deploy.worker.wan_dns, Deploy.States.controllable) # The following is a HACK to restore twitvid's promotes after its index was moved # because we don't record promotes and they are lost after each move. # Luckily we know what twitvid promotes, so we can reproduce it here. This will break if they # change their code and start promoting something else. GitHub issue #41 calls for a proper # implementaion or to remove the feature altogether. Twitvid could now do this by using the # caret operator like this: "name:(q)^100 OR author:(q)^100 OR (q)". 
if deploy.index.code == 'd7fz1': try: searcher = rpc.getThriftSearcherClient(deploy.worker.lan_dns, int(deploy.base_port), timeout_ms) start = 0 while True: rs = searcher.search('verified:1 AND cont_type:user', start, 1000, 0, {}, {}, {}, {}, {'fetch_fields':'author,fullname,name'}) if len(rs.docs) == 0: break for d in rs.docs: author = d.get('author') if author: indexer.promoteResult(d['docid'], author.lower().strip()) name = d.get('name', d.get('fullname')) if name: indexer.promoteResult(d['docid'], name.lower().strip()) start += len(rs.docs) logger.info('WARNING: HACK! %s promotes were recovered for TwitVid.', start) except Exception, e: logger.error('HACK ERROR: applying TwitVid promotes', e) # Phew. End of HACK. Please let's not do this anymore. return DeployManager.INDEX_CONTROLLABLE elif indexer_status == IndexerStatus.error: logger.error('Deploy for %s (%s) on %d of %s reports it has failed recovering. MANUAL INTERVENTION REQUIRED.', deploy.index.name, deploy.index.code, deploy.base_port, deploy.worker.wan_dns)
def _handle_initializing(self, deploy):
    """Try to contact an initializing deploy and advance its state.

    Pings the deploy's indexer over Thrift. When the ping succeeds:

    * if the index status is ``Index.States.new``, the deploy is moved to
      ``Deploy.States.controllable``, the index is saved as
      ``Index.States.live`` and ``DeployManager.INDEX_CONTROLLABLE`` is
      returned;
    * otherwise the deploy is moved to ``Deploy.States.recovering`` and
      ``DeployManager.INDEX_RECOVERING`` is returned.

    Any exception raised inside the try block (including ones raised by the
    status updates, not only by the ping) is logged at info level and
    ``DeployManager.INDEX_INITIALIZING`` is returned, leaving the deploy
    untouched so a later pass can retry.
    """
    logger.debug('Trying to reach %s (%s) on %d of %s.', deploy.index.name, deploy.index.code, deploy.base_port, deploy.worker.wan_dns)

    try:
        indexer = rpc.getThriftIndexerClient(deploy.worker.lan_dns, int(deploy.base_port), timeout_ms)
        indexer.ping()

        # The indexer answered the ping, so it is up.
        if deploy.index.status == Index.States.new:
            deploy.update_status(Deploy.States.controllable)

            index = deploy.index
            index.status = Index.States.live
            index.save()

            logger.info('Deploy for %s (%s) on %d of %s contacted. New status moved to %s.', deploy.index.name, deploy.index.code, deploy.base_port, deploy.worker.wan_dns, Deploy.States.controllable)
            return DeployManager.INDEX_CONTROLLABLE
        else:
            deploy.update_status(Deploy.States.recovering)

            logger.info('Deploy for %s (%s) on %d of %s contacted. New status moved to %s.', deploy.index.name, deploy.index.code, deploy.base_port, deploy.worker.wan_dns, Deploy.States.recovering)
            return DeployManager.INDEX_RECOVERING
    except Exception as e:
        # Not ready yet; leave it as initializing.
        # ("as" form works on Python 2.6+ and Python 3, unlike "Exception, e".)
        logger.info('Index %s unreachable: %s', deploy.index.code, e)
        return DeployManager.INDEX_INITIALIZING
def operations(request):
    """Serve the operations console and its query-string driven actions.

    The ``level`` GET parameter (default ``'top'``) selects what happens:

    * ``top`` -- render ``operations/index.html``.
    * ``refresh`` -- JSON dump of all configurations, accounts, deploys,
      indexes, packages and workers.
    * ``index`` -- JSON for one index (``id``) and its deploys.
    * ``stats`` -- JSON of ``get_stats()`` from the deploy's indexer.
    * ``log`` -- JSON of ``tail(file, 300, ...)`` from the worker controller.
    * ``redeploy`` -- ask the deploy manager to redeploy the index.
    * ``decommission`` -- mark a worker as decommissioning.
    * ``delete_worker`` / ``delete_account`` -- delete the object, or answer
      409 when a precondition is not met.
    * ``account_set_pkg`` / ``account_set_cfg`` / ``index_set_cfg`` --
      update a foreign key, answering 409 when no row was updated.

    Any other ``level`` yields ``HttpResponseNotFound``. Lookups done with
    ``.get(...)`` are not guarded, so a missing object raises from here.
    """
    #if request.method == 'POST':
    #    if request.POST['task'] == 'redeploy':
    #        id = request.POST['index_id']
    #        rpc.get_deploy_manager().redeploy_index(Index.objects.get(pk=id).code)
    #        return HttpResponseRedirect('/resource_map')

    # NOTE(review): several levels below change or delete data in response
    # to a plain GET request; consider requiring POST for them.
    level = request.GET.get('level', 'top')
    # Read once; named obj_id so the builtin ``id`` is not shadowed.
    obj_id = request.GET.get('id')

    if level == 'top':
        return render_to_response('operations/index.html', Context({}, request))
    elif level == 'refresh':
        # List comprehensions keep the values real lists on Python 3 too,
        # where map() would return a lazy iterator.
        data = {
            'Config': [configuration_dict(c) for c in IndexConfiguration.objects.all()],
            'Account': [account_dict(a) for a in Account.objects.select_related('user').all()],
            'Deploy': [deploy_dict(d) for d in Deploy.objects.all()],
            'Index': [index_dict(i) for i in Index.objects.all()],
            'Package': [package_dict(p) for p in Package.objects.all()],
            'Worker': [worker_dict(w) for w in Worker.objects.all()],
        }
        return JsonResponse(data)
    elif level == 'index':
        index = Index.objects.get(pk=obj_id)
        data = {
            'Index': index_dict(index),
            'Deploy': [deploy_dict(d) for d in index.deploys.all()],
        }
        return JsonResponse(data)
    elif level == 'stats':
        d = Deploy.objects.get(pk=obj_id)
        client = rpc.getThriftIndexerClient(d.worker.lan_dns, int(d.base_port), 3000)
        return JsonResponse(client.get_stats())
    elif level == 'log':
        # NOTE(review): ``file`` comes straight from the query string and is
        # handed to the worker controller unvalidated -- confirm that tail()
        # restricts which files may be read.
        log_file = request.GET.get('file')
        d = Deploy.objects.get(pk=obj_id)
        client = rpc.get_worker_controller(d.worker, 4000)
        lines = client.tail(log_file, 300, d.index.code, d.base_port)
        return JsonResponse(lines)
    elif level == 'redeploy':
        rpc.get_deploy_manager().redeploy_index(Index.objects.get(pk=obj_id).code)
        return HttpResponse()
    elif level == 'decommission':
        Worker.objects.filter(id=obj_id).update(status=Worker.States.decommissioning)
        return JsonResponse(worker_dict(Worker.objects.get(id=obj_id)))
    elif level == 'delete_worker':
        w = Worker.objects.get(id=obj_id)
        if w.status != Worker.States.decommissioning:
            return HttpResponse('worker not decommissioning', status=409)
        if w.deploys.count():
            return HttpResponse('worker not empty', status=409)
        w.delete()
        return HttpResponse()
    elif level == 'delete_account':
        a = Account.objects.get(id=obj_id)
        if a.indexes.count():
            return HttpResponse('account has index', status=409)
        if a.payment_informations.count():
            return HttpResponse('account has payment information', status=409)
        # Grab the related user before deleting the account that points at it.
        user = a.user.user
        a.delete()
        user.delete()
        return HttpResponse()
    elif level == 'account_set_pkg':
        pid = request.GET.get('pkg')
        p = Package.objects.get(id=pid)
        updated = Account.objects.filter(id=obj_id).update(package=p)
        if updated:
            return JsonResponse(account_dict(Account.objects.get(id=obj_id)))
        else:
            return HttpResponse('account not found', status=409)
    elif level == 'account_set_cfg':
        cid = request.GET.get('cfg')
        c = IndexConfiguration.objects.get(id=cid)
        updated = Account.objects.filter(id=obj_id).update(configuration=c)
        if updated:
            return JsonResponse(account_dict(Account.objects.get(id=obj_id)))
        else:
            return HttpResponse('account not found', status=409)
    elif level == 'index_set_cfg':
        cid = request.GET.get('cfg')
        c = IndexConfiguration.objects.get(id=cid)
        updated = Index.objects.filter(id=obj_id).update(configuration=c)
        if updated:
            return JsonResponse(index_dict(Index.objects.get(id=obj_id)))
        else:
            return HttpResponse('index not found', status=409)

    return HttpResponseNotFound()
def operations(request):
    """Serve the operations console and its query-string driven actions.

    The ``level`` GET parameter (default ``'top'``) selects what happens:

    * ``top`` -- render ``operations/index.html``.
    * ``refresh`` -- JSON dump of all configurations, accounts, deploys,
      indexes, packages and workers.
    * ``index`` -- JSON for one index (``id``) and its deploys.
    * ``stats`` -- JSON of ``get_stats()`` from the deploy's indexer.
    * ``log`` -- JSON of ``tail(file, 300, ...)`` from the worker controller.
    * ``redeploy`` -- ask the deploy manager to redeploy the index.
    * ``decommission`` -- mark a worker as decommissioning.
    * ``delete_worker`` / ``delete_account`` -- delete the object, or answer
      409 when a precondition is not met.
    * ``account_set_pkg`` / ``account_set_cfg`` / ``index_set_cfg`` --
      update a foreign key, answering 409 when no row was updated.

    Any other ``level`` yields ``HttpResponseNotFound``. Lookups done with
    ``.get(...)`` are not guarded, so a missing object raises from here.
    """
    #if request.method == 'POST':
    #    if request.POST['task'] == 'redeploy':
    #        id = request.POST['index_id']
    #        rpc.get_deploy_manager().redeploy_index(Index.objects.get(pk=id).code)
    #        return HttpResponseRedirect('/resource_map')

    # NOTE(review): several levels below change or delete data in response
    # to a plain GET request; consider requiring POST for them.
    level = request.GET.get('level', 'top')
    # Read once; named obj_id so the builtin ``id`` is not shadowed.
    obj_id = request.GET.get('id')

    if level == 'top':
        return render_to_response('operations/index.html', Context({}, request))
    elif level == 'refresh':
        # List comprehensions keep the values real lists on Python 3 too,
        # where map() would return a lazy iterator.
        data = {
            'Config': [configuration_dict(c) for c in IndexConfiguration.objects.all()],
            'Account': [account_dict(a) for a in Account.objects.select_related('user').all()],
            'Deploy': [deploy_dict(d) for d in Deploy.objects.all()],
            'Index': [index_dict(i) for i in Index.objects.all()],
            'Package': [package_dict(p) for p in Package.objects.all()],
            'Worker': [worker_dict(w) for w in Worker.objects.all()],
        }
        return JsonResponse(data)
    elif level == 'index':
        index = Index.objects.get(pk=obj_id)
        data = {
            'Index': index_dict(index),
            'Deploy': [deploy_dict(d) for d in index.deploys.all()],
        }
        return JsonResponse(data)
    elif level == 'stats':
        d = Deploy.objects.get(pk=obj_id)
        client = rpc.getThriftIndexerClient(d.worker.lan_dns, int(d.base_port), 3000)
        return JsonResponse(client.get_stats())
    elif level == 'log':
        # NOTE(review): ``file`` comes straight from the query string and is
        # handed to the worker controller unvalidated -- confirm that tail()
        # restricts which files may be read.
        log_file = request.GET.get('file')
        d = Deploy.objects.get(pk=obj_id)
        client = rpc.get_worker_controller(d.worker, 4000)
        lines = client.tail(log_file, 300, d.index.code, d.base_port)
        return JsonResponse(lines)
    elif level == 'redeploy':
        rpc.get_deploy_manager().redeploy_index(Index.objects.get(pk=obj_id).code)
        return HttpResponse()
    elif level == 'decommission':
        Worker.objects.filter(id=obj_id).update(
            status=Worker.States.decommissioning)
        return JsonResponse(worker_dict(Worker.objects.get(id=obj_id)))
    elif level == 'delete_worker':
        w = Worker.objects.get(id=obj_id)
        if w.status != Worker.States.decommissioning:
            return HttpResponse('worker not decommissioning', status=409)
        if w.deploys.count():
            return HttpResponse('worker not empty', status=409)
        w.delete()
        return HttpResponse()
    elif level == 'delete_account':
        a = Account.objects.get(id=obj_id)
        if a.indexes.count():
            return HttpResponse('account has index', status=409)
        if a.payment_informations.count():
            return HttpResponse('account has payment information', status=409)
        # Grab the related user before deleting the account that points at it.
        user = a.user.user
        a.delete()
        user.delete()
        return HttpResponse()
    elif level == 'account_set_pkg':
        pid = request.GET.get('pkg')
        p = Package.objects.get(id=pid)
        updated = Account.objects.filter(id=obj_id).update(package=p)
        if updated:
            return JsonResponse(account_dict(Account.objects.get(id=obj_id)))
        else:
            return HttpResponse('account not found', status=409)
    elif level == 'account_set_cfg':
        cid = request.GET.get('cfg')
        c = IndexConfiguration.objects.get(id=cid)
        updated = Account.objects.filter(id=obj_id).update(configuration=c)
        if updated:
            return JsonResponse(account_dict(Account.objects.get(id=obj_id)))
        else:
            return HttpResponse('account not found', status=409)
    elif level == 'index_set_cfg':
        cid = request.GET.get('cfg')
        c = IndexConfiguration.objects.get(id=cid)
        updated = Index.objects.filter(id=obj_id).update(configuration=c)
        if updated:
            return JsonResponse(index_dict(Index.objects.get(id=obj_id)))
        else:
            return HttpResponse('index not found', status=409)

    return HttpResponseNotFound()