def backup(component, backup_root):
    """Recurse over all indices at the ElasticSearch URL and backup indices."""
    # get ES object
    es = get_mozart_es() if component == 'mozart' else get_grq_es()

    # rotate any existing backup root aside before starting fresh
    if os.path.isdir(backup_root):
        saved = backup_root + ".bak"
        if os.path.isdir(saved):
            shutil.rmtree(saved)
        shutil.move(backup_root, saved)
    os.makedirs(backup_root)

    # get all indices
    c = elasticsearch.client.IndicesClient(es.es)
    indices = sorted(c.get_alias().keys())

    # loop over each index and save settings, mapping, and docs
    for idx in indices:
        if idx == "geonames":
            continue
        print("Backing up %s..." % idx)
        d = os.path.join(backup_root, idx)
        if not os.path.isdir(d):
            os.makedirs(d)

        # save settings
        settings = c.get_settings(idx)
        s = os.path.join(d, "%s.settings" % idx)
        with open(s, "w") as f:
            json.dump(settings, f, indent=2, sort_keys=True)
        print("Backed up settings for %s" % idx)

        # save mapping
        mapping = c.get_mapping(idx)
        m = os.path.join(d, "%s.mapping" % idx)
        with open(m, "w") as f:
            json.dump(mapping, f, indent=2, sort_keys=True)
        print("Backed up mapping for %s" % idx)

        # save docs, one JSON document per line
        query = {"query": {"match_all": {}}}
        txt = os.path.join(d, "%s.docs" % idx)
        with open(txt, "w") as f:
            for hit in es.query(body=query, index=idx):
                f.write("%s\n" % json.dumps(hit["_source"]))
        # b = os.path.join(d, '%s.docs.bz2' % idx)
        # with bz2.BZ2File(b, 'w') as f:
        #     for doc in docs:
        #         f.write("%s\n" % json.dumps(doc))
        print("Backed up docs for %s" % idx)
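# Usage sketch (the path below is hypothetical, not part of the original module): back up
# every mozart index and check the per-index layout that backup() writes:
#   <backup_root>/<index>/<index>.settings
#   <backup_root>/<index>/<index>.mapping
#   <backup_root>/<index>/<index>.docs   (one JSON document per line)
def example_backup_mozart(backup_root="/tmp/es_backup"):
    backup("mozart", backup_root)
    for idx in sorted(os.listdir(backup_root)):
        d = os.path.join(backup_root, idx)
        for ext in ("settings", "mapping", "docs"):
            assert os.path.isfile(os.path.join(d, "%s.%s" % (idx, ext)))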
def __init__(self, context, pge_config, settings, job_result, mozart_es=None, grq_es=None):
    self._context = context
    self._pge_config = pge_config
    self._settings = settings
    self._job_result = job_result
    self.accountability = Accountability(self._context, self._job_result.get(chimera_consts.WORK_DIR))
    if mozart_es:
        self._mozart_es = mozart_es
    else:
        self._mozart_es = get_mozart_es()
    if grq_es:
        self._grq_es = grq_es
    else:
        self._grq_es = get_grq_es()
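# Construction sketch (the enclosing class is not shown in this fragment; "PostProcessor"
# is a placeholder passed in by the caller, and the argument dicts are illustrative).
# The optional mozart_es/grq_es parameters allow dependency injection, e.g. reusing
# already-built clients or substituting test doubles instead of the get_*_es() defaults.
def example_construct(PostProcessor, context, pge_config, settings, job_result):
    # default: clients are resolved via get_mozart_es()/get_grq_es()
    pp_default = PostProcessor(context, pge_config, settings, job_result)
    # injected: pass clients the caller already holds (or fakes in unit tests)
    pp_injected = PostProcessor(context, pge_config, settings, job_result,
                                mozart_es=get_mozart_es(), grq_es=get_grq_es())
    return pp_default, pp_injected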
def restore(component, backup_dir, id_key='id'):
    """Restore ES index from backup docs and mapping."""
    # get ES object
    es = get_mozart_es() if component == 'mozart' else get_grq_es()

    # get files
    idx = os.path.basename(backup_dir)
    docs_file = os.path.join(backup_dir, '%s.docs' % idx)
    if not os.path.isfile(docs_file):
        raise RuntimeError("Failed to find docs file %s" % docs_file)
    mapping_file = os.path.join(backup_dir, '%s.mapping' % idx)
    if not os.path.isfile(mapping_file):
        raise RuntimeError("Failed to find mapping file %s" % mapping_file)
    settings_file = os.path.join(backup_dir, '%s.settings' % idx)
    if not os.path.isfile(settings_file):
        raise RuntimeError("Failed to find settings file %s" % settings_file)

    # load mapping and settings from the backup files
    with open(mapping_file) as f:
        mappings = json.load(f)[idx]['mappings']
    with open(settings_file) as f:
        settings = json.load(f)[idx]['settings']

    # create index
    c = elasticsearch.client.IndicesClient(es.es)
    c.create(idx, body={
        'settings': settings,
        'mappings': mappings
    }, ignore=400)

    # import docs
    with open(docs_file) as f:
        for line in f:
            j = json.loads(line)
            es.index_document(index=idx, body=j, id=j[id_key])
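# Round-trip sketch (paths and index name are illustrative): restore one index from the
# per-index directory written by backup(). restore() expects backup_dir to be named after
# the index and to contain <index>.docs, <index>.mapping and <index>.settings; each doc is
# re-indexed using the field named by id_key as its Elasticsearch _id.
def example_restore_index(backup_root="/tmp/es_backup", index="job_status-current"):
    restore("mozart", os.path.join(backup_root, index), id_key="id")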
from io import StringIO
from lxml.etree import XMLParser, parse, tostring
from importlib import import_module
from celery.result import AsyncResult
from atomicwrites import atomic_write
from bisect import insort

import hysds
from hysds.log_utils import logger, log_prov_es, payload_hash_exists
from hysds.celery import app
from hysds.es_util import get_grq_es

import osaka.main

grq_es = get_grq_es()

# disk usage setting converter
DU_CALC = {"GB": 1024**3, "MB": 1024**2, "KB": 1024}


class NoDedupJobFoundException(Exception):
    def __init__(self, message):
        self.message = message
        super(NoDedupJobFoundException, self).__init__(message)


def get_module(m):
    """Import module and return."""
    try:
        return import_module(m)
    except ImportError:
        logger.error('Failed to import module "%s".' % m)
        raise
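# Sketch (this helper is not part of the original module; the "10GB" value is illustrative):
# convert a disk-usage setting string into bytes with the DU_CALC table, and resolve a
# module by dotted name with get_module().
def example_du_and_module():
    setting = "10GB"                                        # e.g. a job-spec disk usage value
    size_bytes = int(setting[:-2]) * DU_CALC[setting[-2:]]  # 10 * 1024**3
    json_mod = get_module("json")                           # dynamic import by dotted name
    return size_bytes, json_mod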
def aws_get_script(dataset=None):
    """Return AWS get script."""
    grq_es = get_grq_es()
    index = app.conf["DATASET_ALIAS"]
    logger.debug("Dataset: {}".format(json.dumps(dataset, indent=2)))

    paged_result = grq_es.es.search(body=dataset, index=index, size=10, scroll="10m")
    logger.debug("Paged Result: {}".format(json.dumps(paged_result, indent=2)))

    scroll_ids = set()
    count = paged_result["hits"]["total"]["value"]
    scroll_id = paged_result["_scroll_id"]
    scroll_ids.add(scroll_id)

    # stream output a page at a time for better performance and lower memory footprint
    def stream_aws_get(scroll_id, paged_result):
        yield '#!/bin/bash\n#\n' + \
              '# query:\n#\n' + \
              '#%s#\n#\n#' % json.dumps(dataset) + \
              '# total datasets matched: %d\n\n' % count + \
              'echo ""\n'
        aws_get_cmd = 'aws s3 sync {} {}\n'

        while True:
            if len(paged_result['hits']['hits']) == 0:
                break

            # Elasticsearch can return duplicate urls; keep only unique s3 urls
            unique_urls = []
            for hit in paged_result['hits']['hits']:
                for url in hit['_source']['urls']:
                    if url not in unique_urls and url.startswith("s3"):
                        unique_urls.append(url)

            for url in unique_urls:
                logging.debug("urls in unique urls: %s", url)
                parsed_url = urlparse(url)
                yield 'echo "downloading %s"\n' % os.path.basename(parsed_url.path)
                yield aws_get_cmd.format(
                    "{}://{}".format(
                        parsed_url.scheme,
                        parsed_url.path[1:] if parsed_url.path.startswith('/') else parsed_url.path),
                    os.path.basename(parsed_url.path))

            paged_result = grq_es.es.scroll(scroll_id=scroll_id, scroll="10m")
            scroll_id = paged_result['_scroll_id']
            scroll_ids.add(scroll_id)

    # malarout: iterate over each line of the stream_aws_get response and write to a file
    # which is later attached to the email
    with open('aws_get_script.sh', 'w') as f:
        for i in stream_aws_get(scroll_id, paged_result):
            f.write(i)

    for sid in scroll_ids:
        grq_es.es.clear_scroll(scroll_id=sid)

    # for gzip compressed use file extension .tar.gz and modifier "w:gz"
    os.rename('aws_get_script.sh', 'aws_get_script.bash')
    tar = tarfile.open("aws_get.tar.gz", "w:gz")
    tar.add('aws_get_script.bash')
    tar.close()
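# Invocation sketch: the dataset argument is passed straight to grq_es.es.search(), so any
# valid Elasticsearch query body works; match_all below is just an illustrative choice.
# The call writes aws_get_script.bash and packages it as aws_get.tar.gz in the CWD.
def example_aws_get():
    aws_get_script(dataset={"query": {"match_all": {}}})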
def purge_products(query, component, operation):
    """
    Iterate over a query result and purge products (tosca) or stop/purge jobs (mozart/figaro) for every hit.
    :param query: query to post to ElasticSearch and whose result will be iterated, JSON string encoded
    :param component: mozart || tosca || figaro
    :param operation: purge to delete; anything else only revokes active jobs
    """
    logger.debug("action: %s for %s", operation, component)
    logger.debug("query: %s" % json.dumps(query, indent=2))

    if component == "mozart" or component == "figaro":
        es = get_mozart_es()
        es_index = app.conf["STATUS_ALIAS"]
    else:  # "tosca"
        es = get_grq_es()
        es_index = app.conf["DATASET_ALIAS"]

    results = es.query(index=es_index, body=query)  # querying for products

    if component == 'tosca':
        for result in results:
            ident = result["_id"]
            index = result["_index"]

            # find the best (non-http, i.e. s3) URL first
            best = None
            for url in result["_source"]["urls"]:
                if not url.startswith("http"):
                    best = url
            print('parameter being passed to osaka.main.rmall:', best)

            # making osaka call to delete product
            if best is not None:
                osaka.main.rmall(best)

            es.delete_by_id(index=index, id=ident, ignore=404)  # removing the metadata
            logger.info('Purged %s' % ident)
    else:
        purge = True if operation == 'purge' else False

        # purge job from index
        for result in results:
            uuid = result["_source"]['uuid']
            payload_id = result["_source"]['payload_id']
            index = result["_index"]

            # always grab the latest state (not the state from the query result)
            task = app.AsyncResult(uuid)
            state = task.state

            # active states may only be revoked
            logger.info("\nJob state: %s\n", state)
            if state in ["RETRY", "STARTED"] or (state == "PENDING" and not purge):
                if not purge:
                    logger.info('Revoking %s\n', uuid)
                    revoke(uuid, state)
                else:
                    logger.info('Cannot remove active job %s\n', uuid)
                continue
            elif not purge:
                logger.info('Cannot stop inactive job: %s\n', uuid)
                continue

            # safety net to revoke job if in PENDING state
            if state == "PENDING":
                logger.info('Revoking %s\n', uuid)
                revoke(uuid, state)

            # remove both the associated task and job from ES
            logger.info('Removing document from index %s for %s', index, payload_id)
            es.delete_by_id(index=index, id=payload_id, ignore=404)
            logger.info('Removed %s from index: %s', payload_id, index)

    logger.info('Finished.')
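# Invocation sketch (the ids below are hypothetical): purge specific tosca products by _id,
# and stop (revoke without purging) a figaro job. The query body is forwarded to es.query()
# unchanged, so any valid Elasticsearch query works; any operation other than "purge" only
# revokes active jobs.
def example_purge_and_stop():
    product_query = {"query": {"ids": {"values": ["example-product-id"]}}}
    purge_products(product_query, component="tosca", operation="purge")

    job_query = {"query": {"ids": {"values": ["example-payload-id"]}}}
    purge_products(job_query, component="figaro", operation="stop")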
def wget_script(dataset=None, glob_dict=None):
    """Return wget script."""
    # query
    grq_es = get_grq_es()
    index = app.conf["DATASET_ALIAS"]
    logger.debug("Dataset: {}".format(json.dumps(dataset, indent=2)))

    paged_result = grq_es.es.search(body=dataset, index=index, size=100, scroll="10m")
    logger.debug("Paged Result: {}".format(json.dumps(paged_result, indent=2)))

    scroll_ids = set()
    count = paged_result["hits"]["total"]["value"]
    scroll_id = paged_result["_scroll_id"]
    scroll_ids.add(scroll_id)

    # stream output a page at a time for better performance and lower memory footprint
    def stream_wget(scroll_id, paged_result, glob_dict=None):
        yield '#!/bin/bash\n#\n' + \
              '# query:\n#\n' + \
              '%s#\n#\n#' % json.dumps(dataset) + \
              '# total datasets matched: %d\n\n' % count + \
              'read -s -p "JPL Username: " user\n' + \
              'echo ""\n' + \
              'read -s -p "JPL Password: " password\n' + \
              'echo ""\n'
        wget_cmd = 'wget --no-check-certificate --mirror -np -nH --reject "index.html*"'
        wget_cmd_password = wget_cmd + ' --user=$user --password=$password'

        while True:
            if len(paged_result['hits']['hits']) == 0:
                break

            # Elasticsearch can return duplicate urls; keep only unique http urls
            unique_urls = []
            for hit in paged_result['hits']['hits']:
                for url in hit['_source']['urls']:
                    if url not in unique_urls and url.startswith("http"):
                        unique_urls.append(url)

            for url in unique_urls:
                logging.debug("urls in unique urls: %s", url)
                if '.s3-website' in url or 'amazonaws.com' in url:
                    parsed_url = urlparse(url)
                    cut_dirs = len(parsed_url.path[1:].split('/')) - 1
                else:
                    if 's1a_ifg' in url:
                        cut_dirs = 3
                    else:
                        cut_dirs = 6
                if '.s3-website' in url or 'amazonaws.com' in url:
                    files = get_s3_files(url)
                    if glob_dict:
                        files = glob_filter(files, glob_dict)
                    for file in files:
                        yield 'echo "downloading %s"\n' % file
                        if 's1a_ifg' in url:
                            yield "%s --cut-dirs=%d %s\n" % (wget_cmd, cut_dirs, file)
                        else:
                            yield "%s --cut-dirs=%d %s\n" % (wget_cmd, cut_dirs, file)
                if 'aria2-dav.jpl.nasa.gov' in url:
                    yield 'echo "downloading %s"\n' % url
                    yield "%s --cut-dirs=%d %s/\n" % (wget_cmd_password, (cut_dirs + 1), url)
                if 'aria-csk-dav.jpl.nasa.gov' in url:
                    yield 'echo "downloading %s"\n' % url
                    yield "%s --cut-dirs=%d %s/\n" % (wget_cmd_password, (cut_dirs + 1), url)
                if 'aria-dst-dav.jpl.nasa.gov' in url:
                    yield 'echo "downloading %s"\n' % url
                    yield "%s --cut-dirs=%d %s/\n" % (wget_cmd, cut_dirs, url)
                    break

            paged_result = grq_es.es.scroll(scroll_id=scroll_id, scroll="10m")
            logger.debug("paged result: {}".format(json.dumps(paged_result, indent=2)))
            scroll_id = paged_result['_scroll_id']
            scroll_ids.add(scroll_id)

    # malarout: iterate over each line of the stream_wget response and write to a file
    # which is later attached to the email
    with open('wget_script.sh', 'w') as f:
        for i in stream_wget(scroll_id, paged_result, glob_dict):
            f.write(i)

    for sid in scroll_ids:
        grq_es.es.clear_scroll(scroll_id=sid)
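# Invocation sketch: the dataset body is passed straight to grq_es.es.search(), so any
# valid Elasticsearch query works; glob_dict is forwarded to glob_filter() (not shown in
# this fragment), so its expected structure is whatever that helper defines. The call
# writes wget_script.sh in the CWD.
def example_wget_script():
    wget_script(dataset={"query": {"match_all": {}}}, glob_dict=None)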