def backup(component, backup_root):
    """Recurse over all indices at the ElasticSearch URL and backup indices."""

    # get ES object
    es = get_mozart_es() if component == 'mozart' else get_grq_es()

    # save backup root
    if os.path.isdir(backup_root):
        saved = backup_root + ".bak"
        if os.path.isdir(saved):
            shutil.rmtree(saved)
        shutil.move(backup_root, saved)
    os.makedirs(backup_root)

    # get all indices
    c = elasticsearch.client.IndicesClient(es.es)
    indices = sorted(c.get_alias().keys())

    # loop over each index and save settings, mapping, and docs
    for idx in indices:
        if idx == "geonames":
            continue
        print("Backup up %s..." % idx)
        d = os.path.join(backup_root, idx)
        if not os.path.isdir(d):
            os.makedirs(d)

        # save settings
        settings = c.get_settings(idx)
        s = os.path.join(d, "%s.settings" % idx)
        with open(s, "w") as f:
            json.dump(settings, f, indent=2, sort_keys=True)
        print("Backed up settings for %s" % idx)

        # save mapping
        mapping = c.get_mapping(idx)
        m = os.path.join(d, "%s.mapping" % idx)
        with open(m, "w") as f:
            json.dump(mapping, f, indent=2, sort_keys=True)
        print("Backed up mapping for %s" % idx)

        # save docs
        query = {"query": {"match_all": {}}}
        txt = os.path.join(d, "%s.docs" % idx)
        with open(txt, "w") as f:
            for hit in es.query(body=query, index=idx):
                f.write("%s\n" % json.dumps(hit["_source"]))
        # b = os.path.join(d, '%s.docs.bz2' % idx)
        # with bz2.BZ2File(b, 'w') as f:
        #    for doc in docs:
        #        f.write("%s\n" % json.dumps(doc))
        print("Backed up docs for %s" % idx)
Example #2
    def __init__(self, context, pge_config, settings, job_result, mozart_es=None, grq_es=None):
        self._context = context
        self._pge_config = pge_config
        self._settings = settings
        self._job_result = job_result
        self.accountability = Accountability(self._context, self._job_result.get(chimera_consts.WORK_DIR))
        self._mozart_es = mozart_es if mozart_es else get_mozart_es()
        self._grq_es = grq_es if grq_es else get_grq_es()
def restore(component, backup_dir, id_key='id'):
    """Restore ES index from backup docs and mapping."""

    # get ES object
    es = get_mozart_es() if component == 'mozart' else get_grq_es()

    # get files
    idx = os.path.basename(backup_dir)
    docs_file = os.path.join(backup_dir, '%s.docs' % idx)
    if not os.path.isfile(docs_file):
        raise RuntimeError("Failed to find docs file %s" % docs_file)
    mapping_file = os.path.join(backup_dir, '%s.mapping' % idx)
    if not os.path.isfile(mapping_file):
        raise RuntimeError("Failed to find mapping file %s" % mapping_file)
    settings_file = os.path.join(backup_dir, '%s.settings' % idx)
    if not os.path.isfile(settings_file):
        raise RuntimeError("Failed to find settings file %s" % settings_file)

    # put mapping and settings
    with open(mapping_file) as f:
        mappings = json.load(f)[idx]['mappings']
    with open(settings_file) as f:
        settings = json.load(f)[idx]['settings']

    # create index
    c = elasticsearch.client.IndicesClient(es.es)
    c.create(idx,
             body={
                 'settings': settings,
                 'mappings': mappings
             },
             ignore=400)

    # import docs
    with open(docs_file) as f:
        for line in f:
            j = json.loads(line)
            es.index_document(index=idx, body=j, id=j[id_key])
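
# Usage sketch (hypothetical path): the leaf directory name must match the index
# to restore, since restore() derives the index name from the basename.
restore("mozart", "/tmp/es_backup/my_index", id_key="id")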
Example #4
from io import StringIO
from lxml.etree import XMLParser, parse, tostring
from importlib import import_module
from celery.result import AsyncResult
from atomicwrites import atomic_write
from bisect import insort

import hysds
from hysds.log_utils import logger, log_prov_es, payload_hash_exists
from hysds.celery import app
from hysds.es_util import get_grq_es

import osaka.main

grq_es = get_grq_es()

# disk usage setting converter
DU_CALC = {"GB": 1024**3, "MB": 1024**2, "KB": 1024}
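
# Sketch of a converter that could use DU_CALC to turn a disk-usage setting such
# as "10GB" into bytes (parse_du is a hypothetical helper, not part of the module above):
def parse_du(setting):
    """Convert e.g. '10GB' to bytes via DU_CALC; bare numbers pass through unchanged."""
    for unit, factor in DU_CALC.items():
        if setting.upper().endswith(unit):
            return int(float(setting[:-len(unit)]) * factor)
    return int(setting)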


class NoDedupJobFoundException(Exception):
    def __init__(self, message):
        self.message = message
        super(NoDedupJobFoundException, self).__init__(message)


def get_module(m):
    """Import module and return."""

    try:
        return import_module(m)
    except ImportError:
        logger.error("Failed to import module: %s" % m)
        raise
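
# Usage sketch: resolve a module object from a dotted name; the standard-library
# "json" module is used here purely as an illustration.
json_mod = get_module("json")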
Example #5
def aws_get_script(dataset=None):
    """Return AWS get script."""
    grq_es = get_grq_es()
    index = app.conf["DATASET_ALIAS"]
    logger.debug("Dataset: {}".format(json.dumps(dataset, indent=2)))
    paged_result = grq_es.es.search(body=dataset,
                                    index=index,
                                    size=10,
                                    scroll="10m")
    logger.debug("Paged Result: {}".format(json.dumps(paged_result, indent=2)))

    scroll_ids = set()
    count = paged_result["hits"]["total"]["value"]
    scroll_id = paged_result["_scroll_id"]
    scroll_ids.add(scroll_id)

    # stream output a page at a time for better performance and lower memory footprint
    def stream_aws_get(scroll_id, paged_result):
        yield '#!/bin/bash\n#\n' + \
              '# query:\n#\n' + \
              '#%s#\n#\n#' % json.dumps(dataset) + \
              '# total datasets matched: %d\n\n' % count + \
              'echo ""\n'
        aws_get_cmd = 'aws s3 sync {} {}\n'

        while True:
            if len(paged_result['hits']['hits']) == 0:
                break
            # Elasticsearch sometimes returns duplicate urls; keep only unique s3 urls
            unique_urls = []
            for hit in paged_result['hits']['hits']:
                for url in hit['_source']['urls']:
                    if url not in unique_urls and url.startswith("s3"):
                        unique_urls.append(url)

            for url in unique_urls:
                logger.debug("urls in unique urls: %s", url)
                parsed_url = urlparse(url)
                yield 'echo "downloading  %s"\n' % os.path.basename(
                    parsed_url.path)
                yield aws_get_cmd.format(
                    "{}://{}".format(
                        parsed_url.scheme, parsed_url.path[1:] if
                        parsed_url.path.startswith('/') else parsed_url.path),
                    os.path.basename(parsed_url.path))
            paged_result = grq_es.es.scroll(scroll_id=scroll_id, scroll="10m")
            scroll_id = paged_result['_scroll_id']
            scroll_ids.add(scroll_id)

    # malarout: iterate over each line of the stream_aws_get response and write it to a file which is later
    # attached to the email.
    with open('aws_get_script.sh', 'w') as f:
        for i in stream_aws_get(scroll_id, paged_result):
            f.write(i)

    for sid in scroll_ids:
        grq_es.es.clear_scroll(scroll_id=sid)

    # for gzip compressed use file extension .tar.gz and modifier "w:gz"
    os.rename('aws_get_script.sh', 'aws_get_script.bash')
    with tarfile.open("aws_get.tar.gz", "w:gz") as tar:
        tar.add('aws_get_script.bash')
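
# Usage sketch (hypothetical match_all query): any valid Elasticsearch query body
# against DATASET_ALIAS works, since aws_get_script() simply pages through it.
aws_get_script({"query": {"match_all": {}}})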
Example #6
def purge_products(query, component, operation):
    """
    Iterator used to iterate across a query result and submit jobs for every hit
    :param query: query to post to ElasticSearch and whose result will be iterated, JSON sting enc
    :param component: tosca || figaro
    :param operation: purge or something else
    """
    logger.debug("action: %s for %s", operation, component)
    logger.debug("query: %s" % json.dumps(query, indent=2))

    if component == "mozart" or component == "figaro":
        es = get_mozart_es()
        es_index = app.conf["STATUS_ALIAS"]
    else:  # "tosca"
        es = get_grq_es()
        es_index = app.conf["DATASET_ALIAS"]

    results = es.query(index=es_index, body=query)  # Querying for products

    if component == 'tosca':
        for result in results:
            ident = result["_id"]
            index = result["_index"]

            # find the best URL first (prefer a non-http url, e.g. s3)
            best = None
            for url in result["_source"]["urls"]:
                if not url.startswith("http"):
                    best = url

            print('parameter being passed to osaka.main.rmall: ',
                  best)  # making osaka call to delete product
            if best is not None:
                osaka.main.rmall(best)

            es.delete_by_id(index=index, id=ident,
                            ignore=404)  # removing the metadata
            logger.info('Purged %s' % ident)

    else:
        purge = operation == 'purge'  # purge job from index

        for result in results:
            uuid = result["_source"]['uuid']
            payload_id = result["_source"]['payload_id']
            index = result["_index"]

            # Always grab latest state (not state from query result)
            task = app.AsyncResult(uuid)
            state = task.state  # active states can only be revoked, not purged
            logger.info("\nJob state: %s\n", state)

            if state in ["RETRY", "STARTED"] or (state == "PENDING"
                                                 and not purge):
                if not purge:
                    logger.info('Revoking %s\n', uuid)
                    revoke(uuid, state)
                else:
                    logger.info('Cannot remove active job %s\n', uuid)
                continue
            elif not purge:
                logger.info('Cannot stop inactive job: %s\n', uuid)
                continue

            # Safety net to revoke job if in PENDING state
            if state == "PENDING":
                logger.info('Revoking %s\n', uuid)
                revoke(uuid, state)

            # remove both the associated task and job from ES
            logger.info('Removing document from index %s for %s', index,
                        payload_id)
            es.delete_by_id(index=index, id=payload_id, ignore=404)
            logger.info('Removed %s from index: %s', payload_id, index)
        logger.info('Finished.')
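
# Usage sketch (hypothetical match_all query): purge every tosca product matching
# the query, deleting its artifacts via osaka and its metadata from the index.
purge_products({"query": {"match_all": {}}}, "tosca", "purge")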
Example #7
def wget_script(dataset=None, glob_dict=None):
    """Return wget script."""

    # query
    """Return AWS get script."""
    grq_es = get_grq_es()
    index = app.conf["DATASET_ALIAS"]
    logger.debug("Dataset: {}".format(json.dumps(dataset, indent=2)))
    paged_result = grq_es.es.search(body=dataset, index=index, size=100, scroll="10m")
    logger.debug("Paged Result: {}".format(json.dumps(paged_result, indent=2)))

    scroll_ids = set()
    count = paged_result["hits"]["total"]["value"]
    scroll_id = paged_result["_scroll_id"]
    scroll_ids.add(scroll_id)

    # stream output a page at a time for better performance and lower memory footprint
    def stream_wget(scroll_id, paged_result, glob_dict=None):
        yield '#!/bin/bash\n#\n' + \
              '# query:\n#\n' + \
              '%s#\n#\n#' % json.dumps(dataset) + \
              '# total datasets matched: %d\n\n' % count + \
              'read -s -p "JPL Username: "******""\n' + \
              'read -s -p "JPL Password: "******""\n'
        wget_cmd = 'wget --no-check-certificate --mirror -np -nH --reject "index.html*"'
        wget_cmd_password = wget_cmd + ' --user=$user --password=$password'

        while True:
            if len(paged_result['hits']['hits']) == 0:
                break
            # Elasticsearch sometimes returns duplicate urls; keep only unique http urls
            unique_urls = []
            for hit in paged_result['hits']['hits']:
                for url in hit['_source']['urls']:
                    if url not in unique_urls and url.startswith("http"):
                        unique_urls.append(url)

            for url in unique_urls:
                logger.debug("urls in unique urls: %s", url)
                if '.s3-website' in url or 'amazonaws.com' in url:
                    parsed_url = urlparse(url)
                    cut_dirs = len(parsed_url.path[1:].split('/')) - 1
                else:
                    if 's1a_ifg' in url:
                        cut_dirs = 3
                    else:
                        cut_dirs = 6
                if '.s3-website' in url or 'amazonaws.com' in url:
                    files = get_s3_files(url)
                    if glob_dict:
                        files = glob_filter(files, glob_dict)
                    for file in files:
                        yield 'echo "downloading  %s"\n' % file
                        yield "%s --cut-dirs=%d %s\n" % (wget_cmd, cut_dirs, file)
                if 'aria2-dav.jpl.nasa.gov' in url:
                    yield 'echo "downloading  %s"\n' % url
                    yield "%s --cut-dirs=%d %s/\n" % (wget_cmd_password, (cut_dirs+1), url)
                if 'aria-csk-dav.jpl.nasa.gov' in url:
                    yield 'echo "downloading  %s"\n' % url
                    yield "%s --cut-dirs=%d %s/\n" % (wget_cmd_password, (cut_dirs+1), url)
                if 'aria-dst-dav.jpl.nasa.gov' in url:
                    yield 'echo "downloading  %s"\n' % url
                    yield "%s --cut-dirs=%d %s/\n" % (wget_cmd, cut_dirs, url)
                    break

            paged_result = grq_es.es.scroll(scroll_id=scroll_id, scroll="10m")
            logger.debug("paged result: {}".format(json.dumps(paged_result, indent=2)))
            scroll_id = paged_result['_scroll_id']
            scroll_ids.add(scroll_id)

    # malarout: iterate over each line of the stream_wget response and write it to a file which is later
    # attached to the email.
    with open('wget_script.sh', 'w') as f:
        for i in stream_wget(scroll_id, paged_result, glob_dict):
            f.write(i)

    for sid in scroll_ids:
        grq_es.es.clear_scroll(scroll_id=sid)
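
# Usage sketch (hypothetical match_all query, no glob filtering; glob_dict is only
# applied to S3-hosted listings inside stream_wget):
wget_script({"query": {"match_all": {}}}, glob_dict=None)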