Example #1
 def validate_mapping(self, mapping, env):
     idxkwargs = self[env]
     # just get the default indexer (a non-existing target_name makes
     # find_indexer fall back to the default one)
     idxklass = self.find_indexer(target_name="__placeholder_name__%s" %
                                  get_random_string())
     idxr_obj = idxklass(**idxkwargs)
     settings = idxr_obj.get_index_creation_settings()
     # generate a random index, it'll be deleted at the end
     index_name = ("hub_tmp_%s" % get_random_string()).lower()
     idxr = ESIndexer(index=index_name,
                      es_host=idxr_obj.host,
                      doc_type=None)
     self.logger.info(
         "Testing mapping by creating index '%s' on host '%s' (settings: %s)",
         index_name, idxr_obj.host, settings)
     try:
         res = idxr.create_index(mapping, settings)
         return res
     except Exception as e:
         self.logger.exception("create_index failed")
         raise e
     finally:
         try:
             idxr.delete_index()
         except Exception:
             pass
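All of the snippets on this page rely on the same small helpers, get_random_string() and get_timestamp(), imported from a shared utility module. A minimal stand-in, assuming get_random_string() only needs to return a short random alphanumeric suffix and get_timestamp() a compact date string, could look like this:

import random
import string
import time

def get_random_string(length=6):
    # hypothetical stand-in: random lowercase/digit suffix used to make
    # index and collection names collision-resistant
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length))

def get_timestamp():
    # hypothetical stand-in: compact date string such as '20240101'
    return time.strftime('%Y%m%d')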
Example #2
 def make_temp_collection(self):
     '''Create a temp collection for dataloading, e.g., entrez_geneinfo_INEMO.'''
     if self.temp_collection_name:
         # already set
         return
     self.temp_collection_name = self.collection_name + '_temp_' + get_random_string()
     return self.temp_collection_name
Example #3
    def make_temp_collection(self):
        '''Create a temp collection for dataloading, e.g., entrez_geneinfo_INEMO.'''

        new_collection = None
        while 1:
            new_collection = self.__collection__ + '_temp_' + get_random_string()
            if new_collection not in self.db.collection_names():
                break
        self.temp_collection = self.db[new_collection]
        return new_collection
Example #4
    def make_temp_collection(self):
        '''Create a temp collection for dataloading, e.g., entrez_geneinfo_INEMO.'''

        new_collection = None
        while 1:
            new_collection = self.__collection__ + '_temp_' + get_random_string()
            if new_collection not in self.db.collection_names():
                break
        self.temp_collection = self.db[new_collection]
        return new_collection
Example #5
 def switch_collection(self):
     '''After a successful loading, rename temp_collection to the regular collection name,
        and rename the existing collection to a temp name for archiving purposes.
     '''
     if self.temp_collection and self.temp_collection.count() > 0:
         if self.collection.count() > 0:
             # renaming existing collections
             new_name = '_'.join([
                 self.__collection__, 'archive',
                 get_timestamp(),
                 get_random_string()
             ])
             self.collection.rename(new_name, dropTarget=True)
         self.temp_collection.rename(self.__collection__)
     else:
         print("Error: load data first.")
Example #6
 def _validate_mapping():
     client = AsyncElasticsearch(**indexer.es_client_args)
     index_name = ("hub_tmp_%s" % get_random_string()).lower()
     try:
         return (yield from client.indices.create(
             index_name,
             body={
                 "settings":
                 (yield from
                  indexer.es_index_settings.finalize(client)),
                 "mappings":
                 (yield from indexer.es_index_mappings.finalize(client))
             }))
     finally:
         yield from client.indices.delete(index_name,
                                          ignore_unavailable=True)
         yield from client.close()
Example #7
    def defer_to_process(self, pinfo=None, func=None, *args):
        @asyncio.coroutine
        def run(future, job_id):
            nonlocal pinfo
            yield from self.check_constraints(pinfo)
            self.ok_to_run.release()
            # pinfo can contain predicates that are hard to pickle for run_in_executor,
            # and we must not modify the caller's original dict, so work on a copy
            copy_pinfo = copy.deepcopy(pinfo)
            copy_pinfo.pop("__predicates__", None)
            self.jobs[job_id] = copy_pinfo
            res = self.loop.run_in_executor(
                self.process_queue,
                partial(do_work, job_id, "process", copy_pinfo, func, *args))

            def ran(f):
                try:
                    # consume future, just to trigger potential exceptions
                    r = f.result()
                finally:
                    # whatever the result, clean the job registry so it stays
                    # in sync with the jobs actually running
                    self.jobs.pop(job_id)

            res.add_done_callback(ran)
            res = yield from res
            # process could generate other parallelized jobs and return a Future/Task
            # If so, we want to make sure we get the results from that task
            if type(res) == asyncio.Task:
                res = yield from res
            future.set_result(res)

        yield from self.ok_to_run.acquire()
        f = asyncio.Future()

        def runned(innerf, job_id):
            if innerf.exception():
                f.set_exception(innerf.exception())

        job_id = get_random_string()
        fut = asyncio.ensure_future(run(f, job_id))
        fut.add_done_callback(partial(runned, job_id=job_id))
        return f
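A sketch of how such a manager method might be driven from another legacy-asyncio coroutine, in the same generator-based style as the example; job_manager and long_running are placeholders, not part of the original code:

import asyncio

def long_running(arg):
    # plain picklable function executed in the process pool
    return "done: %s" % arg

@asyncio.coroutine
def schedule(job_manager):
    pinfo = {"category": "demo", "source": "test", "step": "run",
             "description": "long_running"}
    # defer_to_process returns a future that resolves with the worker's result
    fut = yield from job_manager.defer_to_process(pinfo, long_running, "some_arg")
    result = yield from fut
    return result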
Example #8
 def switch_collection(self):
     '''After a successful loading, rename temp_collection to the regular collection name,
        and rename the existing collection to a temp name for archiving purposes.
     '''
     if self.temp_collection_name and self.db[
             self.temp_collection_name].count() > 0:
         if self.collection_name in self.db.collection_names():
             # renaming existing collections
             new_name = '_'.join([
                 self.collection_name, 'archive',
                 get_timestamp(),
                 get_random_string()
             ])
             self.collection.rename(new_name, dropTarget=True)
         self.logger.info("Renaming collection '%s' to '%s'" %
                          (self.temp_collection_name, self.collection_name))
         self.db[self.temp_collection_name].rename(self.collection_name)
     else:
         raise ResourceError("No temp collection (or it's empty)")
Example #9
    def defer_to_thread(self, pinfo=None, func=None, *args):

        skip_check = pinfo.get("__skip_check__", False)

        @asyncio.coroutine
        def run(future, job_id):
            if not skip_check:
                yield from self.check_constraints(pinfo)
                self.ok_to_run.release()
            self.jobs[job_id] = pinfo
            res = self.loop.run_in_executor(
                self.thread_queue,
                partial(do_work, job_id, "thread", pinfo, func, *args))

            def ran(f):
                try:
                    r = f.result()
                finally:
                    # whatever the result, clean the job registry so it stays
                    # in sync with the jobs actually running
                    self.jobs.pop(job_id)

            res.add_done_callback(ran)
            res = yield from res
            # thread could generate other parallelized jobs and return a Future/Task
            # If so, we want to make sure we get the results from that task
            if type(res) == asyncio.Task:
                res = yield from res
            future.set_result(res)

        if not skip_check:
            yield from self.ok_to_run.acquire()
        f = asyncio.Future()

        def runned(innerf, job_id):
            if innerf.exception():
                f.set_exception(innerf.exception())

        job_id = get_random_string()
        fut = asyncio.ensure_future(run(f, job_id))
        fut.add_done_callback(partial(runned, job_id=job_id))
        return f
Example #10
def backup(folder=".", archive=None):
    """
    Dump the whole hub_db database in given folder. "archive" can be pass
    to specify the target filename, otherwise, it's randomly generated
    Note: this doesn't backup source/merge data, just the internal data
          used by the hub
    """
    # get database name (ie. hub_db internal database)
    db_name = get_src_dump().database.name
    dump = {}
    for getter in [
            get_src_dump, get_src_master, get_src_build, get_src_build_config,
            get_data_plugin, get_api, get_cmd, get_event, get_hub_config
    ]:
        col = getter()
        dump[col.name] = []
        for doc in col.find():
            dump[col.name].append(doc)
    if not archive:
        archive = "backup_%s_%s.pyobj" % (get_timestamp(), get_random_string())
    path = os.path.join(folder, archive)
    dumpobj(dump, path)
    return path
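Assuming the hub_db backend is already configured, a call is a one-liner (the folder path is illustrative):

path = backup(folder="/tmp/hub_backups")
# -> e.g. '/tmp/hub_backups/backup_20240101_x4k9qz.pyobj'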
Example #11
    def func_wrapper(*args, **kwargs):
        ptype = args[0]  # tracking process or thread ?
        # we're looking for a "pinfo" value (process info) for later
        # reporting. If we can't find any, we'll try our best to figure out
        # what this is about...
        # func is the do_work wrapper, we want the actual partial;
        # is the first arg a callable (func) or pinfo?
        if callable(args[1]):
            innerfunc = args[1]
            innerargs = args[2:]
            pinfo = None
        else:
            innerfunc = args[2]
            innerargs = args[3:]
            pinfo = args[1]

        # make sure we can pickle the whole thing (it's just
        # informative, so stringifying is fine here)
        innerargs = [str(arg) for arg in innerargs]
        if type(innerfunc) == partial:
            fname = innerfunc.func.__name__
        elif type(innerfunc) == types.MethodType:
            fname = innerfunc.__self__.__class__.__name__
        else:
            fname = innerfunc.__name__

        firstarg = innerargs and innerargs[0] or ""
        if not pinfo:
            pinfo = {
                "category": None,
                "source": None,
                "step": None,
                "description": "%s %s" % (fname, firstarg)
            }

        worker = {
            'func_name': fname,
            'args': innerargs,
            'kwargs': kwargs,
            'started_at': time.time(),
            'info': pinfo
        }
        results = None
        exc = None
        trace = None
        try:
            _id = None
            rnd = get_random_string()
            if ptype == "thread":
                _id = "%s" % threading.current_thread().getName()
            else:
                _id = os.getpid()
            # add random chars: 2 jobs handled by the same slot (pid or thread)
            # would otherwise overwrite each other's file
            fn = "%s_%s" % (_id, rnd)
            worker["info"]["id"] = _id
            pidfile = os.path.join(config.RUN_DIR, "%s.pickle" % fn)
            pickle.dump(worker, open(pidfile, "wb"))
            results = func(*args, **kwargs)
        except Exception as e:
            import traceback
            trace = traceback.format_exc()
            logger.error("err %s\n%s" % (e, trace))
            # we want to store the exception, so for now just keep a reference
            exc = e
        finally:
            if os.path.exists(pidfile):
                # move to "done" dir and register end of execution time
                os.rename(
                    pidfile,
                    os.path.join(config.RUN_DIR, "done",
                                 os.path.basename(pidfile)))
                pidfile = os.path.join(config.RUN_DIR, "done",
                                       os.path.basename(pidfile))
                worker = pickle.load(open(pidfile, "rb"))
                worker["duration"] = timesofar(worker["started_at"])
                worker["err"] = exc
                worker["trace"] = trace
                # try to keep original exception, but this may fail depending on
                # what's in the exception. If we can't, keep the string representation
                try:
                    pickle.dump(worker, open(pidfile, "wb"))
                except Exception:
                    worker["err"] = str(exc)
                    pickle.dump(worker, open(pidfile, "wb"))
        # now raise original exception
        if exc:
            raise exc
        return results
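The per-job pickle files this wrapper drops in config.RUN_DIR (and moves to a done/ subfolder once finished) can be read back for reporting; a minimal sketch, assuming the same directory layout:

import glob
import os
import pickle

def list_finished_jobs(run_dir):
    # yield the metadata pickled by func_wrapper for each completed job
    for path in glob.glob(os.path.join(run_dir, "done", "*.pickle")):
        with open(path, "rb") as fh:
            worker = pickle.load(fh)
        yield worker["func_name"], worker.get("duration"), worker.get("err")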
Example #12
def id_feeder(col,
              batch_size=1000,
              build_cache=True,
              logger=logging,
              force_use=False,
              force_build=False,
              validate_only=False):
    """Return an iterator for all _ids in collection "col"
       Search for a valid cache file if available, if not
       return a doc_feeder for that collection. Valid cache is
       a cache file that is newer than the collection.
       "db" can be "target" or "src".
       "build_cache" True will build a cache file as _ids are fetched, 
       if no cache file was found
       "force_use" True will use any existing cache file and won't check whether
       it's valid of not.
       "force_build" True will build a new cache even if current one exists
       and is valid.
       "validate_only" will directly return [] if the cache is valid (convenient
       way to check if the cache is valid)
    """
    src_db = get_src_db()
    ts = None
    found_meta = True

    if isinstance(col, DocMongoBackend):
        col = col.target_collection

    try:
        if col.database.name == config.DATA_TARGET_DATABASE:
            info = src_db["src_build"].find_one({"_id": col.name})
            if not info:
                logger.warning(
                    "Can't find information for target collection '%s'" %
                    col.name)
            else:
                ts = info.get("_meta", {}).get("build_date")
                ts = ts and dtparser.parse(ts).timestamp()
        elif col.database.name == config.DATA_SRC_DATABASE:
            src_dump = get_src_dump()
            info = src_dump.find_one({
                "$where":
                "function() {if(this.upload) {for(var index in this.upload.jobs) {if(this.upload.jobs[index].step == \"%s\") return this;}}}"
                % col.name
            })
            if not info:
                logger.warning(
                    "Can't find information for source collection '%s'" %
                    col.name)
            else:
                ts = info["upload"]["jobs"][col.name]["started_at"].timestamp()
        else:
            logger.warning(
                "Can't find metadata for collection '%s' (not a target, not a source collection)"
                % col)
            found_meta = False
            build_cache = False
    except KeyError:
        logger.warning("Couldn't find timestamp in database for '%s'" %
                       col.name)
    except Exception as e:
        logger.info(
            "%s is not a mongo collection, _id cache won't be built (error: %s)"
            % (col, e))
        build_cache = False

    # try to find a cache file
    use_cache = False
    cache_file = None
    cache_format = getattr(config, "CACHE_FORMAT", None)
    if found_meta and getattr(config, "CACHE_FOLDER", None):
        cache_file = get_cache_filename(col.name)
        try:
            # size of empty file differs depending on compression
            empty_size = {None: 0, "xz": 32, "gzip": 25, "bz2": 14}
            if force_build:
                logger.warning("Force building cache file")
                use_cache = False
            # check size, delete if invalid
            elif os.path.getsize(cache_file) <= empty_size.get(
                    cache_format, 32):
                logger.warning("Cache file exists but is empty, delete it")
                os.remove(cache_file)
            elif force_use:
                use_cache = True
                logger.info("Force using cache file")
            else:
                mt = os.path.getmtime(cache_file)
                if ts and mt >= ts:
                    dtmt = datetime.datetime.fromtimestamp(mt).isoformat()
                    dtts = datetime.datetime.fromtimestamp(ts).isoformat()
                    logger.debug(
                        "Cache is valid, modiftime_cache:%s >= col_timestamp:%s"
                        % (dtmt, dtts))
                    use_cache = True
                else:
                    logger.info("Cache is too old, discard it")
        except FileNotFoundError:
            pass
    if use_cache:
        logger.debug("Found valid cache file for '%s': %s" %
                     (col.name, cache_file))
        if validate_only:
            logging.debug("Only validating cache, now return")
            return []
        with open_compressed_file(cache_file) as cache_in:
            if cache_format:
                iocache = io.TextIOWrapper(cache_in)
            else:
                iocache = cache_in
            for ids in iter_n(iocache, batch_size):
                yield [_id.strip() for _id in ids if _id.strip()]
    else:
        logger.debug(
            "No cache file found (or invalid) for '%s', use doc_feeder" %
            col.name)
        cache_out = None
        cache_temp = None
        if getattr(config, "CACHE_FOLDER",
                   None) and config.CACHE_FOLDER and build_cache:
            if not os.path.exists(config.CACHE_FOLDER):
                os.makedirs(config.CACHE_FOLDER)
            cache_temp = "%s._tmp_" % cache_file
            # clean aborted cache file generation
            for tmpcache in glob.glob(
                    os.path.join(config.CACHE_FOLDER, "%s*" % cache_temp)):
                logger.info("Removing aborted cache file '%s'" % tmpcache)
                os.remove(tmpcache)
            # use temp file and rename once done
            cache_temp = "%s%s" % (cache_temp, get_random_string())
            cache_out = get_compressed_outfile(cache_temp,
                                               compress=cache_format)
            logger.info("Building cache file '%s'" % cache_temp)
        else:
            logger.info(
                "Can't build cache, cache not allowed or no cache folder")
            build_cache = False
        if isinstance(col, Collection):
            doc_feeder_func = partial(doc_feeder,
                                      col,
                                      step=batch_size,
                                      inbatch=True,
                                      fields={"_id": 1})
        elif isinstance(col, DocMongoBackend):
            doc_feeder_func = partial(doc_feeder,
                                      col.target_collection,
                                      step=batch_size,
                                      inbatch=True,
                                      fields={"_id": 1})
        elif isinstance(col, DocESBackend):
            # get_id_list directly return the _id, wrap it to match other
            # doc_feeder_func returned vals. Also return a batch of id
            def wrap_id():
                ids = []
                for _id in col.get_id_list(step=batch_size):
                    ids.append({"_id": _id})
                    if len(ids) >= batch_size:
                        yield ids
                        ids = []
                if ids:
                    yield ids

            doc_feeder_func = partial(wrap_id)
        else:
            raise Exception("Unknown backend %s" % col)
        for doc_ids in doc_feeder_func():
            doc_ids = [str(_doc["_id"]) for _doc in doc_ids]
            if build_cache:
                strout = "\n".join(doc_ids) + "\n"
                if cache_format:
                    # assuming binary format (because compressed)
                    cache_out.write(strout.encode())
                else:
                    cache_out.write(strout)
            yield doc_ids
        if build_cache:
            cache_out.close()
            cache_final = os.path.splitext(cache_temp)[0]
            os.rename(cache_temp, cache_final)
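Typical consumption just iterates over the batches of _ids the feeder yields; a minimal sketch, assuming col is a MongoDB collection object:

total = 0
for _ids in id_feeder(col, batch_size=1000):
    # each batch is a list of up to batch_size _id strings
    total += len(_ids)
print("fed %d _ids" % total)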
Example #13
 def _get_target_name(self):
     return 'genedoc_{}_{}_{}'.format(self._build_config['name'],
                                      get_timestamp(),
                                      get_random_string()).lower()
Example #14
 def switch_collection(self):
     '''After a successful loading, rename temp_collection to the regular collection name,
        and rename the existing collection to a temp name for archiving purposes.
     '''
     if self.temp_collection and self.temp_collection.count() > 0:
         if self.collection.count() > 0:
             # renaming existing collections
             new_name = '_'.join([self.__collection__, 'archive', get_timestamp(), get_random_string()])
             self.collection.rename(new_name, dropTarget=True)
         self.temp_collection.rename(self.__collection__)
     else:
         print("Error: load data first.")
Example #15
 def generate_target_name(self, build_config_name):
     assert build_config_name is not None
     return '{}_{}_{}'.format(build_config_name, get_timestamp(),
                              get_random_string()).lower()
Example #16
def sync_from_one_diff(index, collection, diff_filepath, validate=False, wait=60, dryrun=False, returncnt=False, save2file=None):
    sync = ESSyncer(index=index)
    #sync._index = index
    #sync._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except Exception:
            # best effort: keep going even if some additions fail
            pass
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # add flush and refresh
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except Exception:
        # flush/refresh are not critical, ignore failures
        pass

    print("="*20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        print("Done. [{}]".format(t0))
        return diff_result
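A hedged invocation sketch; the index, collection and diff file names below are placeholders:

cnt = sync_from_one_diff(
    index="myvariant_current",
    collection="clinvar",
    diff_filepath="./diffs/diff_1.pyobj",
    dryrun=True,       # report what would change without touching Elasticsearch
    returncnt=True,
)
print(cnt)             # e.g. {'add': 10, 'delete': 2, 'update': 5}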
Example #17
 def generate_target_name(self, build_config_name):
     return 'genedoc_{}_{}_{}'.format(build_config_name, get_timestamp(),
                                      get_random_string()).lower()
Example #18
 def _get_target_name(self):
     return 'genedoc_{}_{}_{}'.format(self._build_config['name'],
                                      get_timestamp(), get_random_string()).lower()