Example 1
 def load_data(self,data_folder):
     self.logger.info("Load data from '%s'" % data_folder)
     input_file = os.path.join(data_folder,"ChEBI_complete.sdf")
     # get other source collections for InChIKey conversion
     drugbank_col = get_src_db()["drugbank"]
     assert drugbank_col.count() > 0, "'drugbank' collection is empty (required for inchikey " + \
             "conversion). Please run 'drugbank' uploader first"
     chembl_col = get_src_db()["chembl"]
     assert chembl_col.count() > 0, "'chembl' collection is empty (required for inchikey " + \
             "conversion). Please run 'chembl' uploader first"
     assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
     return self.exclude_fields(self.keylookup(load_data))(input_file)
Example 2
    def __init__(self, build_config=None, backend='mongodb'):
        self.src = get_src_db()
        self.step = 10000
        self.use_parallel = False
        self.merge_logging = True  # save output into a logging file when merge is called.
        self.max_build_status = 10  # max no. of records kept in "build" field of src_build collection.

        self.using_ipython_cluster = False
        self.shutdown_ipengines_after_done = False
        self.log_folder = LOG_FOLDER

        self._build_config = build_config
        self._entrez_geneid_d = None
        self._idmapping_d_cache = {}

        self.get_src_master()

        if backend == 'mongodb':
            self.target = databuild.backend.GeneDocMongoDBBackend()
        elif backend == 'es':
            self.target = databuild.backend.GeneDocESBackend(ESIndexer())
        elif backend == 'couchdb':
            from config import COUCHDB_URL
            import couchdb
            self.target = databuild.backend.GeneDocCouchDBBackend(
                couchdb.Server(COUCHDB_URL))
        elif backend == 'memory':
            self.target = databuild.backend.GeneDocMemeoryBackend()
        else:
            raise ValueError('Invalid backend "%s".' % backend)
Example 3
 def resolve_sources(self, sources):
     """
     Source can be a string that may contain regex chars. It's usefull
     when you have plenty of sub-collections prefixed with a source name.
     For instance, given a source named 'blah' stored in as many collections
     as chromosomes, insteand of passing each name as 'blah_1', 'blah_2', etc...
     'blah_.*' can be specified in build_config. This method resolves potential
     regexed source name into real, existing collection names
     """
     if type(sources) == str:
         sources = [sources]
     src_db = mongo.get_src_db()
     cols = src_db.collection_names()
     masters = self.source_backend.get_src_master_docs()
     found = []
     for src in sources:
         # check if master _id and name are different (meaning name is a regex)
         master = masters.get(src)
         if not master:
             raise BuilderException("'%s' could not be found in master documents (%s)" % \
                     (src,repr(list(masters.keys()))))
         search = src
         if master["_id"] != master["name"]:
             search = master["name"]
         # restrict pattern to minimal match
         pat = re.compile("^%s$" % search)
         for col in cols:
             if pat.match(col):
                 found.append(col)
     return found
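
The docstring above explains how a regex source name expands into real collection names; below is a minimal, self-contained sketch of just that matching step, with made-up collection names (no MongoDB needed):

import re

# hypothetical sub-collections, e.g. one per chromosome, plus an unrelated source
cols = ["blah_1", "blah_2", "blah_X", "other_src"]
# anchoring with ^...$ forces a full-name match, so "blah" alone would not match "blah_1"
pat = re.compile("^%s$" % "blah_.*")
print([col for col in cols if pat.match(col)])   # ['blah_1', 'blah_2', 'blah_X']
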
Example 4
def merger_worker(col_name, dest_name, ids, mapper, upsert, batch_num):
    try:
        src = mongo.get_src_db()
        tgt = mongo.get_target_db()
        col = src[col_name]
        dest = DocMongoBackend(tgt, tgt[dest_name])
        cur = doc_feeder(col,
                         step=len(ids),
                         inbatch=False,
                         query={'_id': {
                             '$in': ids
                         }})
        mapper.load()
        docs = mapper.process(cur)
        cnt = dest.update(docs, upsert=upsert)
        return cnt
    except Exception as e:
        logger_name = "build_%s_%s_batch_%s" % (dest_name, col_name, batch_num)
        logger = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception(e)
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump(e, open(exc_fn, "wb"))
        logger.info("Exception was dumped in pickle file '%s'" % exc_fn)
        raise
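
When a batch fails, the worker above pickles the exception next to its log file; here is a small sketch of how such a "*.pick" file could be loaded afterwards for inspection (the file name is hypothetical, following the "build_<dest>_<col>_batch_<num>.pick" naming above):

import pickle

exc_fn = "build_mydrugs_chembl_batch_3.pick"   # hypothetical dump produced by a failed batch
with open(exc_fn, "rb") as f:
    exc = pickle.load(f)
print(type(exc).__name__, exc)                 # re-inspect the original exception object
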
Example 5
 def do(srcs, tgt):
     pinfo = {
         "category": "cache",
         "source": None,
         "step": "rebuild",
         "description": ""
     }
     config.logger.info("Rebuild cache for sources: %s, target: %s" %
                        (srcs, tgt))
     for src in srcs:
         # src can be a full name (eg. clinvar.clinvar_hg38) but id_feeder knows only name (clinvar_hg38)
         if "." in src:
             src = src.split(".")[1]
         config.logger.info("Rebuilding cache for source '%s'" % src)
         col = mongo.get_src_db()[src]
         pinfo["source"] = src
         job = yield from job_manager.defer_to_thread(
             pinfo, partial(rebuild, col))
         yield from job
         config.logger.info("Done rebuilding cache for source '%s'" % src)
     if tgt:
         config.logger.info("Rebuilding cache for target '%s'" % tgt)
         col = mongo.get_target_db()[tgt]
         pinfo["source"] = tgt
         job = job_manager.defer_to_thread(pinfo, partial(rebuild, col))
         yield from job
Example 6
    def __init__(self, build_config=None, backend='mongodb'):
        self.src = get_src_db()
        self.step = 10000
        self.use_parallel = False
        self.merge_logging = True     # save output into a logging file when merge is called.
        self.max_build_status = 10    # max no. of records kept in "build" field of src_build collection.

        self.using_ipython_cluster = False
        self.shutdown_ipengines_after_done = False
        self.log_folder = LOG_FOLDER

        self._build_config = build_config
        self._entrez_geneid_d = None
        self._idmapping_d_cache = {}

        self.get_src_master()

        if backend == 'mongodb':
            self.target = databuild.backend.GeneDocMongoDBBackend()
        elif backend == 'es':
            self.target = databuild.backend.GeneDocESBackend(ESIndexer())
        elif backend == 'couchdb':
            from config import COUCHDB_URL
            import couchdb
            self.target = databuild.backend.GeneDocCouchDBBackend(couchdb.Server(COUCHDB_URL))
        elif backend == 'memory':
            self.target = databuild.backend.GeneDocMemeoryBackend()
        else:
            raise ValueError('Invalid backend "%s".' % backend)
Example 7
 def prepare_collection(self):
     """
     Load the mongodb collection specified by collection_name.
     :return:
     """
     self._state["collection"] = mongo.get_src_db()[self.collection_name]
     self.logger.info("Registering collection:  {}".format(
         self.collection_name))
Example 8
 def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
     self._es = get_es(es_host)
     self._index = index or config.ES_INDEX_NAME
     self._doc_type = doc_type or config.ES_DOC_TYPE
     self._esi = ESIndexer(es_host=es_host)
     self._esi._index = self._index
     self._src = get_src_db()
     self.step = step
Example 9
 def load_data(self, data_folder):
     input_file = os.path.join(data_folder,
                               "merged_freq_all_se_indications.tsv")
     self.logger.info("Load data from file '%s'" % input_file)
     pubchem_col = get_src_db()["pubchem"]
     assert pubchem_col.count() > 0, "'pubchem' collection is empty (required for inchikey " + \
             "conversion). Please run 'pubchem' uploader first"
     return load_data(input_file, pubchem_col)
Example 10
 def load_data(self, data_folder):
     self.logger.info("Load data from '%s'" % data_folder)
     input_file = os.path.join(data_folder, "drugs.tsv")
     assert os.path.exists(
         input_file), "Can't find input file '%s'" % input_file
     # get other source collections for InChIKey conversion
     drugbank_col = get_src_db()["drugbank"]
     assert drugbank_col.count() > 0, "'drugbank' collection is empty (required for inchikey " + \
             "conversion). Please run 'drugbank' uploader first"
     pubchem_col = get_src_db()["pubchem"]
     assert pubchem_col.count() > 0, "'pubchem' collection is empty (required for inchikey " + \
             "conversion). Please run 'pubchem' uploader first"
     chembl_col = get_src_db()["chembl"]
     assert chembl_col.count() > 0, "'chembl' collection is empty (required for inchikey " + \
             "conversion). Please run 'chembl' uploader first"
     chebi_col = get_src_db()["chebi"]
     assert chebi_col.count() > 0, "'chebi' collection is empty (required for inchikey " + \
             "conversion). Please run 'chebi' uploader first"
     return load_data(input_file, drugbank_col, pubchem_col, chembl_col,
                      chebi_col)
Example 11
 def load(self):
     if self.cache is None:
         col = mongo.get_src_db()[TaxonomyNodesUploader.name]
         self.cache = {}
         [
             self.cache.setdefault(d["taxid"], d["parent_taxid"])
             for d in col.find({}, {
                 "parent_taxid": 1,
                 "taxid": 1
             })
         ]
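
The list comprehension above is evaluated only for its setdefault side effect; assuming the same mongo helper, uploader class and taxid/parent_taxid documents, a dict comprehension is an equivalent, arguably clearer sketch:

col = mongo.get_src_db()[TaxonomyNodesUploader.name]
# same taxid -> parent_taxid mapping, built without a throw-away list
cache = {d["taxid"]: d["parent_taxid"]
         for d in col.find({}, {"parent_taxid": 1, "taxid": 1})}
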
Example 12
    def validate_src(self,
                     collection,
                     return_false=False,
                     return_none=False,
                     return_true=False,
                     verbose=False,
                     flag_invalid=False,
                     generator=False):
        '''Validate hgvs ids from a src collection.'''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # read in the collection from mongodb
        if is_str(collection):
            src = get_src_db()
            _coll = src[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        out = {}
        print_only = not (return_false or return_none or return_true)
        if not print_only:
            # output dictionary, three keys: 'false','true','none'
            for k in return_dict:
                if return_dict[k]:
                    out[k] = []

        # initialize the count
        cnt_d = {True: 0, False: 0, None: 0}  # cnt_d
        # validate each item in the cursor
        for item in cursor:
            _id = item['_id']
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid is False and flag_invalid:
                _coll.update({"_id": _id},
                             {'$set': {"unmatched_ref": "True"}})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
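
A hedged usage sketch: assuming `validator` is an instance of the class defining validate_src, collecting the invalid ids of a source collection could look like this (the collection name is only an example):

result = validator.validate_src("dbsnp_hg19", return_false=True)
invalid_ids = result[False]     # _ids whose HGVS validation returned False
print(result['summary'])        # e.g. {True: 12000, False: 34, None: 2}
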
Example 13
def create_backend(db_col_names,name_only=False,**kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (by default, will lookup a mongo collection in target database)
    - or a tuple("target|src","col_name")
    - or a ("mongodb://*****:*****@host","db","col_name") URI.
    - or a ("es_host:port","index_name","doc_type")
    If name_only is true, just return the name uniquely identifying the collection or index
    URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if type(db_col_names) == str:
        db = mongo.get_target_db()
        col = db[db_col_names]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST,db.client.PORT),db.name,col.name]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(db_col_names) == 3, "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST,db.client.PORT),db.name,col.name]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1],doc_type=db_col_names[2],es_host=db_col_names[0],**kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(db_col_names) == 2, "Missing connection information for %s" % repr(db_col_names)
        db = db_col_names[0] == "target" and mongo.get_target_db() or mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0:host, 1:port)
        db_col_names = ["%s:%s" % (db.client.address[0],db.client.address[1]),db.name,col.name]
    assert col is not None, "Could not create collection object from %s" % repr(db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(":","_"),
                                      db_col_names[1],db_col_names[2])
        else:
            return "es_%s_%s_%s" % (db_col_names[0].replace(":","_"),
                                    db_col_names[1],db_col_names[2])
    else:
        if is_mongo:
            return DocMongoBackend(db,col)
        else:
            return DocESBackend(db)
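
To make the docstring concrete, here are hypothetical calls showing the accepted shapes of db_col_names (all names are placeholders):

backend = create_backend("mydrugs_merged")                           # target collection name
backend = create_backend(("src", "chembl"))                          # ("target|src", col_name)
backend = create_backend(("mongodb://user:pw@host", "db", "col"))    # MongoDB URI form
backend = create_backend(("localhost:9200", "myindex", "drug"))      # ES host, index, doc_type
name = create_backend(("src", "chembl"), name_only=True)             # e.g. "mongo_<host>_<port>_<db>_<col>"
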
Example 14
def load_data(input_file):
    src_db = mongo.get_src_db()
    if not "dbsnp_hg19" in src_db.collection_names():
        raise ValueError("'dbsnp_hg19' collection is missing, run dbsnp uploader first")
    dbsnp_col = src_db["dbsnp_hg19"]
    open_file = open(input_file,encoding="cp1252")
    open_file = csv.reader(open_file, delimiter="\t")
    next(open_file)
    grasp = map(row_generator, open_file)
    grasp = filter(lambda row: row[58] != "", grasp)
    json_rows = map(partial(_map_line_to_json,dbsnp_col=dbsnp_col), grasp)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    for row in (merge_duplicate_rows(rg, "grasp") for rg in row_groups):
        yield row
Example 15
 def _load_collections(self, collections):
     """
     Load all mongodb collections specified in the configuration data structure col_keys.
     :return:
     """
     self.collections = {}
     for col in collections:
         collection = mongo.get_src_db()[col]
         if collection.count() > 0:
             self.collections[col] = collection
             kl_log.info("Registering collection:  {} (count:  {})".format(
                 col, collection.count()))
     if not self.collections:
         raise ValueError(
             "At least one configured collection is required for MongoDB key lookup."
         )
Example 16
def load_data(input_file):
    src_db = mongo.get_src_db()
    if not "dbsnp_hg19" in src_db.collection_names():
        raise ValueError(
            "'dbsnp_hg19' collection is missing, run dbsnp uploader first")
    dbsnp_col = src_db["dbsnp_hg19"]
    open_file = open(input_file, encoding="cp1252")
    open_file = csv.reader(open_file, delimiter="\t")
    next(open_file)
    grasp = map(row_generator, open_file)
    grasp = filter(lambda row: row[58] != "", grasp)
    json_rows = map(partial(_map_line_to_json, dbsnp_col=dbsnp_col), grasp)
    json_rows = (row for row in json_rows if row)
    row_groups = (it
                  for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    for row in (merge_duplicate_rows(rg, "grasp") for rg in row_groups):
        yield row
Example 17
    def validate_src(self, collection, return_false=False,
                     return_none=False, return_true=False, verbose=False, flag_invalid=False, generator=False):
        '''Validate hgvs ids from a src collection.'''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # read in the collection from mongodb
        if is_str(collection):
            src = get_src_db()
            _coll = src[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        out = {}
        print_only = not (return_false or return_none or return_true)
        if not print_only:
            # output dictionary, three keys: 'false','true','none'
            for k in return_dict:
                if return_dict[k]:
                    out[k] = []

        # initialize the count
        cnt_d = {True: 0, False: 0, None: 0}    # cnt_d
        # validate each item in the cursor
        for item in cursor:
            _id = item['_id']
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid is False and flag_invalid:
                _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
Example 18
 def __init__(self, log_dir=None, date=None, dry_run=False):
     self.log_dir = log_dir if log_dir else os.getcwd()
     d = datetime.now()
     self.date = date if date else "".join(map(str, [d.year, d.month, d.day]))
     self.dry_run = dry_run
     self.login_instance = PBB_login.WDLogin(user=WDUSER, pwd=WDPASS)
     self.fast_run_base_filter = {self.DOID_PROP: ''}
     self.info_log_path = None
     self.exc_log_path = None
     self.reference = None
     self.setup_logging()
     self.collection = get_src_db().mondo
     src_dump = get_src_dump()
     src_doc = src_dump.find_one({'_id': 'mondo'}) or {}
     self.retrieved = src_doc.get("download", {}).get("started_at", False) or datetime.now()
     self.ref_url = "https://github.com/monarch-initiative/monarch-disease-ontology/raw/{}/src/mondo/mondo.obo".format(
         src_doc.get("release", "master"))
     self.create_reference()
Example 19
 def do(srcs,tgt):
     pinfo = {"category" : "cache",
             "source" : None,
             "step" : "rebuild",
             "description" : ""}
     config.logger.info("Rebuild cache for sources: %s, target: %s" % (srcs,tgt))
     for src in srcs:
         # src can be a full name (eg. clinvar.clinvar_hg38) but id_feeder knows only name (clinvar_hg38)
         if "." in src:
             src = src.split(".")[1]
         config.logger.info("Rebuilding cache for source '%s'" % src)
         col = mongo.get_src_db()[src]
         pinfo["source"] = src
         job = yield from job_manager.defer_to_thread(pinfo, partial(rebuild,col))
         yield from job
         config.logger.info("Done rebuilding cache for source '%s'" % src)
     if tgt:
         config.logger.info("Rebuilding cache for target '%s'" % tgt)
         col = mongo.get_target_db()[tgt]
         pinfo["source"] = tgt
         job = job_manager.defer_to_thread(pinfo, partial(rebuild,col))
         yield from job
Example 20
 def do(srcs, tgt):
     pinfo = {
         "category": "cache",
         "source": None,
         "step": "rebuild",
         "description": ""
     }
     config.logger.info("Rebuild cache for sources: %s, target: %s" %
                        (srcs, tgt))
     for src in srcs:
         config.logger.info("Rebuilding cache for source '%s'" % src)
         col = mongo.get_src_db()[src]
         pinfo["source"] = src
         job = yield from job_manager.defer_to_thread(
             pinfo, partial(rebuild, col))
         yield from job
         config.logger.info("Done rebuilding cache for source '%s'" % src)
     if tgt:
         config.logger.info("Rebuilding cache for target '%s'" % tgt)
         col = mongo.get_target_db()[tgt]
         pinfo["source"] = tgt
         job = job_manager.defer_to_thread(pinfo, partial(rebuild, col))
         yield from job
Example 21
def export_ids(col_name):
    """
    Export all _ids from collection named col_name.
    If col_name refers to a build where a cold_collection is defined,
    will also extract _ids and sort/uniq them to have the full list of _ids
    of the actual merged (cold+hot) collection.
    The output file is stored in DATA_EXPORT_FOLDER/ids,
    defaulting to <DATA_ARCHIVE_ROOT>/export/ids. The output filename is
    returned at the end, if successful.
    """
    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig,"DATA_EXPORT_FOLDER",None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT,"export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER,"ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)
    build = get_src_build().find_one({"_id":col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config",{}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'" % (cold_name,col_name))
    else:
        # it's a src
        col = get_src_db()[col_name]
    
    # first iterate over all _ids. This will potentially update the underlying _id cache if it's not valid anymore,
    # so we're sure to work with the latest data. If the cache is valid, this will be pretty fast
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col,validate_only=True):
        pass
    # now accessing cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold,validate_only=True):
            pass
        # now accessing cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)
    outfn = os.path.join(ids_export_folder,"%s_ids.xz" % col_name)
    # NOTE: can't use anyfile to open cache files and send _id through pipes
    # because it would load _id in memory (unless using hacks) so use cat (and
    # existing uncompressing ones, like gzcat/xzcat/...) to fully run the pipe
    # on the shell
    if cold:
        fout = anyfile(outfn,"wb")
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        assert colext == coldext, "Hot and cold _id caches are compressed differently (%s and %s), they should be the same" % (colext,coldext)
        comp = colext.replace(".","")
        supportedcomps = ["xz","gz",""] # no compression allowed as well
        assert comp in supportedcomps, "Compression '%s' isn't supported (%s)" % (comp,supportedcomps)
        # IDs sent to pipe's input (sort) then compress it (xz)
        pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache],stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort","-u"],stdin=pcat.stdout,stdout=subprocess.PIPE,universal_newlines=True)
        pcat.stdout.close() # will raise end of pipe error when finished
        if comp:
            pcomp = subprocess.Popen(["xz","-c"],stdin=psort.stdout,stdout=fout)
        else:
            # just print stdin to stdout
            pcomp = subprocess.Popen(["tee"],stdin=psort.stdout,stdout=fout)
        psort.stdout.close()
        try:
            logging.info("Running pipe to compute list of unique _ids")
            (out,err) = pcomp.communicate() # run the pipe! (blocking)
            if err:
                raise Exception(err)
        except Exception as e:
            logging.error("Error while running pipe to export _ids: %s" % e)
            # make sure to clean empty or half processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache,outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean empty or half processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise

    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn
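
The shell pipe used above (cat the hot and cold _id caches, sort -u, recompress) can be sketched in isolation; the file names below are made up and xz compression is assumed:

import subprocess

with open("merged_ids.xz", "wb") as fout:
    # xzcat both caches into sort -u, then recompress the unique ids with xz
    pcat = subprocess.Popen(["xzcat", "hot_ids.xz", "cold_ids.xz"], stdout=subprocess.PIPE)
    psort = subprocess.Popen(["sort", "-u"], stdin=pcat.stdout, stdout=subprocess.PIPE)
    pcat.stdout.close()                     # let sort see end-of-pipe when xzcat finishes
    pxz = subprocess.Popen(["xz", "-c"], stdin=psort.stdout, stdout=fout)
    psort.stdout.close()
    pxz.communicate()                       # run the pipe (blocking)
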
Example 22
 def __init__(self, db, dest_col_name, logger=logging):
     db = db or get_src_db()
     self.temp_collection = db[dest_col_name]
     self.logger = logger
Example 23
def create_backend(db_col_names, name_only=False, follow_ref=False, **kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (will first check the src_build doc for a
      backend_url field; if nothing there, will look up a mongo collection
      in the target database)
    - or a tuple("target|src","col_name")
    - or a ("mongodb://*****:*****@host","db","col_name") URI.
    - or a ("es_host:port","index_name","doc_type")
    If name_only is true, just return the name uniquely identifying the collection or index
    URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if type(db_col_names) == str:
        # first check the build doc; if there's a backend_url key, we'll use it instead of
        # directly using db_col_names as the target collection (see LinkDataBuilder)
        bdoc = get_src_build().find_one({"_id": db_col_names})
        if follow_ref and bdoc and bdoc.get(
                "backend_url") and bdoc["backend_url"] != db_col_names:
            return create_backend(bdoc["backend_url"],
                                  name_only=name_only,
                                  follow_ref=follow_ref,
                                  **kwargs)
        else:
            db = mongo.get_target_db()
            col = db[db_col_names]
            # normalize params
            db_col_names = [
                "%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name
            ]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(
            db_col_names
        ) == 3, "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = [
            "%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name
        ]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1],
                         doc_type=db_col_names[2],
                         es_host=db_col_names[0],
                         **kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(
            db_col_names
        ) == 2, "Missing connection information for %s" % repr(db_col_names)
        db = db_col_names[0] == "target" and mongo.get_target_db(
        ) or mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0:host, 1:port)
        db_col_names = [
            "%s:%s" % (db.client.address[0], db.client.address[1]), db.name,
            col.name
        ]
    assert col is not None, "Could not create collection object from %s" % repr(
        db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(
                ":", "_"), db_col_names[1], db_col_names[2])
        else:
            return "es_%s_%s_%s" % (db_col_names[0].replace(
                ":", "_"), db_col_names[1], db_col_names[2])
    else:
        if is_mongo:
            return DocMongoBackend(db, col)
        else:
            return DocESBackend(db)
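
The only behavioral addition over the earlier version is follow_ref; a hedged sketch, assuming a build doc whose backend_url points at another collection (names are hypothetical):

# if src_build contains {"_id": "mybuild", "backend_url": "mybuild_20200101", ...},
# this resolves to the backend of "mybuild_20200101" instead of "mybuild" itself
backend = create_backend("mybuild", follow_ref=True)
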
Example 24
 def __init__(self):
     self._src = get_src_db()
Example 25
 def __init__(self, db_info, dest_col_name, logger):
     db = get_src_db()
     self.temp_collection = db[dest_col_name]
     self.logger = logger
Example 26
 def load(self):
     if self.cache is None:
         col = mongo.get_src_db()[TaxonomyNodesUploader.name]
         self.cache = {}
         [self.cache.setdefault(d["taxid"],d["parent_taxid"]) for d in col.find({},{"parent_taxid":1,"taxid":1})]
Example 27
    def setUp(self):
        """
        Setup the mongodb structure for the tests
        :return:
        """

        # Collections for the first test
        self.db = mongo.get_src_db()
        self.db.create_collection('a')
        self.db.create_collection('b')
        self.db.create_collection('c')
        self.db.create_collection('d')
        self.db.create_collection('e')

        self.db['b'].insert({'b_id': 'b:1234', 'a_id': 'a:1234'})
        self.db['c'].insert({
            'c_id': 'c:1234',
            'b_id': 'b:1234',
            'e_id': 'e:1234'
        })
        self.db['d'].insert({'d_id': 'd:1234', 'c_id': 'c:1234'})
        self.db['e'].insert({'e_id': 'e:1234', 'd_id': 'd:1234'})

        # Collections for the second test (one2many)
        self.db.create_collection('aa')
        self.db.create_collection('bb')
        self.db.create_collection('cc')

        self.db['bb'].insert({'b_id': 'b:1234', 'a_id': 'a:1234'})
        self.db['bb'].insert({'b_id': 'b:5678', 'a_id': 'a:1234'})
        self.db['cc'].insert({'c_id': 'c:1234', 'b_id': 'b:1234'})
        self.db['cc'].insert({'c_id': 'c:01', 'b_id': 'b:5678'})
        self.db['cc'].insert({'c_id': 'c:02', 'b_id': 'b:5678'})

        # Collections for the path weight test
        self.db = mongo.get_src_db()
        self.db.create_collection('aaa')
        self.db.create_collection('bbb')
        self.db.create_collection('ccc')
        self.db.create_collection('ddd')
        self.db.create_collection('eee')

        self.db['bbb'].insert({
            'b_id': 'b:1234',
            'a_id': 'a:1234',
            'e_id': 'e:5678'
        })
        self.db['ccc'].insert({'c_id': 'c:1234', 'b_id': 'b:1234'})
        self.db['ddd'].insert({'d_id': 'd:1234', 'c_id': 'c:1234'})
        self.db['eee'].insert({'e_id': 'e:1234', 'd_id': 'd:1234'})

        # Collections for the mix mongodb and api test
        self.db = mongo.get_src_db()
        self.db.create_collection('mix1')
        self.db.create_collection('mix3')

        self.db['mix1'].insert({
            'ensembl': 'ENSG00000123374',
            'start_id': 'start1'
        })
        self.db['mix3'].insert({'end_id': 'end1', 'entrez': '1017'})

        # Collections for lookup failure
        self.db['b'].insert({'b_id': 'b:f1', 'a_id': 'a:f1'})
        self.db['c'].insert({'c_id': 'c:f1', 'b_id': 'b:f1'})
        self.db['d'].insert({'d_id': 'd:fail1', 'c_id': 'c:f1'})
        self.db['e'].insert({'e_id': 'e:f1', 'd_id': 'd:f1'})
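
The original snippet stops at setUp; a hypothetical tearDown dropping everything created above might look like this (not part of the original code):

    def tearDown(self):
        # drop all collections created in setUp so each test starts clean (hypothetical cleanup)
        for name in ['a', 'b', 'c', 'd', 'e', 'aa', 'bb', 'cc',
                     'aaa', 'bbb', 'ccc', 'ddd', 'eee', 'mix1', 'mix3']:
            self.db.drop_collection(name)
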
Example 28
 def __init__(self):
     self._src = get_src_db()
Example 29
 def load_data(self, data_folder):
     drugbank_col = get_src_db()["drugbank"]
     assert drugbank_col.count() > 0, "'drugbank' collection is empty (required for inchikey " + \
             "conversion). Please run 'drugbank' uploader first"
     return load_data(data_folder, drugbank_col)
Example 30
 def load(self):
     if self.cache is None:
         # cache the whole list of taxonomy _ids
         col = mongo.get_src_db()[GeneInfoUploader.name]
         self.cache = [d["_id"] for d in col.find({},{"_id":1})]
Example 31
 def load(self):
     if self.cache is None:
         # cache the whole list of taxonomy _ids
         col = mongo.get_src_db()[GeneInfoUploader.name]
         self.cache = [d["_id"] for d in col.find({},{"_id":1})]