def load_data(self, data_folder):
    self.logger.info("Load data from '%s'" % data_folder)
    input_file = os.path.join(data_folder, "ChEBI_complete.sdf")
    # get other source collections for inchikey conversion
    drugbank_col = get_src_db()["drugbank"]
    assert drugbank_col.count() > 0, "'drugbank' collection is empty (required for inchikey " + \
        "conversion). Please run 'drugbank' uploader first"
    chembl_col = get_src_db()["chembl"]
    assert chembl_col.count() > 0, "'chembl' collection is empty (required for inchikey " + \
        "conversion). Please run 'chembl' uploader first"
    assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
    return self.exclude_fields(self.keylookup(load_data))(input_file)

def __init__(self, build_config=None, backend='mongodb'):
    self.src = get_src_db()
    self.step = 10000
    self.use_parallel = False
    self.merge_logging = True     # save output into a logging file when merge is called.
    self.max_build_status = 10    # max no. of records kept in "build" field of src_build collection.
    self.using_ipython_cluster = False
    self.shutdown_ipengines_after_done = False
    self.log_folder = LOG_FOLDER
    self._build_config = build_config
    self._entrez_geneid_d = None
    self._idmapping_d_cache = {}
    self.get_src_master()

    if backend == 'mongodb':
        self.target = databuild.backend.GeneDocMongoDBBackend()
    elif backend == 'es':
        self.target = databuild.backend.GeneDocESBackend(ESIndexer())
    elif backend == 'couchdb':
        from config import COUCHDB_URL
        import couchdb
        self.target = databuild.backend.GeneDocCouchDBBackend(couchdb.Server(COUCHDB_URL))
    elif backend == 'memory':
        self.target = databuild.backend.GeneDocMemeoryBackend()
    else:
        raise ValueError('Invalid backend "%s".' % backend)

def resolve_sources(self, sources):
    """
    Source can be a string that may contain regex chars. It's useful
    when you have plenty of sub-collections prefixed with a source name.
    For instance, given a source named 'blah' stored in as many collections
    as chromosomes, instead of passing each name as 'blah_1', 'blah_2', etc...,
    'blah_.*' can be specified in build_config. This method resolves potential
    regexed source names into real, existing collection names.
    """
    if type(sources) == str:
        sources = [sources]
    src_db = mongo.get_src_db()
    cols = src_db.collection_names()
    masters = self.source_backend.get_src_master_docs()
    found = []
    for src in sources:
        # check if master _id and name are different (meaning name is a regex)
        master = masters.get(src)
        if not master:
            raise BuilderException("'%s' could not be found in master documents (%s)" %
                                   (src, repr(list(masters.keys()))))
        search = src
        if master["_id"] != master["name"]:
            search = master["name"]
        # restrict pattern to minimal match
        pat = re.compile("^%s$" % search)
        for col in cols:
            if pat.match(col):
                found.append(col)
    return found

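# Illustrative, standalone sketch of the regex-based name resolution used in
# resolve_sources() above, using only the standard "re" module (no MongoDB).
# The pattern and collection names below are made-up examples.
import re

def resolve_pattern(pattern, collection_names):
    """Return the collection names fully matched by `pattern` (anchored as ^pattern$)."""
    pat = re.compile("^%s$" % pattern)
    return [name for name in collection_names if pat.match(name)]

print(resolve_pattern("blah_.*", ["blah_1", "blah_2", "blah_X", "other_src"]))
# ['blah_1', 'blah_2', 'blah_X']
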
def merger_worker(col_name, dest_name, ids, mapper, upsert, batch_num):
    try:
        src = mongo.get_src_db()
        tgt = mongo.get_target_db()
        col = src[col_name]
        #if batch_num == 2:
        #    raise ValueError("oula pa bon")
        dest = DocMongoBackend(tgt, tgt[dest_name])
        cur = doc_feeder(col, step=len(ids), inbatch=False, query={'_id': {'$in': ids}})
        mapper.load()
        docs = mapper.process(cur)
        cnt = dest.update(docs, upsert=upsert)
        return cnt
    except Exception as e:
        logger_name = "build_%s_%s_batch_%s" % (dest_name, col_name, batch_num)
        logger = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception(e)
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump(e, open(exc_fn, "wb"))
        logger.info("Exception was dumped in pickle file '%s'" % exc_fn)
        raise

def do(srcs, tgt):
    pinfo = {"category": "cache", "source": None, "step": "rebuild", "description": ""}
    config.logger.info("Rebuild cache for sources: %s, target: %s" % (srcs, tgt))
    for src in srcs:
        # src can be a full name (eg. clinvar.clinvar_hg38) but id_feeder knows only name (clinvar_hg38)
        if "." in src:
            src = src.split(".")[1]
        config.logger.info("Rebuilding cache for source '%s'" % src)
        col = mongo.get_src_db()[src]
        pinfo["source"] = src
        job = yield from job_manager.defer_to_thread(pinfo, partial(rebuild, col))
        yield from job
        config.logger.info("Done rebuilding cache for source '%s'" % src)
    if tgt:
        config.logger.info("Rebuilding cache for target '%s'" % tgt)
        col = mongo.get_target_db()[tgt]
        pinfo["source"] = tgt
        job = job_manager.defer_to_thread(pinfo, partial(rebuild, col))
        yield from job

def prepare_collection(self):
    """
    Load the mongodb collection specified by collection_name.
    :return:
    """
    self._state["collection"] = mongo.get_src_db()[self.collection_name]
    self.logger.info("Registering collection: {}".format(self.collection_name))

def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
    self._es = get_es(es_host)
    self._index = index or config.ES_INDEX_NAME
    self._doc_type = doc_type or config.ES_DOC_TYPE
    self._esi = ESIndexer(es_host=es_host)
    self._esi._index = self._index
    self._src = get_src_db()
    self.step = step

def load_data(self, data_folder):
    input_file = os.path.join(data_folder, "merged_freq_all_se_indications.tsv")
    self.logger.info("Load data from file '%s'" % input_file)
    pubchem_col = get_src_db()["pubchem"]
    assert pubchem_col.count() > 0, "'pubchem' collection is empty (required for inchikey " + \
        "conversion). Please run 'pubchem' uploader first"
    return load_data(input_file, pubchem_col)

def load_data(self, data_folder):
    self.logger.info("Load data from '%s'" % data_folder)
    input_file = os.path.join(data_folder, "drugs.tsv")
    assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
    # get other source collections for inchikey conversion
    drugbank_col = get_src_db()["drugbank"]
    assert drugbank_col.count() > 0, "'drugbank' collection is empty (required for inchikey " + \
        "conversion). Please run 'drugbank' uploader first"
    pubchem_col = get_src_db()["pubchem"]
    assert pubchem_col.count() > 0, "'pubchem' collection is empty (required for inchikey " + \
        "conversion). Please run 'pubchem' uploader first"
    chembl_col = get_src_db()["chembl"]
    assert chembl_col.count() > 0, "'chembl' collection is empty (required for inchikey " + \
        "conversion). Please run 'chembl' uploader first"
    chebi_col = get_src_db()["chebi"]
    assert chebi_col.count() > 0, "'chebi' collection is empty (required for inchikey " + \
        "conversion). Please run 'chebi' uploader first"
    return load_data(input_file, drugbank_col, pubchem_col, chembl_col, chebi_col)

def load(self):
    if self.cache is None:
        col = mongo.get_src_db()[TaxonomyNodesUploader.name]
        self.cache = {}
        [self.cache.setdefault(d["taxid"], d["parent_taxid"])
         for d in col.find({}, {"parent_taxid": 1, "taxid": 1})]

def validate_src(self, collection, return_false=False, return_none=False,
                 return_true=False, verbose=False, flag_invalid=False,
                 generator=False):
    '''Validate hgvs ids from a src collection.'''
    return_dict = {
        False: return_false,
        True: return_true,
        None: return_none
    }
    # read in the collection from mongodb
    if is_str(collection):
        src = get_src_db()
        _coll = src[collection]
    else:
        _coll = collection
    cursor = doc_feeder(_coll, step=10000)
    out = {}
    print_only = not (return_false or return_none or return_true)
    if not print_only:
        # output dictionary, three keys: 'false', 'true', 'none'
        for k in return_dict:
            if return_dict[k]:
                out[k] = []
    # initialize the counts
    cnt_d = {True: 0, False: 0, None: 0}
    # validate each item in the cursor
    for item in cursor:
        _id = item['_id']
        valid = self.validate_hgvs(_id, verbose=verbose)
        if valid == False and flag_invalid:
            # flag the document in the resolved collection object
            _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
        cnt_d[valid] += 1
        if return_dict[valid]:
            out[valid].append(_id)
    # print out counts
    print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
    print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
    print("# of HGVS IDs skipped:\t{0}".format(cnt_d[None]))
    out['summary'] = cnt_d
    return out

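# Hedged usage sketch for validate_src() above. The validator instance and the
# "dbsnp_hg19" collection name are assumptions (a live source MongoDB with that
# collection is required), so the calls are shown commented out:
#
#   result = validator.validate_src("dbsnp_hg19", return_false=True)
#   print(result["summary"])    # e.g. {True: 1200, False: 3, None: 0}
#   invalid_ids = result[False]
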
def create_backend(db_col_names, name_only=False, **kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (by default, will lookup a mongo collection in target database)
    - or a tuple ("target|src", "col_name")
    - or a ("mongodb://*****:*****@host", "db", "col_name") URI.
    - or a ("es_host:port", "index_name", "doc_type")
    If name_only is true, just return the name uniquely identifying the collection
    or index URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if type(db_col_names) == str:
        db = mongo.get_target_db()
        col = db[db_col_names]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(db_col_names) == 3, "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1], doc_type=db_col_names[2], es_host=db_col_names[0], **kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(db_col_names) == 2, "Missing connection information for %s" % repr(db_col_names)
        db = db_col_names[0] == "target" and mongo.get_target_db() or mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0: host, 1: port)
        db_col_names = ["%s:%s" % (db.client.address[0], db.client.address[1]), db.name, col.name]
    assert col is not None, "Could not create collection object from %s" % repr(db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(":", "_"), db_col_names[1], db_col_names[2])
        else:
            return "es_%s_%s_%s" % (db_col_names[0].replace(":", "_"), db_col_names[1], db_col_names[2])
    else:
        if is_mongo:
            return DocMongoBackend(db, col)
        else:
            return DocESBackend(db)

def load_data(input_file):
    src_db = mongo.get_src_db()
    if "dbsnp_hg19" not in src_db.collection_names():
        raise ValueError("'dbsnp_hg19' collection is missing, run dbsnp uploader first")
    dbsnp_col = src_db["dbsnp_hg19"]
    open_file = open(input_file, encoding="cp1252")
    open_file = csv.reader(open_file, delimiter="\t")
    next(open_file)
    grasp = map(row_generator, open_file)
    grasp = filter(lambda row: row[58] != "", grasp)
    json_rows = map(partial(_map_line_to_json, dbsnp_col=dbsnp_col), grasp)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    for row in (merge_duplicate_rows(rg, "grasp") for rg in row_groups):
        yield row

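# Minimal in-memory sketch of the groupby-based de-duplication used in the
# pipeline above. merge_rows() is a hypothetical stand-in for
# merge_duplicate_rows(); note that itertools.groupby only groups *consecutive*
# items, so rows must already be ordered by "_id" for all duplicates to merge.
from itertools import groupby

def merge_rows(rows, field):
    rows = list(rows)
    merged = dict(rows[0])
    if len(rows) > 1:
        merged[field] = [r[field] for r in rows]
    return merged

rows = [{"_id": "rs1", "grasp": {"trait": "height"}},
        {"_id": "rs1", "grasp": {"trait": "bmi"}},
        {"_id": "rs2", "grasp": {"trait": "weight"}}]
for _id, group in groupby(rows, key=lambda r: r["_id"]):
    print(merge_rows(group, "grasp"))
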
def _load_collections(self, collections):
    """
    Load all mongodb collections specified in the configuration data structure col_keys.
    :return:
    """
    self.collections = {}
    for col in collections:
        collection = mongo.get_src_db()[col]
        if collection.count() > 0:
            self.collections[col] = collection
            kl_log.info("Registering collection: {} (count: {})".format(col, collection.count()))
    if not self.collections:
        raise ValueError("At least one configured collection is required for MongoDB key lookup.")

def __init__(self, log_dir=None, date=None, dry_run=False):
    self.log_dir = log_dir if log_dir else os.getcwd()
    d = datetime.now()
    self.date = date if date else "".join(map(str, [d.year, d.month, d.day]))
    self.dry_run = dry_run
    self.login_instance = PBB_login.WDLogin(user=WDUSER, pwd=WDPASS)
    self.fast_run_base_filter = {self.DOID_PROP: ''}
    self.info_log_path = None
    self.exc_log_path = None
    self.reference = None
    self.setup_logging()
    self.collection = get_src_db().mondo
    src_dump = get_src_dump()
    src_doc = src_dump.find_one({'_id': 'mondo'}) or {}
    self.retrieved = src_doc.get("download", {}).get("started_at", False) or datetime.now()
    self.ref_url = "https://github.com/monarch-initiative/monarch-disease-ontology/raw/{}/src/mondo/mondo.obo".format(
        src_doc.get("release", "master"))
    self.create_reference()

def do(srcs, tgt):
    pinfo = {"category": "cache", "source": None, "step": "rebuild", "description": ""}
    config.logger.info("Rebuild cache for sources: %s, target: %s" % (srcs, tgt))
    for src in srcs:
        config.logger.info("Rebuilding cache for source '%s'" % src)
        col = mongo.get_src_db()[src]
        pinfo["source"] = src
        job = yield from job_manager.defer_to_thread(pinfo, partial(rebuild, col))
        yield from job
        config.logger.info("Done rebuilding cache for source '%s'" % src)
    if tgt:
        config.logger.info("Rebuilding cache for target '%s'" % tgt)
        col = mongo.get_target_db()[tgt]
        pinfo["source"] = tgt
        job = job_manager.defer_to_thread(pinfo, partial(rebuild, col))
        yield from job

def export_ids(col_name):
    """
    Export all _ids from collection named col_name.
    If col_name refers to a build where a cold_collection is defined, will also
    extract _ids and sort/uniq them to have the full list of _ids of the actual
    merged (cold+hot) collection.
    Output file is stored in DATA_EXPORT_FOLDER/ids, defaulting to
    <DATA_ARCHIVE_ROOT>/export/ids. Output filename is returned at the end,
    if successful.
    """
    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig, "DATA_EXPORT_FOLDER", None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT, "export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER, "ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)
    build = get_src_build().find_one({"_id": col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config", {}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'" % (cold_name, col_name))
    else:
        # it's a src
        col = get_src_db()[col_name]
    # first iterate over all _ids. This will potentially update the underlying _id cache
    # if it's not valid anymore, so we're sure to work with the latest data. If the cache
    # is valid, this will be pretty fast
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col, validate_only=True):
        pass
    # now access the cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold, validate_only=True):
            pass
        # now access the cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)
    outfn = os.path.join(ids_export_folder, "%s_ids.xz" % col_name)
    # NOTE: can't use anyfile to open cache files and send _id through pipes
    # because it would load _id in memory (unless using hacks) so use cat (and
    # existing uncompressing ones, like gzcat/xzcat/...) to fully run the pipe
    # on the shell
    if cold:
        fout = anyfile(outfn, "wb")
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        assert colext == coldext, "Hot and cold _id caches are compressed differently (%s and %s), they should be the same" % (colext, coldext)
        comp = colext.replace(".", "")
        supportedcomps = ["xz", "gz", ""]   # no compression allowed as well
        assert comp in supportedcomps, "Compression '%s' isn't supported (%s)" % (comp, supportedcomps)
        # IDs sent to pipe's input (sort) then compressed (xz)
        pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache], stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort", "-u"], stdin=pcat.stdout, stdout=subprocess.PIPE, universal_newlines=True)
        pcat.stdout.close()   # will raise end-of-pipe error when finished
        if comp:
            pcomp = subprocess.Popen(["xz", "-c"], stdin=psort.stdout, stdout=fout)
        else:
            # just print stdin to stdout
            pcomp = subprocess.Popen(["tee"], stdin=psort.stdout, stdout=fout)
        psort.stdout.close()
        try:
            logging.info("Running pipe to compute list of unique _ids")
            (out, err) = pcomp.communicate()   # run the pipe! (blocking)
            if err:
                raise Exception(err)
        except Exception as e:
            logging.error("Error while running pipe to export _ids: %s" % e)
            # make sure to clean empty or half-processed files
            try:
                os.unlink(outfn)
            finally:
                pass
            raise
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache, outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean empty or half-processed files
            try:
                os.unlink(outfn)
            finally:
                pass
            raise
    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn

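# Standalone sketch of the shell pipe used in export_ids() above: concatenate
# two (hypothetical, uncompressed) id files, sort/uniq them, and compress the
# result with xz. Assumes "cat", "sort" and "xz" are available on the PATH.
import subprocess

with open("unique_ids.xz", "wb") as fout:
    pcat = subprocess.Popen(["cat", "ids_hot.txt", "ids_cold.txt"], stdout=subprocess.PIPE)
    psort = subprocess.Popen(["sort", "-u"], stdin=pcat.stdout, stdout=subprocess.PIPE)
    pcat.stdout.close()    # let "cat" receive SIGPIPE if "sort" exits early
    pxz = subprocess.Popen(["xz", "-c"], stdin=psort.stdout, stdout=fout)
    psort.stdout.close()
    pxz.communicate()      # run the pipe (blocking)
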
def __init__(self, db, dest_col_name, logger=logging):
    db = db or get_src_db()
    self.temp_collection = db[dest_col_name]
    self.logger = logger

def create_backend(db_col_names, name_only=False, follow_ref=False, **kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (will first check for an src_build doc to check
      a backend_url field, if nothing there, will lookup a mongo collection
      in target database)
    - or a tuple ("target|src", "col_name")
    - or a ("mongodb://*****:*****@host", "db", "col_name") URI.
    - or a ("es_host:port", "index_name", "doc_type")
    If name_only is true, just return the name uniquely identifying the collection
    or index URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if type(db_col_names) == str:
        # first check build doc: if there's a backend_url key, we'll use it instead of
        # directly using db_col_names as target collection (see LinkDataBuilder)
        bdoc = get_src_build().find_one({"_id": db_col_names})
        if follow_ref and bdoc and bdoc.get("backend_url") and bdoc["backend_url"] != db_col_names:
            return create_backend(bdoc["backend_url"], name_only=name_only, follow_ref=follow_ref, **kwargs)
        else:
            db = mongo.get_target_db()
            col = db[db_col_names]
            # normalize params
            db_col_names = ["%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(db_col_names) == 3, "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = ["%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1], doc_type=db_col_names[2], es_host=db_col_names[0], **kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(db_col_names) == 2, "Missing connection information for %s" % repr(db_col_names)
        db = db_col_names[0] == "target" and mongo.get_target_db() or mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0: host, 1: port)
        db_col_names = ["%s:%s" % (db.client.address[0], db.client.address[1]), db.name, col.name]
    assert col is not None, "Could not create collection object from %s" % repr(db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(":", "_"), db_col_names[1], db_col_names[2])
        else:
            return "es_%s_%s_%s" % (db_col_names[0].replace(":", "_"), db_col_names[1], db_col_names[2])
    else:
        if is_mongo:
            return DocMongoBackend(db, col)
        else:
            return DocESBackend(db)

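# Hedged usage sketch for create_backend() above. All hosts, database and
# collection names below are made-up placeholders, and the calls assume live
# MongoDB / Elasticsearch servers, so they are shown commented out:
#
#   backend = create_backend("mychem_merged")                              # target collection looked up by name
#   backend = create_backend(("src", "drugbank"))                          # collection in the source database
#   backend = create_backend(("mongodb://user:pass@host", "db", "col"))    # explicit MongoDB URI
#   backend = create_backend(("eshost:9200", "myindex", "doc"))            # Elasticsearch index
#   name = create_backend("mychem_merged", name_only=True)                 # e.g. "mongo_host_27017_target_mychem_merged"
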
def __init__(self):
    self._src = get_src_db()

def __init__(self, db_info, dest_col_name, logger):
    db = get_src_db()
    self.temp_collection = db[dest_col_name]
    self.logger = logger

def setUp(self):
    """
    Setup the mongodb structure for the tests
    :return:
    """
    # Collections for the first test
    self.db = mongo.get_src_db()
    self.db.create_collection('a')
    self.db.create_collection('b')
    self.db.create_collection('c')
    self.db.create_collection('d')
    self.db.create_collection('e')
    self.db['b'].insert({'b_id': 'b:1234', 'a_id': 'a:1234'})
    self.db['c'].insert({'c_id': 'c:1234', 'b_id': 'b:1234', 'e_id': 'e:1234'})
    self.db['d'].insert({'d_id': 'd:1234', 'c_id': 'c:1234'})
    self.db['e'].insert({'e_id': 'e:1234', 'd_id': 'd:1234'})

    # Collections for the second test (one2many)
    self.db.create_collection('aa')
    self.db.create_collection('bb')
    self.db.create_collection('cc')
    self.db['bb'].insert({'b_id': 'b:1234', 'a_id': 'a:1234'})
    self.db['bb'].insert({'b_id': 'b:5678', 'a_id': 'a:1234'})
    self.db['cc'].insert({'c_id': 'c:1234', 'b_id': 'b:1234'})
    self.db['cc'].insert({'c_id': 'c:01', 'b_id': 'b:5678'})
    self.db['cc'].insert({'c_id': 'c:02', 'b_id': 'b:5678'})

    # Collections for the path weight test
    self.db = mongo.get_src_db()
    self.db.create_collection('aaa')
    self.db.create_collection('bbb')
    self.db.create_collection('ccc')
    self.db.create_collection('ddd')
    self.db.create_collection('eee')
    self.db['bbb'].insert({'b_id': 'b:1234', 'a_id': 'a:1234', 'e_id': 'e:5678'})
    self.db['ccc'].insert({'c_id': 'c:1234', 'b_id': 'b:1234'})
    self.db['ddd'].insert({'d_id': 'd:1234', 'c_id': 'c:1234'})
    self.db['eee'].insert({'e_id': 'e:1234', 'd_id': 'd:1234'})

    # Collections for the mixed mongodb and api test
    self.db = mongo.get_src_db()
    self.db.create_collection('mix1')
    self.db.create_collection('mix3')
    self.db['mix1'].insert({'ensembl': 'ENSG00000123374', 'start_id': 'start1'})
    self.db['mix3'].insert({'end_id': 'end1', 'entrez': '1017'})

    # Collections for lookup failure
    self.db['b'].insert({'b_id': 'b:f1', 'a_id': 'a:f1'})
    self.db['c'].insert({'c_id': 'c:f1', 'b_id': 'b:f1'})
    self.db['d'].insert({'d_id': 'd:fail1', 'c_id': 'c:f1'})
    self.db['e'].insert({'e_id': 'e:f1', 'd_id': 'd:f1'})

def load_data(self, data_folder):
    drugbank_col = get_src_db()["drugbank"]
    assert drugbank_col.count() > 0, "'drugbank' collection is empty (required for inchikey " + \
        "conversion). Please run 'drugbank' uploader first"
    return load_data(data_folder, drugbank_col)

def load(self):
    if self.cache is None:
        # this is a whole list containing all taxonomy _ids
        col = mongo.get_src_db()[GeneInfoUploader.name]
        self.cache = [d["_id"] for d in col.find({}, {"_id": 1})]
