def __init__(self, build_config=None, backend='mongodb'):
    """Set up the builder: source-DB handle, merge options, and the
    target backend chosen by *backend* ('mongodb', 'es', 'couchdb' or
    'memory').

    Raises:
        ValueError: if *backend* is not one of the supported names.
    """
    self.src = get_src_db()
    self.step = 10000
    self.use_parallel = False
    # save output into a logging file when merge is called.
    self.merge_logging = True
    # max no. of records kept in "build" field of src_build collection.
    self.max_build_status = 10
    self.using_ipython_cluster = False
    self.shutdown_ipengines_after_done = False
    self.log_folder = LOG_FOLDER
    self._build_config = build_config
    self._entrez_geneid_d = None
    self._idmapping_d_cache = {}
    self.get_src_master()

    if backend == 'mongodb':
        self.target = databuild.backend.GeneDocMongoDBBackend()
    elif backend == 'es':
        self.target = databuild.backend.GeneDocESBackend(ESIndexer())
    elif backend == 'couchdb':
        # couchdb support is optional, so keep its imports local.
        from config import COUCHDB_URL
        import couchdb
        self.target = databuild.backend.GeneDocCouchDBBackend(
            couchdb.Server(COUCHDB_URL))
    elif backend == 'memory':
        # NOTE(review): "Memeory" reads like a typo, but it must match the
        # actual class name declared in databuild.backend — confirm there.
        self.target = databuild.backend.GeneDocMemeoryBackend()
    else:
        raise ValueError('Invalid backend "%s".' % backend)
def load_contig(contig):
    '''Save one CADD contig into the "cadd" MongoDB collection.

    Documents are streamed from the whole-genome tabix file for the
    given contig and written in unacknowledged (w=0) batches of 100.
    '''
    tabix = pysam.Tabixfile(whole_genome)
    target_coll = get_src_db()["cadd"]
    t0 = time.time()
    total = 0
    batch = []
    for doc in fetch_generator(tabix, contig):
        batch.append(doc)
        total += 1
        if len(batch) == 100:
            target_coll.insert(batch, manipulate=False,
                               check_keys=False, w=0)
            batch = []
        if total % 100000 == 0:
            print(total, timesofar(t0))
    if batch:
        # flush the final partial batch
        target_coll.insert(batch, manipulate=False, check_keys=False, w=0)
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(total, timesofar(t0)))
def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
    """Bind an Elasticsearch client and indexer to *index*/*doc_type*
    (falling back to the configured defaults) and keep a handle to the
    source MongoDB."""
    self.step = step
    self._es = get_es(es_host)
    self._index = index if index else config.ES_INDEX_NAME
    self._doc_type = doc_type if doc_type else config.ES_DOC_TYPE
    self._esi = ESIndexer(es_host=es_host)
    self._esi._index = self._index
    self._src = get_src_db()
def get_backend(target_name, bk_type, **kwargs):
    '''Return a backend instance for given target_name and backend type.
    Currently supports MongoDB and ES backends.

    Parameters:
        target_name: MongoDB collection name or ES index name.
        bk_type: either "mongodb" or "es"; extra kwargs are passed to
            ESIndexer for the "es" backend.

    Raises:
        ValueError: if *bk_type* is not a supported backend type.
            (BUGFIX: the original silently fell through and returned
            None on an unknown type, deferring the failure to the
            caller's first use of the backend.)
    '''
    if bk_type == 'mongodb':
        target_db = get_src_db()
        target_col = target_db[target_name]
        return GeneDocMongoDBBackend(target_col)
    elif bk_type == 'es':
        esi = ESIndexer(target_name, **kwargs)
        return GeneDocESBackend(esi)
    else:
        raise ValueError('Invalid backend type "%s".' % bk_type)
def get_discrepancy_id(output_file_name, mongo_collection, uri):
    '''Write to *output_file_name* the _id of every document in
    *mongo_collection* whose JSON-LD value at *uri* resolves to a list
    (i.e. a discrepant id), one _id per line.

    Parameters:
        output_file_name: path of the text file to write.
        mongo_collection: name of the collection in the source DB.
        uri: JSON-LD URI whose resolved value is checked.
    '''
    src = get_src_db()
    context = load_context('mygene.info')
    data = src[mongo_collection].find()
    with open(output_file_name, 'w') as f:
        for _doc in data:
            _doc.update(context)
            jsonld_doc = nquads_transform(_doc)
            rsid = fetch_value_by_uri(jsonld_doc, uri)
            # isinstance instead of type() == list: the idiomatic check,
            # and it also accepts list subclasses.
            if isinstance(rsid, list):
                f.write(_doc['_id'] + "\n")
def validate_src(self, collection, return_false=False,
                 return_none=False, return_true=False, verbose=False,
                 flag_invalid=False):
    '''Validate hgvs ids from a src collection.

    Parameters:
        collection: collection name (str) or a collection object.
        return_false/return_none/return_true: include the corresponding
            list of ids in the returned dict.
        verbose: passed through to validate_hgvs.
        flag_invalid: when True, mark invalid docs in the collection
            with {"unmatched_ref": "True"}.

    Returns:
        dict with a 'summary' count dict plus any requested id lists
        (keyed True/False/None).
    '''
    return_dict = {
        False: return_false,
        True: return_true,
        None: return_none
    }
    # read in the collection from mongodb
    if is_str(collection):
        src = get_src_db()
        _coll = src[collection]
    else:
        _coll = collection
    cursor = doc_feeder(_coll, step=10000)
    out = {}
    print_only = not (return_false or return_none or return_true)
    if not print_only:
        # output dictionary, three keys: 'false','true','none'
        for k in return_dict:
            if return_dict[k]:
                out[k] = []
    # initialize the count
    cnt_d = {True: 0, False: 0, None: 0}
    # validate each item in the cursor
    for item in cursor:
        _id = item['_id']
        valid = self.validate_hgvs(_id, verbose=verbose)
        if valid is False and flag_invalid:
            # BUGFIX: update via the resolved collection object (_coll);
            # the original called `collection.update(...)`, which raises
            # AttributeError when the caller passed a collection *name*
            # (str) rather than a collection object.
            _coll.update({"_id": _id},
                         {'$set': {"unmatched_ref": "True"}})
        cnt_d[valid] += 1
        if return_dict[valid]:
            out[valid].append(_id)
    # print out counts
    print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
    print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
    print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))
    out['summary'] = cnt_d
    return out
def load_source(collection_name, src_module=None, src_data=None,
                inbatch=True, new_collection=True, step=100):
    '''Save src data into a MongoDB collection.

    If *src_module* is provided, src_data = src_module.load_data();
    otherwise *src_data* is used directly and should be an iterable.
    If *new_collection* is True, the target collection must be empty.

    Parameters:
        collection_name: name of the target collection in the src DB.
        src_module: optional module exposing load_data().
        src_data: iterable of documents (ignored when src_module given).
        inbatch: insert documents in batches rather than one at a time.
        step: batch size when inbatch is True (default 100 — the value
            that was previously hard-coded; now parameterized to match
            the newer load_source variant's signature).
    '''
    src_db = get_src_db()
    target_coll = src_db[collection_name]
    if new_collection and target_coll.count() > 0:
        print("Error: target collection {} exists.".format(collection_name))
        return
    t0 = time.time()
    cnt = 0
    if src_module:
        src_data = src_module.load_data()
    if src_data:
        doc_list = []
        for doc in src_data:
            cnt += 1
            if not inbatch:
                target_coll.insert(doc, manipulate=False,
                                   check_keys=False, w=0)
            else:
                doc_list.append(doc)
                if len(doc_list) == step:
                    target_coll.insert(doc_list, manipulate=False,
                                       check_keys=False, w=0)
                    doc_list = []
            if cnt % 100000 == 0:
                print(cnt, timesofar(t0))
        if doc_list:
            # flush the final partial batch
            target_coll.insert(doc_list, manipulate=False,
                               check_keys=False, w=0)
        print("successfully loaded %s into mongodb" % collection_name)
        print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
    else:
        print("Error: no src data to load.")
def validate_src(self, collection, return_false=False,
                 return_none=False, return_true=False, verbose=False,
                 flag_invalid=False, generator=False):
    '''Validate hgvs ids from a src collection.

    Parameters:
        collection: collection name (str) or a collection object.
        return_false/return_none/return_true: include the corresponding
            list of ids in the returned dict.
        verbose: passed through to validate_hgvs.
        flag_invalid: when True, mark invalid docs in the collection
            with {"unmatched_ref": "True"}.
        generator: accepted for interface compatibility but currently
            unused in this implementation.

    Returns:
        dict with a 'summary' count dict plus any requested id lists
        (keyed True/False/None).
    '''
    return_dict = {
        False: return_false,
        True: return_true,
        None: return_none
    }
    # read in the collection from mongodb
    if is_str(collection):
        src = get_src_db()
        _coll = src[collection]
    else:
        _coll = collection
    cursor = doc_feeder(_coll, step=10000)
    out = {}
    print_only = not (return_false or return_none or return_true)
    if not print_only:
        # output dictionary, three keys: 'false','true','none'
        for k in return_dict:
            if return_dict[k]:
                out[k] = []
    # initialize the count
    cnt_d = {True: 0, False: 0, None: 0}
    # validate each item in the cursor
    for item in cursor:
        _id = item['_id']
        valid = self.validate_hgvs(_id, verbose=verbose)
        if valid is False and flag_invalid:
            # BUGFIX: update via the resolved collection object (_coll);
            # the original called `collection.update(...)`, which raises
            # AttributeError when the caller passed a collection *name*
            # (str) rather than a collection object.
            _coll.update({"_id": _id},
                         {'$set': {"unmatched_ref": "True"}})
        cnt_d[valid] += 1
        if return_dict[valid]:
            out[valid].append(_id)
    # print out counts
    print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
    print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
    print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))
    out['summary'] = cnt_d
    return out
def load_source(collection_name, src_module=None, src_data=None,
                inbatch=True, new_collection=True, step=1000):
    '''Save src data into a MongoDB collection.

    If *src_module* is provided, src_data = src_module.load_data();
    otherwise *src_data* is used directly and should be an iterable.
    If *new_collection* is True, the target collection must be empty.

    Parameters:
        collection_name: name of the target collection in the src DB.
        src_module: optional module exposing load_data().
        src_data: iterable of documents (ignored when src_module given).
        inbatch: insert documents in batches of *step* via insert_many,
            otherwise one at a time via insert_one.
        step: batch size when inbatch is True.
    '''
    src_db = get_src_db()
    target_coll = src_db[collection_name]
    if new_collection and target_coll.count() > 0:
        print("Error: target collection {} exists.".format(collection_name))
        return
    t0 = time.time()
    cnt = 0
    if src_module:
        src_data = src_module.load_data()
    if not src_data:
        print("Error: no src data to load.")
        return
    doc_list = []
    for doc in src_data:
        cnt += 1
        if not inbatch:
            try:
                target_coll.insert_one(doc)
            except Exception:
                # Narrowed from a bare `except:` — only swallow insert
                # failures (e.g. duplicate _id), not KeyboardInterrupt
                # or SystemExit.
                print('One duplicate id exists, id is {}'.format(doc['_id']))
                continue
        else:
            doc_list.append(doc)
            if len(doc_list) == step:
                target_coll.insert_many(doc_list)
                doc_list = []
        if cnt % 100000 == 0:
            print(cnt, timesofar(t0))
    if doc_list:
        # BUGFIX: flush the final partial batch with insert_many, like
        # the main loop. The original still called the legacy
        # Collection.insert(..., manipulate=..., w=0) here, which is
        # inconsistent with the insert_one/insert_many API used above
        # and is removed in modern pymongo — the last batch would crash.
        target_coll.insert_many(doc_list)
    print("successfully loaded %s into mongodb" % collection_name)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
def __init__(self, src_module):
    """Keep a reference to *src_module*, record the source name it
    declares in its __METADATA__, and open a source-DB handle.
    temp_collection starts out unset and is created later."""
    self.temp_collection = None
    self.src_db = get_src_db()
    self.src_module = src_module
    meta = src_module.__METADATA__
    self.src_name = meta['src_name']
def __init__(self):
    # Open a handle to the source MongoDB via the module-level helper.
    self._src = get_src_db()