def load(self):
    if self.map is None:
        # This is a whole dict containing all entrez _ids, whether current or retired.
        # It means most of the data has an association with the same _id as key and
        # as value. It consumes memory, but it's a way to know the entrez perimeter
        # (what entrez _ids exist and should be considered).
        self.map = loadobj(("entrez_gene__geneid_d.pyobj", self.db_provider()),
                           mode='gridfs')
def load_chr_data(self):
    self.logger.info("\tLoading chromosome data from '%s'..." % self.genome)
    try:
        self._chr_data = loadobj(self.genome)
    except Exception as e:
        self.logger.info(e)
        raise
    self.logger.info("Done.")
def main(self, diff_filepath, merge_collection, field):
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_ids = diff['add']
    delete_ids = diff['delete']
    update_ids = [_doc['_id'] for _doc in diff['update']]
    self.add_update(source_collection, merge_collection, add_ids)
    self.add_update(source_collection, merge_collection, update_ids)
    self.delete(merge_collection, field, delete_ids)
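# For reference, a minimal sketch of the diff object loaded above, inferred
# from the fields accessed in these snippets ('source', 'add', 'delete',
# 'update'); the concrete values are made-up examples:
example_diff = {
    'source': 'mysource_collection',  # collection the diff was computed from
    'add': ['id1', 'id2'],            # _ids present only in the new collection
    'delete': ['id3'],                # _ids present only in the old collection
    'update': [                       # one JSON-Patch per changed document
        {'_id': 'id4', 'patch': [{'op': 'replace', 'path': '/field', 'value': 42}]},
    ],
}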
def update_mapping():
    diffm = os.path.join(diff_folder, diff_mapping_file)
    ops = loadobj(diffm)
    mapping = indexer.get_mapping()
    # we should have the same doc_type declared in the mapping
    mapping[doc_type]["properties"] = jsonpatch.apply_patch(
        mapping[doc_type]["properties"], ops)
    res = indexer.update_mapping(mapping)
    return res
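# A minimal sketch of applying such "add"-only mapping ops with the jsonpatch
# library (the field name "new_field" is a made-up example):
import jsonpatch

properties = {"symbol": {"type": "keyword"}}
ops = [{"op": "add", "path": "/new_field", "value": {"type": "integer"}}]
patched = jsonpatch.apply_patch(properties, ops)
assert patched == {"symbol": {"type": "keyword"},
                   "new_field": {"type": "integer"}}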
def restore(db, archive, drop=False):
    """Restore database from given archive. If drop is True, then delete existing collections"""
    data = loadobj(archive)
    for colname in data:
        docs = data[colname]
        col = db[colname]
        if drop:
            # we don't have a drop command but we can remove all docs
            col.remove({})
        for doc in docs:
            col.save(doc)
def analyze(diff_file, detailed):
    # Note: sources, metadata, adds, dels, update_details and max_reported_ids
    # are defined in the enclosing scope (this is an excerpt).
    data = loadobj(diff_file)
    sources[data["source"]] = 1
    if detailed:
        # TODO: if self-contained, no db connection needed
        new_col = create_backend(metadata["new"]["backend"])
        old_col = create_backend(metadata["old"]["backend"])
    if len(adds["ids"]) < max_reported_ids:
        if detailed:
            # look for which root keys were added in the new collection
            for _id in data["add"]:
                # self-contained = dict for the whole doc (see TODO above)
                if isinstance(_id, dict):
                    _id = _id["_id"]
                doc = new_col.get_from_id(_id)
                rkeys = sorted(doc.keys())
                adds["ids"].append([_id, rkeys])
        else:
            if data["add"] and isinstance(data["add"][0], dict):
                adds["ids"].extend([d["_id"] for d in data["add"]])
            else:
                adds["ids"].extend(data["add"])
    adds["count"] += len(data["add"])
    if len(dels["ids"]) < max_reported_ids:
        if detailed:
            # look for which root keys were deleted in the old collection
            for _id in data["delete"]:
                doc = old_col.get_from_id(_id)
                rkeys = sorted(doc.keys())
                dels["ids"].append([_id, rkeys])
        else:
            dels["ids"].extend(data["delete"])
    dels["count"] += len(data["delete"])
    for up in data["update"]:
        for patch in up["patch"]:
            update_details[patch["op"]].setdefault(patch["path"],
                                                   {"count": 0, "ids": []})
            if len(update_details[patch["op"]][patch["path"]]["ids"]) < max_reported_ids:
                update_details[patch["op"]][patch["path"]]["ids"].append(up["_id"])
            update_details[patch["op"]][patch["path"]]["count"] += 1
    update_details["count"] += len(data["update"])
    assert len(sources) == 1, \
        "Should have one datasource from diff files, got: %s" % [s for s in sources]
def load(self):
    if self.map is None:
        self.retired2current.load()
        self.map = {}
        ensembl2entrez_li = loadobj(
            ("ensembl_gene__2entrezgene_list.pyobj", self.db_provider()),
            mode='gridfs')
        # filter out deprecated entrez gene ids
        for ensembl_id, entrez_id in ensembl2entrez_li:
            entrez_id = int(entrez_id)
            if entrez_id in self.retired2current:
                self.map[ensembl_id] = self.retired2current.translate(entrez_id)
def restore(archive, drop=False):
    """Restore database from given archive. If drop is True, then delete existing collections"""
    data = loadobj(archive)
    # use the src_dump collection, which always exists, to get the database object
    db = get_src_dump().database
    for colname in data:
        docs = data[colname]
        col = db[colname]
        if drop:
            # we don't have a drop command but we can remove all docs
            col.remove({})
        for doc in docs:
            col.save(doc)
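# Note: Collection.remove() and Collection.save() were removed in pymongo 4.x.
# A minimal sketch of the same restore loop with the current API, assuming
# every archived doc carries an '_id':
def restore_pymongo4(db, data, drop=False):
    for colname in data:
        col = db[colname]
        if drop:
            col.delete_many({})  # remove all existing docs, like remove({})
        for doc in data[colname]:
            # upsert by _id, mirroring the old save() semantics
            col.replace_one({'_id': doc['_id']}, doc, upsert=True)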
def load_data(self, data_folder):
    """
    Loads gene data from NCBI's gene2refseq.gz file.
    Parses it for genomic position data and RefSeq status, using the list of
    taxids from get_ref_microbe_taxids() as a lookup table.
    :return:
    """
    taxids_file = os.path.join(data_folder, "../ref_microbe_taxids.pyobj")
    datafile = os.path.join(data_folder, 'gene2refseq.gz')
    taxids = loadobj(taxids_file)
    taxid_set = set(taxids)

    def _includefn(ld):
        return ld[0] in taxid_set  # match taxid from taxid_set

    cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
    gene2genomic_pos_li = tab2list(datafile,
                                   cols_included,
                                   header=1,
                                   includefn=_includefn)
    count = 0
    last_id = None
    for gene in gene2genomic_pos_li:
        count += 1
        strand = 1 if gene[5] == '+' else -1
        _id = gene[1]
        mgi_dict = {
            '_id': _id,
            'genomic_pos': {
                'entrezgene': _id,
                'start': int(gene[3]),
                'end': int(gene[4]),
                'chr': gene[2],
                'strand': strand
            }
        }
        if _id != last_id:  # rows with a duplicate _id are skipped
            yield mgi_dict
            last_id = _id
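# A minimal usage sketch: load_data() is a generator yielding one doc per
# unique entrez gene id, typically consumed by iterating it ('uploader' and
# the folder path below are made-up assumptions):
#
#   docs = list(uploader.load_data('/path/to/refseq/latest'))
#   docs[0]  # -> {'_id': ..., 'genomic_pos': {'chr': ..., 'start': ..., ...}}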
def mapping_diffed(f):
    res = f.result()
    if res.get("mapping_file"):
        nonlocal got_error
        # check mapping differences: only "add" ops are allowed, as any other
        # actions would be ignored by ES once applied (you can't update/delete
        # elements of an existing mapping)
        mf = os.path.join(diff_folder, res["mapping_file"]["name"])
        ops = loadobj(mf)
        for op in ops:
            if op["op"] != "add":
                err = DifferException("Found diff operation '%s' in mapping file, " % op["op"] +
                                      "only 'add' operations are allowed. You can still produce the " +
                                      "diff by removing 'mapping' from 'steps' arguments. " +
                                      "Ex: steps=['count','content']. Diff operation was: %s" % op)
                got_error = err
        metadata["diff"]["mapping_file"] = mf
        diff_stats["mapping_changed"] = True
        self.logger.info("Diff file containing mapping differences generated: %s"
                         % res.get("mapping_file"))
def load_genedoc(self):
    """
    Loads gene data from NCBI's gene2refseq.gz file.
    Parses it for genomic position data and RefSeq status, using the list of
    taxids from get_ref_microbe_taxids() as a lookup table.
    :return:
    """
    taxids = loadobj(TAXIDS_FILE)
    taxid_set = set(taxids)
    load_start(DATAFILE)

    def _includefn(ld):
        return ld[0] in taxid_set  # match taxid from taxid_set

    cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
    gene2genomic_pos_li = tab2list(DATAFILE,
                                   cols_included,
                                   header=1,
                                   includefn=_includefn)
    count = 0
    last_id = None
    for gene in gene2genomic_pos_li:
        count += 1
        strand = 1 if gene[5] == '+' else -1
        _id = gene[1]
        mgi_dict = {
            '_id': _id,
            'genomic_pos': {
                'start': int(gene[3]),
                'end': int(gene[4]),
                'chr': gene[2],
                'strand': strand
            }
        }
        if _id != last_id:  # rows with a duplicate _id are skipped
            yield mgi_dict
            last_id = _id
    load_done('[%d]' % count)
def reset_synced(self, diff_folder, backend=None):
    """
    Remove the "synced" flag from any pyobj file in diff_folder
    """
    diff_files = glob.glob(os.path.join(diff_folder, "*.pyobj"))
    for diff in diff_files:
        pyobj = loadobj(diff)
        try:
            if pyobj.get("synced"):
                if backend:
                    self.logger.info("Removing synced flag from '%s' for backend '%s'"
                                     % (diff, backend))
                    pyobj["synced"].pop(backend, None)
                else:
                    self.logger.info("Removing synced flag from '%s'" % diff)
                    pyobj.pop("synced")
                dump(pyobj, diff)
        except AttributeError:
            # pyobj is not a dict
            continue
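# For reference, a sketch of the "synced" flag removed above: a dict keyed by
# backend name, stored alongside the diff data in the same pyobj file
# (backend names here are examples):
pyobj_example = {
    'add': [], 'delete': [], 'update': [],
    'synced': {'es': True, 'mongo': True},
}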
def sync_es_jsondiff_worker(diff_file, es_config, new_db_col_names, batch_size,
                            cnt, force=False, selfcontained=False, metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    if not force and diff.get("synced", {}).get("es") is True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains whole documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": doc["_id"], "file": diff_file, "error": e})
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise
    # update: get docs from the indexer and apply the diffs
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res
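# iter_n() is used throughout these workers to batch an iterable; a minimal
# equivalent sketch of its assumed behavior (yield lists of at most n items):
from itertools import islice

def iter_n_sketch(iterable, n):
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch

assert list(iter_n_sketch(range(5), 2)) == [[0, 1], [2, 3], [4]]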
def sync_es_coldhot_jsondiff_worker(diff_file, es_config, new_db_col_names,
                                    batch_size, cnt, force=False,
                                    selfcontained=False, metadata={}, debug=False):
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it"
                     % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    eskwargs = {}
    # pass optional ES indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)" % (es_config, eskwargs))
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)
    # add: the diff between hot collections showed new documents, but some of
    # those docs may already exist in the premerge/cold collection. If so, they
    # should be treated as a dict.update() where the hot document content has
    # precedence over the cold content for fields in common.
    if selfcontained:
        # diff["add"] contains whole documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff["source"], \
            "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        # remove any potential existing _timestamp from documents
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        # check which docs already exist in the index (meaning they exist in the cold collection)
        dids = dict([(d["_id"], d) for d in docs])
        dexistings = dict([(d["_id"], d)
                           for d in indexer.get_docs([k for k in dids.keys()])])
        logging.debug("From current batch, %d already exist" % len(dexistings))
        # remove existing docs from "add" so the rest of the dict will be treated
        # as "real" added documents, while updating existing ones with new content
        toremove = []
        for _id, d in dexistings.items():
            # update in-place
            if d == dids[d["_id"]]:
                logging.debug("%s was already added, skip it" % d["_id"])
                toremove.append(d["_id"])
                res["skipped"] += 1
            else:
                newd = copy.deepcopy(d)
                d.update(dids[d["_id"]])
                if d == newd:
                    logging.debug("%s was already updated, skip it" % d["_id"])
                    toremove.append(d["_id"])
                    res["skipped"] += 1
            dids.pop(d["_id"])
        for _id in toremove:
            dexistings.pop(_id)
        logging.info("Syncing 'add' documents (%s in total) from cold/hot merge: " % len(docs) +
                     "%d documents will be updated as they already exist in the index, " % len(dexistings) +
                     "%d documents will be added (%d skipped as already processed)"
                     % (len(dids), len(toremove)))
        # treat "real" added documents
        # Note: no need to check for "already exists" errors here, we already
        # checked above in order to know what to do
        try:
            res["added"] += indexer.index_bulk(dids.values(), batch_size,
                                               action="create")[0]
        except BulkIndexError:
            logging.error("Error while adding documents %s" % [k for k in dids.keys()])
        # update docs already existing in the cold collection
        try:
            res["updated"] += indexer.index_bulk(dexistings.values(), batch_size)[0]
        except BulkIndexError as e:
            logging.error("Error while updating (via new hot detected docs) documents: %s" % e)
    # update: get docs from the indexer and apply the diffs
    # note: it's the same process as for the non-coldhot sync
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res, debug)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
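# The hot-over-cold precedence described above is plain dict.update():
# fields present in the hot doc overwrite the cold ones, the rest is kept
# (values are made-up examples):
cold = {'_id': 'g1', 'symbol': 'A', 'cold_only': 1}
hot = {'_id': 'g1', 'symbol': 'B'}
merged = dict(cold)
merged.update(hot)
assert merged == {'_id': 'g1', 'symbol': 'B', 'cold_only': 1}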
def sync_es_jsondiff_worker(diff_file, es_config, new_db_col_names, batch_size,
                            cnt, force=False, selfcontained=False, metadata={},
                            debug=False):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it"
                     % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    eskwargs = {}
    # pass optional ES indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)" % (es_config, eskwargs))
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)
    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains whole documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff["source"], \
            "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        # remove any potential existing _timestamp from documents
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        try:
            res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0]
        except BulkIndexError:
            for doc in docs:
                _id = doc.pop("_id")
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, _id, action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    logging.warning("_id '%s' already added" % _id)
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": _id, "file": diff_file, "error": e})
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise
        except Exception as e:
            if debug:
                import pickle  # local import, mirroring the inner handler above
                logging.error("From diff file '%s', following IDs couldn't be synced because: %s\n%s"
                              % (diff_file, e, [d.get("_id") for d in docs]))
                pickfile = "batch_%s_%s.pickle" % (cnt, os.path.basename(diff_file))
                logging.error("Documents pickled in '%s'" % pickfile)
                pickle.dump(docs, open(pickfile, "wb"))
            raise
    # update: get docs from the indexer and apply the diffs
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res, debug)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
def sync_mongo_jsondiff_worker(diff_file, old_db_col_names, new_db_col_names,
                               batch_size, cnt, force=False, selfcontained=False,
                               metadata={}, debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if the diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it"
                     % os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name, logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains whole documents, no mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
        for docs in iter_n(cur, batch_size):
            # use a generator, otherwise process/doc_iterator would require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)
    # update: get docs from "old" and apply the diffs
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)
    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)
    # we potentially modified the "old" collection, so invalidate the cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
def get_geneid_d(data_folder, species_li=None, load_cache=True,
                 save_cache=True, only_for={}):
    '''Return a dictionary mapping current/retired geneids to current geneids.
       This is useful when other annotations mapped to geneids may contain
       retired gene ids. If species_li is None, genes from all species are
       loaded. Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)

    # check cache file
    _cache_file = 'geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene_info.gz') and \
       file_newer(_cache_file, 'gene_history.gz'):
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(data_folder, 'gene_info.gz')
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set and (only_for and ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: only_for and ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

    DATAFILE = os.path.join(data_folder, 'gene_history.gz')
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2), 1,
                            alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure the taxid belongs to species_li and keeps only
    # entries whose mapped_to geneid exists in the gene_info list

    # convert keys/values to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key == value pairs...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
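# A minimal usage sketch (folder path and species names are assumptions; each
# species must be a key of the TAXONOMY dict):
#
#   geneid_d = get_geneid_d('/path/to/entrez_data', species_li=['human', 'mouse'])
#   geneid_d.get(12345)  # retired-or-current id -> current id (int), or None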
def load_chr_data(self, genome_file):
    print("\tLoading chromosome data...", end='')
    self._chr_data = loadobj(genome_file)
    print("Done.")
def load_pdb(data_folder):
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    data = loadobj(pdb_dumpfile)
    return data
def load_pir(data_folder):
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    data = loadobj(pir_dumpfile)
    return data
def load_data(step=1000, offset=0, gwas_data_local=None):
    if gwas_data_local:
        gwas_data = loadobj('gwasdata.pyobj')
        for snp in gwas_data:
            chrom = snp[1]
            chrom = chrom[3:]
            rsid = snp[4]
            pubMedID = snp[5]
            title = snp[9]
            trait = snp[10]
            region = snp[13]
            gene_name = snp[14]
            riskAllele = snp[15]
            riskAlleleFreq = snp[16]
            if not is_float(riskAlleleFreq):
                riskAlleleFreq = None
            pValue = snp[17]
            pValue_desc = snp[18]
            if not is_float(pValue):
                pValue = None
                pValue_desc = None
            # query myvariant.info to get the hgvs _id, ref, alt information based on the rsid
            url = 'http://localhost:8000/v1/query?q=dbsnp.rsid:' \
                  + rsid + '&fields=_id,dbsnp.ref,dbsnp.alt,dbsnp.chrom,dbsnp.hg19'
            r = requests.get(url)
            for hits in r.json()['hits']:
                HGVS = hits['_id']
                one_snp_json = {
                    "_id": HGVS,
                    "gwassnp": {
                        "rsid": rsid,
                        "pubmed": pubMedID,
                        "title": title,
                        "trait": trait,
                        "region": region,
                        "genename": gene_name,
                        "risk_allele": riskAllele,
                        "risk_allele_freq": riskAlleleFreq,
                        "pvalue": pValue,
                        "pvalue_desc": pValue_desc
                    }
                }
                yield one_snp_json
    else:
        MySQLHG19 = MySQLdb.connect('genome-mysql.cse.ucsc.edu',
                                    db='hg19',
                                    user='******',
                                    passwd='password')
        Cursor = MySQLHG19.cursor()
        # get the row count of gwasCatalog
        sql = "SELECT COUNT(*) FROM gwasCatalog"
        Cursor.execute(sql)
        numrows = Cursor.fetchone()[0]
        print(numrows)
        sql = "SELECT * FROM gwasCatalog"
        Cursor.execute(sql)
        for i in range(numrows):
            snp = Cursor.fetchone()
            if i and i % step == 0:
                print(i)
            chrom = snp[1]
            chrom = chrom[3:]
            rsid = snp[4]
            pubMedID = snp[5]
            title = snp[9]
            trait = snp[10]
            region = snp[13]
            gene_name = snp[14]
            riskAllele = snp[15]
            riskAlleleFreq = snp[16]
            if not is_float(riskAlleleFreq):
                riskAlleleFreq = None
            pValue = snp[17]
            pValue_desc = snp[18]
            if not is_float(pValue):
                pValue = None
                pValue_desc = None
            # query myvariant.info to get the hgvs _id, ref, alt information based on the rsid
            url = 'http://localhost:8000/v1/query?q=dbsnp.rsid:' \
                  + rsid + '&fields=_id,dbsnp.ref,dbsnp.alt,dbsnp.chrom,dbsnp.hg19'
            r = requests.get(url)
            for hits in r.json()['hits']:
                HGVS = hits['_id']
                one_snp_json = {
                    "_id": HGVS,
                    "gwassnp": {
                        "rsid": rsid,
                        "pubmed": pubMedID,
                        "title": title,
                        "trait": trait,
                        "region": region,
                        "genename": gene_name,
                        "risk_allele": riskAllele,
                        "risk_allele_freq": riskAlleleFreq,
                        "pvalue": pValue,
                        "pvalue_desc": pValue_desc
                    }
                }
                yield one_snp_json
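# is_float() isn't defined in this excerpt; a minimal sketch of the assumed
# helper, which checks whether a value can be parsed as a float:
def is_float(value):
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True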
def sync_from_one_diff(index, collection, diff_filepath, validate=False,
                       wait=60, dryrun=False, returncnt=False, save2file=None):
    sync = ESSyncer(index=index)
    #sync._index = index
    #sync._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except Exception:
            pass
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # flush and refresh the index
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except Exception:
        pass

    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result