class DrugBankUploader(BaseDrugUploader):
    """
    DrugBankUploader - biothings uploader class for DrugBank
    """
    name = "drugbank"
    storage_class = storage.IgnoreDuplicatedStorage
    __metadata__ = {"src_meta": SRC_META}
    # See the comment on the ExcludeFieldsById for use of this class.
    exclude_fields = ExcludeFieldsById(exclusion_ids, [
        "drugbank.drug_interactions",
        "drugbank.products",
        "drugbank.mixtures",
    ])
    keylookup = MyChemKeyLookup(
        [
            ("inchikey", "drugbank.inchi_key"),
            ("drugbank", "drugbank.id"),
            # the following keys could possibly be used to look up 'inchikey' or 'unii'
            ("chebi", "drugbank.xrefs.chebi"),
            ("chembl", "drugbank.xrefs.chembl"),
            ("pubchem", "drugbank.xrefs.pubchem.cid"),
            ("inchi", "drugbank.inchi"),
            ("drugname", "drugbank.name"),  # can be used to lookup unii, disabled for now
        ],
        copy_from_doc=True)

    def load_data(self, data_folder):
        """Load DrugBank documents from the single XML file in *data_folder*.

        If no ``*.xml`` file is present, the archives in *data_folder* are
        unzipped first.  The parsed documents are passed through
        ``keylookup`` and then ``exclude_fields``.

        Raises:
            AssertionError: if *data_folder* does not contain exactly one
                XML file, or if that file does not exist.
        """
        xmlfiles = glob.glob(os.path.join(data_folder, "*.xml"))
        if not xmlfiles:
            self.logger.info("Unzipping drugbank archive")
            unzipall(data_folder)
        self.logger.info("Load data from '%s'", data_folder)
        xmlfiles = glob.glob(os.path.join(data_folder, "*.xml"))
        # Explicit raises (rather than ``assert``) so the checks survive
        # ``python -O``; the exception type is unchanged for callers.
        if len(xmlfiles) != 1:
            raise AssertionError(
                "Expecting one xml file, got %s" % repr(xmlfiles))
        input_file = xmlfiles[0]
        if not os.path.exists(input_file):
            raise AssertionError("Can't find input file '%s'" % input_file)
        return self.exclude_fields(self.keylookup(load_data, debug=True))(input_file)

    def post_update_data(self, *args, **kwargs):  # pylint: disable=W0613
        """create indexes following upload"""
        # NOTE(review): keylookup reads the chebi id from
        # "drugbank.xrefs.chebi"; confirm "drugbank.chebi" is still a
        # populated field worth indexing.
        for idxname in ("drugbank.id", "drugbank.chebi", "drugbank.inchi"):
            self.logger.info("Indexing '%s'", idxname)
            # background=true or it'll lock the whole database...
            self.collection.create_index([(idxname, pymongo.HASHED)],
                                         background=True)
        # A hashed index won't support arrays; the values are small enough
        # for a standard index.  Built in the background as well, for the
        # same locking reason as the hashed indexes above.
        self.collection.create_index("drugbank.products.ndc_product_code",
                                     background=True)

    @classmethod
    def get_mapping(cls):
        """return mapping information for drugbank"""
        return drugbank_mapping
class NDCUploader(BaseDrugUploader):
    """
    NDCUploader - Biothings Uploader class for NDC
    """
    name = "ndc"
    storage_class = (storage.RootKeyMergerStorage, storage.CheckSizeStorage)
    __metadata__ = {"src_meta": SRC_META}
    keylookup = MyChemKeyLookup([
        ("ndc", "ndc.productndc"),
        ("drugname", "ndc.nonproprietaryname"),
    ])
    # See the comment on the ExcludeFieldsById for use of this class.
    exclude_fields = ExcludeFieldsById(exclusion_ids, ["ndc"])

    def load_data(self, data_folder):
        """Parse *data_folder*, applying keylookup and then field exclusion."""
        looked_up = self.keylookup(load_data)
        filtered = self.exclude_fields(looked_up)
        return filtered(data_folder)

    @classmethod
    def get_mapping(cls):
        """Build and return the mapping for the "ndc" document key."""

        def keyword(**extra):
            # keyword field using the "keyword_lowercase_normalizer" normalizer
            spec = {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"}
            spec.update(extra)
            return spec

        def text():
            # plain text field
            return {"type": "text"}

        package_fields = {
            "packagedescription": keyword(),
            "ndcpackagecode": text(),
        }
        ndc_fields = {
            "product_id": keyword(),
            "productndc": text(),
            "producttypename": keyword(),
            "proprietaryname": keyword(),
            "proprietarynamesuffix": keyword(),
            "nonproprietaryname": keyword(),
            "dosageformname": keyword(),
            "routename": keyword(),
            "startmarketingdate": text(),
            "endmarketingdate": text(),
            "marketingcategoryname": keyword(),
            "applicationnumber": keyword(),
            "labelername": keyword(),
            "substancename": keyword(copy_to=["all"]),
            "active_numerator_strength": text(),
            "active_ingred_unit": keyword(),
            "pharm_classes": keyword(),
            "deaschedule": keyword(),
            "package": {"properties": package_fields},
        }
        return {"ndc": {"properties": ndc_fields}}
class ChebiUploader(BaseDrugUploader):
    """ChebiUploader - uploader class for the "chebi" source."""
    name = "chebi"
    #storage_class = storage.IgnoreDuplicatedStorage
    storage_class = storage.RootKeyMergerStorage
    __metadata__ = {"src_meta": SRC_META}
    keylookup = MyChemKeyLookup([
        ('inchikey', 'chebi.inchikey'),
        ('drugbank', 'chebi.xrefs.drugbank'),
        ('chebi', 'chebi.id'),
    ], copy_from_doc=True)
    # See the comment on the ExcludeFieldsById for use of this class.
    exclude_fields = ExcludeFieldsById(exclusion_ids, [
        "chebi.xrefs.intenz",
        "chebi.xrefs.rhea",
        "chebi.xrefs.uniprot",
        "chebi.xrefs.sabio_rk",
        "chebi.xrefs.patent",
    ])

    def load_data(self, data_folder):
        """Load documents from ``ChEBI_complete.sdf`` in *data_folder*.

        Before parsing, checks that the 'drugbank' and 'chembl' source
        collections are non-empty (they are required for inchikey
        conversion) and that the input file exists.  The parsed documents
        are passed through ``exclude_fields`` only; keylookup is disabled.

        Raises:
            AssertionError: if a required collection is empty or the input
                file is missing.
        """
        self.logger.info("Load data from '%s'", data_folder)
        input_file = os.path.join(data_folder, "ChEBI_complete.sdf")
        # get others source collection for inchi key conversion
        src_db = get_src_db()
        for required in ("drugbank", "chembl"):
            # find_one() instead of Collection.count(): count() is
            # deprecated in PyMongo 3.7 and removed in PyMongo 4, and a
            # single-document probe is all that is needed here.
            if src_db[required].find_one() is None:
                raise AssertionError(
                    "'%s' collection is empty (required for inchikey "
                    "conversion). Please run '%s' uploader first"
                    % (required, required))
        # Explicit raise (rather than ``assert``) so the check survives
        # ``python -O``; the exception type is unchanged for callers.
        if not os.path.exists(input_file):
            raise AssertionError("Can't find input file '%s'" % input_file)
        # KeyLookup is disabled due to duplicate key errors
        # return self.exclude_fields(self.keylookup(load_data, debug=True))(input_file)
        return self.exclude_fields(load_data)(input_file)

    def post_update_data(self, *args, **kwargs):
        """Create an ascending index on "chebi.id" after the upload."""
        for idxname in ["chebi.id"]:
            self.logger.info("Indexing '%s'", idxname)
            # background=true or it'll lock the whole database...
            self.collection.create_index([(idxname, pymongo.ASCENDING)],
                                         background=True)

    @classmethod
    def get_mapping(cls):
        """Build and return the mapping for the "chebi" document key."""

        def keyword(**extra):
            # keyword field using the "keyword_lowercase_normalizer" normalizer
            spec = {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"}
            spec.update(extra)
            return spec

        def typed(kind, **extra):
            # field of the given type, plus any extra options
            spec = {"type": kind}
            spec.update(extra)
            return spec

        xrefs = {
            "molbase": keyword(),
            "resid": keyword(),
            "come": keyword(),
            "pubchem": {
                "properties": {
                    "sid": typed("integer"),
                    "cid": typed("integer"),
                }
            },
            "beilstein": keyword(),
            "wikipedia": {
                "properties": {
                    "url_stub": keyword(),
                }
            },
            "metacyc": keyword(),
            "biomodels": keyword(),
            "reactome": keyword(),
            "um_bbd_compid": keyword(),
            "lincs": keyword(),
            "uniprot": keyword(),
            "sabio_rk": keyword(),
            "patent": keyword(),
            "pdbechem": keyword(),
            "arrayexpress": keyword(),
            "cas": keyword(),
            "lipid_maps_class": keyword(),
            "kegg_drug": keyword(),
            "knapsack": keyword(),
            "lipid_maps_instance": keyword(),
            "intenz": typed("text"),
            "kegg_glycan": keyword(),
            "ecmdb": keyword(),
            "hmdb": keyword(),
            "kegg_compound": keyword(),
            "ymdb": keyword(),
            "drugbank": keyword(),
            "rhea": keyword(),
            "gmelin": keyword(),
            "intact": keyword(),
        }
        citation = {
            "pubmed": keyword(),
            "agricola": keyword(),
            "pmc": keyword(),
            "chinese_abstracts": typed("integer"),
            "citexplore": keyword(),
        }
        mapping = {
            "chebi": {
                "properties": {
                    "brand_names": typed("text", copy_to=["all"]),
                    "id": keyword(copy_to=["all"]),
                    "iupac": typed("text"),
                    "inchi": keyword(),
                    "definition": typed("text"),
                    "star": typed("integer"),
                    "smiles": keyword(),
                    "last_modified": typed("text"),
                    "inn": typed("text"),
                    "xrefs": {"properties": xrefs},
                    "monoisotopic_mass": typed("float"),
                    "mass": typed("float"),
                    "secondary_chebi_id": keyword(copy_to=["all"]),
                    "formulae": keyword(),
                    "inchikey": keyword(),
                    "name": typed("text", copy_to=["all"]),
                    "charge": typed("integer"),
                    "synonyms": typed("text"),
                    "citation": {"properties": citation},
                }
            }
        }
        return mapping
class ChemblUploader(BaseDrugUploader, ParallelizedSourceUploader):
    """
    ChemblUploader - upload the Chembl data source
    """
    name = "chembl"
    storage_class = storage.RootKeyMergerStorage
    __metadata__ = {"src_meta": SRC_META}
    MOLECULE_PATTERN = "molecule.*.json"
    keylookup = MyChemKeyLookup(
        [
            ("inchikey", "chembl.inchi_key"),
            ("inchi", "chembl.inchi"),
            ("chembl", "chembl.molecule_chembl_id"),
            ("chebi", "chembl.chebi_par_id"),
            ("drugcentral", "chembl.xrefs.drugcentral.id"),
            ("drugname", "chembl.pref_name"),
        ],
        # TODO: handle duplicate keys from pubchem
        # - we use RootKeyMergerStorage, but the num. duplicates
        # - is too high (>10000)
        # ("pubchem", "chembl.xrefs.pubchem.sid"),
        copy_from_doc=True)

    def jobs(self):
        """
        Generate the argument tuples for self.load_data(), one per molecule
        JSON file found in self.data_folder, allowing parallelization.
        """
        pattern = os.path.join(self.data_folder,
                               self.__class__.MOLECULE_PATTERN)
        return [(path,) for path in glob.glob(pattern)]

    def load_data(self, input_file):
        """Parse one input file and run its documents through keylookup."""
        self.logger.info("Load data from file '%s'", input_file)
        parse = self.keylookup(load_data, debug=True)
        return parse(input_file)

    def post_update_data(self, *args, **kwargs):  # pylint: disable=W0613
        """create indexes following an update"""
        # "chembl.inchi" is not indexed here; only the two id fields are.
        indexed_fields = ("chembl.chebi_par_id", "chembl.molecule_chembl_id")
        for idxname in indexed_fields:
            self.logger.info("Indexing '%s'", idxname)
            # background=true or it'll lock the whole database...
            self.collection.create_index(idxname, background=True)

    @classmethod
    def get_mapping(cls):
        """Build and return the mapping for the "chembl" document key."""

        def keyword(**extra):
            # keyword field using the "keyword_lowercase_normalizer" normalizer
            spec = {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"}
            spec.update(extra)
            return spec

        def typed(kind, **extra):
            # field of the given type, plus any extra options
            spec = {"type": kind}
            spec.update(extra)
            return spec

        def nested(props):
            # object field wrapping the given sub-field definitions
            return {"properties": props}

        biotherapeutic = nested({
            "helm_notation": keyword(),
            "description": typed("text"),
            "biocomponents": nested({
                "organism": typed("text"),
                "tax_id": typed("integer"),
                "sequence": typed("text"),
                "component_id": typed("integer"),
                "description": typed("text"),
                "component_type": keyword(),
            }),
            "molecule_chembl_id": keyword(copy_to=["all"]),
        })
        molecule_properties = nested({
            "heavy_atoms": typed("integer"),
            "acd_most_bpka": typed("float"),
            "mw_freebase": typed("float"),
            "num_ro5_violations": typed("integer"),
            "molecular_species": keyword(),
            "qed_weighted": typed("float"),
            "ro3_pass": typed("boolean"),
            "full_mwt": typed("float"),
            "num_lipinski_ro5_violations": typed("integer"),
            "rtb": typed("integer"),
            "psa": typed("float"),
            "alogp": typed("float"),
            "hbd": typed("integer"),
            "acd_most_apka": typed("float"),
            "hbd_lipinski": typed("integer"),
            "acd_logp": typed("float"),
            "full_molformula": keyword(),
            "aromatic_rings": typed("integer"),
            "hba_lipinski": typed("integer"),
            "mw_monoisotopic": typed("float"),
            "hba": typed("integer"),
            "acd_logd": typed("float"),
        })
        molecule_hierarchy = nested({
            "parent_chembl_id": keyword(),
            "molecule_chembl_id": keyword(),
        })
        molecule_synonyms = nested({
            "molecule_synonym": typed("text"),
            "synonyms": typed("text"),
            "syn_type": keyword(),
        })
        xrefs = nested({
            "drugcentral": nested({
                "id": typed("integer"),
                "name": typed("text"),
            }),
            "tg-gates": nested({
                "id": typed("integer"),
                "name": typed("text"),
            }),
            "wikipedia": nested({
                "url_stub": keyword(),
            }),
            "dailymed": nested({
                "name": typed("text"),
            }),
            "pubchem": nested({
                "sid": typed("integer"),
            }),
        })
        chembl = nested({
            "biotherapeutic": biotherapeutic,
            "therapeutic_flag": typed("boolean"),
            "usan_stem": typed("text"),
            "molecule_chembl_id": keyword(),
            "molecule_properties": molecule_properties,
            "helm_notation": keyword(),
            "max_phase": typed("integer"),
            "inorganic_flag": typed("integer"),
            "usan_stem_definition": typed("text"),
            "dosed_ingredient": typed("boolean"),
            "chebi_par_id": keyword(),
            "withdrawn_reason": typed("text"),
            "molecule_hierarchy": molecule_hierarchy,
            "prodrug": typed("integer"),
            "withdrawn_flag": typed("boolean"),
            "usan_year": typed("integer"),
            "parenteral": typed("boolean"),
            "black_box_warning": typed("integer"),
            "polymer_flag": typed("boolean"),
            "molecule_synonyms": molecule_synonyms,
            "atc_classifications": keyword(),
            "molecule_type": typed("text"),
            "first_in_class": typed("integer"),
            "inchi": keyword(),
            "structure_type": keyword(),
            "withdrawn_class": typed("text"),
            "inchi_key": keyword(),
            "topical": typed("boolean"),
            "oral": typed("boolean"),
            "xrefs": xrefs,
            "chirality": typed("integer"),
            "usan_substem": typed("text"),
            "indication_class": typed("text"),
            "withdrawn_country": typed("text"),
            "withdrawn_year": typed("integer"),
            "availability_type": typed("integer"),
            "smiles": keyword(),
            "natural_product": typed("integer"),
            "pref_name": typed("text", copy_to=["all"]),
            "first_approval": typed("integer"),
        })
        return {"chembl": chembl}
class NDCUploader(BaseDrugUploader):
    """NDCUploader - uploader class for the "ndc" source.

    Documents that resolve to the same ``_id`` after keylookup are merged
    into a single document by ``load_data``.
    """
    name = "ndc"
    storage_class = (storage.BasicStorage, storage.CheckSizeStorage)
    __metadata__ = {"src_meta": SRC_META}
    keylookup = MyChemKeyLookup([("ndc", "ndc.productndc")])
    # See the comment on the ExcludeFieldsById for use of this class.
    exclude_fields = ExcludeFieldsById(exclusion_ids, ["ndc"])

    def load_data(self, data_folder):
        """Yield one ``{"_id": ..., "ndc": ...}`` document per distinct ``_id``.

        The parsed documents are passed through ``keylookup`` and
        ``exclude_fields``.  Several product NDCs can resolve to the same
        ``_id``, so their "ndc" entries are collected into one list per
        ``_id`` (in first-seen order, without duplicates).  An ``_id`` with
        exactly one entry yields that entry itself instead of a list.

        A list-valued "ndc" is merged entry by entry, so it is not dropped
        when its ``_id`` has already been seen, and the incoming document's
        own list is never mutated.
        """
        docs = self.exclude_fields(self.keylookup(load_data))(data_folder)
        merged = {}
        for doc in docs:
            incoming = doc["ndc"]
            if not isinstance(incoming, list):
                incoming = [incoming]
            entries = merged.setdefault(doc["_id"], [])
            for entry in incoming:
                if entry not in entries:
                    entries.append(entry)
        for doc_id, entries in merged.items():
            # Unwrap singletons so single-product documents keep a plain
            # "ndc" value rather than a one-element list.
            ndc = entries[0] if len(entries) == 1 else entries
            yield {"_id": doc_id, "ndc": ndc}

    @classmethod
    def get_mapping(cls):
        """Build and return the mapping for the "ndc" document key."""

        def keyword(**extra):
            # keyword field using the "keyword_lowercase_normalizer" normalizer
            spec = {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"}
            spec.update(extra)
            return spec

        def text():
            # plain text field
            return {"type": "text"}

        mapping = {
            "ndc": {
                "properties": {
                    "product_id": keyword(),
                    "productndc": text(),
                    "producttypename": keyword(),
                    "proprietaryname": keyword(),
                    "proprietarynamesuffix": keyword(),
                    "nonproprietaryname": keyword(),
                    "dosageformname": keyword(),
                    "routename": keyword(),
                    "startmarketingdate": text(),
                    "endmarketingdate": text(),
                    "marketingcategoryname": keyword(),
                    "applicationnumber": keyword(),
                    "labelername": keyword(),
                    "substancename": keyword(copy_to=["all"]),
                    "active_numerator_strength": text(),
                    "active_ingred_unit": keyword(),
                    "pharm_classes": keyword(),
                    "deaschedule": keyword(),
                    "package": {
                        "properties": {
                            "packagedescription": keyword(),
                            "ndcpackagecode": text(),
                        }
                    },
                }
            }
        }
        return mapping
class UniiUploader(BaseDrugUploader):
    """UniiUploader - uploader class for the "unii" source."""
    name = "unii"
    storage_class = storage.IgnoreDuplicatedStorage
    __metadata__ = {"src_meta": SRC_META}
    keylookup = MyChemKeyLookup(
        [
            ('inchikey', 'unii.inchikey'),
            ('pubchem', 'unii.pubchem'),
            ('unii', 'unii.unii'),
        ],
        copy_from_doc=True,
    )

    def load_data(self, data_folder):
        """Parse the single ``*Records*.txt`` file found in *data_folder*.

        The documents are returned straight from the parser: keylookup is
        deliberately not applied here.
        """
        self.logger.info("Load data from '%s'", data_folder)
        pattern = os.path.join(data_folder, "*Records*.txt")
        candidates = glob.glob(pattern)
        assert len(candidates) == 1, \
            "Expecting one record.txt file, got %s" % repr(candidates)
        input_file = candidates[-1]
        assert os.path.exists(input_file), \
            "Can't find input file '%s'" % input_file
        # disable keylookup - unii is a base collection used for drugname lookup
        # and should be loaded first, (keylookup commented out)
        # return self.keylookup(load_data)(input_file)
        return load_data(input_file)

    def post_update_data(self, *args, **kwargs):
        """Create background indexes on the unii code and preferred term."""
        indexed_fields = ("unii.unii", "unii.preferred_term")
        for field in indexed_fields:
            self.logger.info("Indexing '%s'", field)
            self.collection.create_index(field, background=True)

    @classmethod
    def get_mapping(cls):
        """Build and return the mapping for the "unii" document key."""

        def keyword(**extra):
            # keyword field using the "keyword_lowercase_normalizer" normalizer
            spec = {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"}
            spec.update(extra)
            return spec

        unii_fields = {
            "unii": keyword(copy_to=["all"]),
            "preferred_term": keyword(),
            "registry_number": keyword(),
            "ec": keyword(),
            "ncit": keyword(),
            "rxcui": keyword(),
            "itis": keyword(),
            "ncbi": keyword(),
            "plants": keyword(),
            "grin": keyword(),
            "inn_id": keyword(),
            "molecular_formula": keyword(),
            "inchikey": keyword(),
            "smiles": keyword(),
            "unii_type": {"type": "text"},
        }
        return {"unii": {"properties": unii_fields}}
class AeolusUploader(BaseDrugUploader):
    """AeolusUploader - uploader class for the "aeolus" source."""
    storage_class = storage.RootKeyMergerStorage
    name = "aeolus"
    __metadata__ = {
        "src_meta": {
            "url": "http://www.nature.com/articles/sdata201626",
            "license_url": "http://datadryad.org/resource/doi:10.5061/dryad.8q0s4",
            "license_url_short": "http://bit.ly/2DIxWwF",
            "license": "CC0 1.0",
        }
    }
    keylookup = MyChemKeyLookup(
        [
            ('inchikey', 'aeolus.inchikey'),
            ('unii', 'aeolus.unii'),
            ('drugname', 'aeolus.drug_name'),
        ],
        copy_from_doc=True)

    def load_data(self, data_folder):
        """Re-read this source's own collection and apply keylookup to it.

        *data_folder* is not used: documents come from
        ``self.db[self.src_col_name]``.
        """
        # read data from the source collection
        source_collection = self.db[self.src_col_name]

        def read_source_docs():
            for stored_doc in source_collection.find():
                yield stored_doc

        # perform keylookup on source collection
        converted = self.keylookup(read_source_docs, debug=True)
        return converted()

    @classmethod
    def get_mapping(cls):
        """Build and return the mapping for the "aeolus" document key."""

        def keyword():
            # keyword field using the "keyword_lowercase_normalizer" normalizer
            return {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"}

        def typed(kind, **extra):
            # field of the given type, plus any extra options
            spec = {"type": kind}
            spec.update(extra)
            return spec

        relationships = {
            "relatedSubstance": {
                "properties": {
                    "approvalID": keyword(),
                    "refPname": keyword(),
                }
            },
            "type": keyword(),
        }
        outcomes = {
            "meddra_code": keyword(),
            "case_count": typed("long"),
            "id": keyword(),
            "name": typed("text"),
            "prr": typed("float"),
            "prr_95_ci": typed("float"),
            "ror": typed("float"),
            "ror_95_ci": typed("float"),
        }
        aeolus_fields = {
            "drug_id": keyword(),
            "drug_name": typed("text", copy_to=["all"]),
            "inchikey": keyword(),
            "no_of_outcomes": typed("integer"),
            "pt": typed("text"),
            "unii": keyword(),
            "drug_vocab": typed("text"),
            "drug_code": keyword(),
            "rxcui": keyword(),
            "relationships": {"properties": relationships},
            "outcomes": {"properties": outcomes},
        }
        return {"aeolus": {"properties": aeolus_fields}}
class PharmGkbUploader(BaseDrugUploader):
    """
    PharmGKB Uploader Class
    """
    name = "pharmgkb"
    storage_class = storage.RootKeyMergerStorage
    __metadata__ = {"src_meta": SRC_META}
    keylookup = MyChemKeyLookup([
        ('inchi', 'pharmgkb.inchi'),
        ('pubchem', 'pharmgkb.xrefs.pubchem.cid'),
        ('drugbank', 'pharmgkb.xrefs.drugbank'),
        ('chebi', 'pharmgkb.xrefs.chebi'),
    ])

    def load_data(self, data_folder):
        """Parse ``drugs.tsv`` from *data_folder* and apply keylookup."""
        self.logger.info("Load data from '%s'", data_folder)
        input_file = os.path.join(data_folder, "drugs.tsv")
        assert os.path.exists(input_file), \
            "Can't find input file '%s'" % input_file
        parse = self.keylookup(load_data)
        return parse(input_file)

    def post_update_data(self, *args, **kwargs):
        """Create a background index on "pharmgkb.id" after the upload."""
        field = "pharmgkb.id"
        self.logger.info("Indexing '%s'", field)
        self.collection.create_index(field, background=True)

    @classmethod
    def get_mapping(cls):
        """Build and return the mapping for the "pharmgkb" document key."""

        def keyword(**extra):
            # keyword field using the "keyword_lowercase_normalizer" normalizer
            spec = {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"}
            spec.update(extra)
            return spec

        def text(**extra):
            # text field, plus any extra options
            spec = {"type": "text"}
            spec.update(extra)
            return spec

        def nested(props):
            # object field wrapping the given sub-field definitions
            return {"properties": props}

        xrefs = nested({
            "web_resource": keyword(),
            "uniprotkb": keyword(),
            "pubchem": nested({
                "sid": keyword(),
                "cid": keyword(),
            }),
            "het": keyword(),
            "wikipedia": nested({
                "url_stub": keyword(),
            }),
            "iuphar_ligand": keyword(),
            "meddra": keyword(),
            "atc": keyword(),
            "kegg_compound": keyword(),
            "umls": keyword(),
            "clinicaltrials": nested({
                "gov": keyword(),
            }),
            "genbank": keyword(),
            "rxnorm": keyword(),
            "chebi": keyword(),
            "cas": keyword(),
            "ttd": keyword(),
            "kegg_drug": keyword(),
            "mesh": keyword(),
            "ndc": keyword(),
            "chemspider": keyword(),
            "hmdb": keyword(),
            "dailymed": nested({
                "setid": keyword(),
            }),
            "ndfrt": keyword(),
            "bindingdb": keyword(),
            "drugbank": keyword(),
            "pdb": keyword(),
            "dpd": keyword(),
        })
        pharmgkb = nested({
            "id": keyword(copy_to=["all"]),
            "dosing_guideline": keyword(),
            "inchi": keyword(),
            "name": text(copy_to=["all"]),
            "smiles": keyword(),
            "generic_names": text(copy_to=["all"]),
            "brand_mixtures": text(),
            "trade_names": text(),
            "type": text(),
            "xrefs": xrefs,
        })
        return {"pharmgkb": pharmgkb}
class SiderUploader(BaseDrugUploader):
    """SiderUploader - uploader class for the "sider" source."""
    name = "sider"
    #storage_class = storage.IgnoreDuplicatedStorage
    __metadata__ = {"src_meta": SRC_META}
    keylookup = MyChemKeyLookup([("pubchem", "_id")],
                                idstruct_class=SiderIDStruct)
    # upper bound on the number of entries kept in a document's 'sider' list
    max_lst_size = 2000

    def load_data(self, data_folder):
        """Yield keylookup-converted documents with a sorted, capped 'sider' list.

        Documents are parsed from ``merged_freq_all_se_indications.tsv`` in
        *data_folder*.  Each document's 'sider' list is sorted with
        ``sort_key`` and then cut down to at most ``self.max_lst_size``
        entries.
        """
        input_file = os.path.join(data_folder,
                                  "merged_freq_all_se_indications.tsv")
        self.logger.info("Load data from file '%s'", input_file)
        for doc in self.keylookup(load_data)(input_file):
            # sort the 'sider' list by "sider.side_effect.frequency" and "sider.side_effect.name"
            ranked = sorted(doc['sider'], key=sort_key)
            # take at most self.max_lst_size elements from the 'sider' field
            # See the 'truncated_docs.tsv' file for a list of ids that are affected
            if len(ranked) > self.max_lst_size:
                ranked = ranked[:self.max_lst_size]
            doc['sider'] = ranked
            yield doc

    @classmethod
    def get_mapping(cls):
        """Build and return the mapping for the "sider" document key."""

        def keyword():
            # keyword field using the "keyword_lowercase_normalizer" normalizer
            return {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"}

        def nested(props):
            # object field wrapping the given sub-field definitions
            return {"properties": props}

        sider = nested({
            "stitch": nested({
                "flat": keyword(),
                "stereo": keyword(),
            }),
            "indication": nested({
                "method_of_detection": keyword(),
                "name": {"type": "text"},
            }),
            "meddra": nested({
                "type": keyword(),
                "umls_id": keyword(),
            }),
            "side_effect": nested({
                "frequency": keyword(),
                "placebo": {"type": "boolean"},
                "name": {"type": "text"},
            }),
        })
        return {"sider": sider}
class DrugCentralUploader(BaseDrugUploader):
    """
    DrugCentralUploader - uploader class for the "drugcentral" data source
    """
    name = "drugcentral"
    __metadata__ = {
        "src_meta": {
            "url": "http://drugcentral.org/",
            "license_url": "http://drugcentral.org/privacy",
            "license_url_short": "http://bit.ly/2SeEhUy",
            "license": "CC BY-SA 4.0",
        }
    }

    keylookup = MyChemKeyLookup(
        [
            ("inchikey", "drugcentral.structures.inchikey"),
            # 'unii' lives under the 'xrefs' object (see get_mapping and the
            # sibling entries below); the singular 'xref' path matched nothing
            ("unii", "drugcentral.xrefs.unii"),
            # other keys are present but not currently used by keylookup
            ("inchi", "drugcentral.structures.inchi"),
            ("drugbank", "drugcentral.xrefs.drugbank_id"),
            ("chebi", "drugcentral.xrefs.chebi"),
            ("chembl", "drugcentral.xrefs.chembl_id"),
            ("pubchem", "drugcentral.xrefs.pubchem_cid"),
        ],
        # ('drugname', 'drugcentral.synonyms')],  # unhashable type - list
        copy_from_doc=True,
    )

    def load_data(self, data_folder):
        """load data from the source collection

        `data_folder` is not used: documents are read from the collection
        `self.db[self.src_col_name]` and passed through the key lookup.
        """
        # read data from the source collection
        src_col = self.db[self.src_col_name]

        def load_data():
            yield from src_col.find()

        # perform keylookup on source collection
        return self.keylookup(load_data)()

    @classmethod
    def get_mapping(cls):
        """return mapping data for the class"""

        # Each factory returns a fresh dict so that no two fields of the
        # returned mapping share (and could accidentally co-mutate) an object.
        def keyword():
            return {
                "normalizer": "keyword_lowercase_normalizer",
                "type": "keyword",
            }

        def typed(field_type):
            return {"type": field_type}

        def searchable_text():
            # text field that is additionally copied into the 'all' field
            return {"type": "text", "copy_to": ["all"]}

        def coded_description():
            # shape shared by every 'pharmacology_class' sub-object
            return {
                "properties": {
                    "description": typed("text"),
                    "code": keyword(),
                }
            }

        def snomed_concept():
            # shape shared by every 'drug_use' sub-object
            return {
                "properties": {
                    "snomed_full_name": typed("text"),
                    "cui_semantic_type": keyword(),
                    "concept_name": typed("text"),
                    "umls_cui": keyword(),
                    "snomed_concept_id": typed("long"),
                }
            }

        pharmacology_classes = (
            "chebi",
            "fda_epc",
            "fda_pe",
            "fda_chemical/ingredient",
            "fda_moa",
            "mesh_pa",
        )
        drug_use_relations = (
            "reduce risk",
            "indication",
            "contraindication",
            "symptomatic treatment",
            "off-label use",
            "diagnosis",
        )
        xref_fields = (
            "pubchem_cid",
            "nui",
            "nddf",
            "pdb_chem_id",
            "kegg_drug",
            "secondary_cas_rn",
            "vandf",
            "ndfrt",
            "chembl_id",
            "drugbank_id",
            "inn_id",
            "mmsl",
            "snomedct_us",
            "mesh_supplemental_record_ui",
            "unii",
            "umlscui",
            "chebi",
            "mesh_descriptor_ui",
            "vuid",
            "iuphar_ligand_id",
            "rxnorm",
        )

        mapping = {
            "drugcentral": {
                "properties": {
                    "structures": {
                        "properties": {
                            "smiles": keyword(),
                            "cas_rn": keyword(),
                            "inn": searchable_text(),
                            "inchi": keyword(),
                            "inchikey": keyword(),
                        }
                    },
                    "fda_adverse_event": {
                        "properties": {
                            "llr": typed("float"),
                            "meddra_term": typed("text"),
                            "llr_threshold": typed("float"),
                            "drug_ae": typed("integer"),
                            "level": keyword(),
                            "drug_no_ae": typed("integer"),
                            "no_drug_no_ar": typed("integer"),
                            "meddra_code": typed("integer"),
                            "no_drug_ae": typed("integer"),
                        }
                    },
                    "drug_dosage": {
                        "properties": {
                            "unit": keyword(),
                            "route": typed("text"),
                            "dosage": typed("float"),
                        }
                    },
                    "pharmacology_class": {
                        "properties": {
                            class_name: coded_description()
                            for class_name in pharmacology_classes
                        }
                    },
                    "approval": {
                        "properties": {
                            "agency": typed("text"),
                            "date": keyword(),
                            "company": typed("text"),
                            "orphan": keyword(),
                        }
                    },
                    "drug_use": {
                        "properties": {
                            relation: snomed_concept()
                            for relation in drug_use_relations
                        }
                    },
                    "bioactivity": {
                        "properties": {
                            "organism": typed("text"),
                            "target_class": typed("text"),
                            "action_type": typed("text"),
                            "moa": typed("float"),
                            "target_name": typed("text"),
                            "act_type": typed("text"),
                            "moa_source": typed("text"),
                            "uniprot": {
                                "properties": {
                                    "uniprot_id": keyword(),
                                    "swissprot_entry": keyword(),
                                    "gene_symbol": typed("text"),
                                }
                            },
                            "act_source": typed("text"),
                            "act_value": typed("float"),
                        }
                    },
                    "synonyms": searchable_text(),
                    "xrefs": {
                        "properties": {
                            field: keyword() for field in xref_fields
                        }
                    },
                }
            }
        }
        return mapping