def parserIndications(databases_directory, drugMapping, phenotypeMapping, download=True):
    config = builder_utils.get_config(config_name="siderConfig.yml", data_type='databases')
    url = config['SIDER_indications']
    header = config['indications_header']
    output_file = 'sider_is_indicated_for.tsv'
    relationships = set()
    directory = os.path.join(databases_directory, "SIDER")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    associations = gzip.open(fileName, 'r')
    for line in associations:
        data = line.decode('utf-8').rstrip("\r\n").split("\t")
        drug = re.sub(r'CID\d', 'CIDm', data[0])
        se = data[1]
        evidence = data[2]
        if se.lower() in phenotypeMapping and drug in drugMapping:
            for d in drugMapping[drug]:
                p = phenotypeMapping[se.lower()]
                relationships.add((d, p, "IS_INDICATED_FOR", evidence, "SIDER", se))
    associations.close()
    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)

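# parserIndications (and the side-effect parser below) normalize STITCH compound
# identifiers with re.sub(r'CID\d', 'CIDm', ...), collapsing the flat/stereo prefix
# digit into the merged-compound form so the IDs match the keys returned by
# getSTRINGMapping. A minimal sketch of that substitution (the identifier is made up):
def _demo_flatten_stitch_cid():
    import re
    assert re.sub(r'CID\d', 'CIDm', 'CID100000085') == 'CIDm00000085'
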
def parsePairs(config, databases_directory, qtype, mapping, download=True):
    url = config['db_url']
    ifile = config['db_files'][qtype]
    source = config['db_sources'][qtype]
    relationships = set()
    directory = os.path.join(databases_directory, "Jensenlab")
    builder_utils.checkDirectory(os.path.join(directory, "integration"))
    if download:
        builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "integration"))
    ifile = os.path.join(directory, os.path.join("integration", ifile))
    with open(ifile, 'r') as idbf:
        for line in idbf:
            data = line.rstrip("\r\n").split('\t')
            id1 = "9606." + data[0]
            id2 = data[2]
            score = float(data[4])
            if id1 in mapping:
                for ident in mapping[id1]:
                    relationships.add((ident, id2, "ASSOCIATED_WITH_INTEGRATED", source, score, "compiled"))

    return relationships

def parser(databases_directory, drug_source, download=True):
    config = builder_utils.get_config(config_name="siderConfig.yml", data_type='databases')
    url = config['SIDER_url']
    header = config['header']
    output_file = 'sider_has_side_effect.tsv'
    drugmapping = mp.getSTRINGMapping(source=drug_source, download=download, db="STITCH")
    phenotypemapping = mp.getMappingFromOntology(ontology="Phenotype", source=config['SIDER_source'])
    relationships = set()
    directory = os.path.join(databases_directory, "SIDER")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    associations = gzip.open(fileName, 'r')
    for line in associations:
        data = line.decode('utf-8').rstrip("\r\n").split("\t")
        drug = re.sub(r'CID\d', 'CIDm', data[0])
        se = data[2]
        evidence_from = str(data[3])
        #freq = data[4]
        #lower_bound = data[5]
        #upper_bound = data[6]
        if se.lower() in phenotypemapping and drug in drugmapping:
            for d in drugmapping[drug]:
                p = phenotypemapping[se.lower()]
                relationships.add((d, p, "HAS_SIDE_EFFECT", "SIDER", se, evidence_from))
    associations.close()

    return (relationships, header, output_file, drugmapping, phenotypemapping)

def parser(databases_directory, download=True): config = builder_utils.get_config(config_name="signorConfig.yml", data_type='databases') directory = os.path.join(databases_directory, "SIGNOR") builder_utils.checkDirectory(directory) url = config['url'] modifications = config['modifications'] amino_acids = config['amino_acids'] accronyms = config['accronyms'] entities_header = config['entities_header'] relationships_headers = config['rel_headers'] entities = set() relationships = defaultdict(set) filename = os.path.join(directory, url.split('/')[-1]) if download: builder_utils.downloadDB(url, directory) entities, relationships = parse_substrates(filename, modifications, accronyms, amino_acids) return entities, relationships, entities_header, relationships_headers
def parseUniProtAnnotations(config, databases_directory, download=True):
    roots = {'F': 'Molecular_function', 'C': 'Cellular_component', 'P': 'Biological_process'}
    url = config['uniprot_go_annotations']
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    af = builder_utils.read_gzipped_file(fileName)
    for line in af:
        if line.startswith('!'):
            continue
        data = line.rstrip("\r\n").split("\t")
        identifier = data[1]
        go = data[4]
        evidence = data[6]
        root = data[8]
        if root in roots:
            root = roots[root]
            relationships[(root, 'associated_with')].add((identifier, go, "ASSOCIATED_WITH", evidence, 5, "UniProt"))

    return relationships

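# parseUniProtAnnotations assumes the GAF layout used by UniProt's GO annotation files:
# 0-indexed column 1 holds the protein accession, column 4 the GO term, column 6 the
# evidence code, and column 8 the aspect (F/P/C) that is expanded via `roots`.
# A sketch with a made-up row (the accession and reference are illustrative only):
def _demo_gaf_columns():
    row = "UniProtKB\tP00533\tEGFR\t\tGO:0004713\tPMID:1\tIDA\t\tF".split("\t")
    assert (row[1], row[4], row[6], row[8]) == ("P00533", "GO:0004713", "IDA", "F")
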
def parser(databases_directory, importDirectory, download=True):
    config = builder_utils.get_config(config_name="jensenlabConfig.yml", data_type='databases')
    outputfileName = "Publications.tsv"
    url = config['db_url']
    ifile = config['organisms_file']
    organisms = str(config['organisms'])
    directory = os.path.join(databases_directory, "Jensenlab")
    builder_utils.checkDirectory(os.path.join(directory, "textmining"))

    if download:
        builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "textmining"))

    ifile = os.path.join(directory, os.path.join("textmining", ifile))
    valid_pubs = read_valid_pubs(organisms, ifile)
    entities, header = parse_PMC_list(config, os.path.join(directory, "textmining"), download=download, valid_pubs=valid_pubs)
    num_entities = len(entities)
    outputfile = os.path.join(importDirectory, outputfileName)
    builder_utils.write_entities(entities, header, outputfile)
    entities = None

    for qtype in config['db_mentions_types']:
        parse_mentions(config, directory, qtype, importDirectory, download)

    builder_utils.remove_directory(os.path.join(directory, "textmining"))

    return (num_entities, outputfile)

def parseUniProtPeptides(config, databases_directory, download=True):
    file_urls = config['uniprot_peptides_files']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    for url in file_urls:
        fileName = os.path.join(directory, url.split('/')[-1])
        if download:
            builder_utils.downloadDB(url, directory)
        first = True
        with open(fileName, 'r', encoding='utf-8') as f:
            for line in f:
                if first:
                    first = False
                    continue
                data = line.rstrip("\r\n").split("\t")
                peptide = data[0]
                accs = data[6].split(",")
                is_unique = True
                if len(accs) > 1:
                    is_unique = False
                entities.add((peptide, "Peptide", "tryptic peptide", is_unique))
                for protein in accs:
                    relationships[("Peptide", 'belongs_to_protein')].add((peptide, protein, "BELONGS_TO_PROTEIN", "UniProt"))

    return entities, relationships

def parser(databases_directory, download=True): config = builder_utils.get_config(config_name="smpdbConfig.yml", data_type='databases') urls = config['smpdb_urls'] entities = set() relationships = defaultdict(set) entities_header = config['pathway_header'] relationships_headers = config['relationships_header'] directory = os.path.join(databases_directory, "SMPDB") builder_utils.checkDirectory(directory) for dataset in urls: url = urls[dataset] file_name = url.split('/')[-1] if download: builder_utils.downloadDB(url, directory) zipped_file = os.path.join(directory, file_name) with zipfile.ZipFile(zipped_file) as rf: if dataset == "pathway": entities = parsePathways(config, rf) elif dataset == "protein": relationships.update(parsePathwayProteinRelationships(rf)) elif dataset == "metabolite": relationships.update( parsePathwayMetaboliteDrugRelationships(rf)) builder_utils.remove_directory(directory) return entities, relationships, entities_header, relationships_headers
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    config = builder_utils.get_config(config_name="disgenetConfig.yml", data_type='databases')

    files = config['disgenet_files']
    mapping_files = config['disgenet_mapping_files']
    url = config['disgenet_url']
    directory = os.path.join(databases_directory, "disgenet")
    builder_utils.checkDirectory(directory)
    header = config['disgenet_header']
    output_file = 'disgenet_associated_with.tsv'

    if download:
        for f in files:
            builder_utils.downloadDB(url + files[f], directory)
        for f in mapping_files:
            builder_utils.downloadDB(url + mapping_files[f], directory)

    proteinMapping = readDisGeNetProteinMapping(config, directory)
    diseaseMapping = readDisGeNetDiseaseMapping(config, directory)
    for f in files:
        first = True
        associations = gzip.open(os.path.join(directory, files[f]), 'r')
        dtype, atype = f.split('_')
        if dtype == 'gene':
            idType = "Protein"
            scorePos = 9
        if dtype == 'variant':
            idType = "Transcript"
            scorePos = 5
        for line in associations:
            if first:
                first = False
                continue
            try:
                data = line.decode('utf-8').rstrip("\r\n").split("\t")
                geneId = str(int(data[0]))
                #disease_specificity_index = data[2]
                #disease_pleiotropy_index = data[3]
                diseaseId = data[4]
                score = float(data[scorePos])
                pmids = data[13]
                source = data[-1]
                if geneId in proteinMapping:
                    for identifier in proteinMapping[geneId]:
                        if diseaseId in diseaseMapping:
                            for code in diseaseMapping[diseaseId]:
                                code = "DOID:" + code
                                relationships[idType].add((identifier, code, "ASSOCIATED_WITH", score, atype, "DisGeNet: " + source, pmids))
            except UnicodeDecodeError:
                continue
        associations.close()

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)

def parse_mentions(config, directory, qtype, importDirectory, download=True):
    url = config['db_url']
    ifile = config['db_mentions_files'][qtype]
    if qtype == "9606":
        mapping = mp.getSTRINGMapping(download=download)
    elif qtype == "-1":
        mapping = mp.getSTRINGMapping(source=config['db_sources']["Drug"], download=download, db="STITCH")

    filters = []
    if qtype in config['db_mentions_filters']:
        filters = config['db_mentions_filters'][qtype]
    entity1, entity2 = config['db_mentions_types'][qtype]
    outputfile = os.path.join(importDirectory, entity1 + "_" + entity2 + "_mentioned_in_publication.tsv")

    if download:
        builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "textmining"))
    ifile = os.path.join(directory, os.path.join("textmining", ifile))
    with open(outputfile, 'w') as f:
        f.write("START_ID\tEND_ID\tTYPE\n")
        with open(ifile, 'r') as idbf:
            for line in idbf:
                data = line.rstrip("\r\n").split('\t')
                id1 = data[0]
                pubmedids = data[1].split(" ")
                ident = []
                if qtype == "9606":
                    id1 = "9606." + id1
                    if id1 in mapping:
                        ident = mapping[id1]
                elif qtype == "-1":
                    if id1 in mapping:
                        ident = mapping[id1]
                elif qtype == "-26":
                    if id1.startswith("DOID"):
                        ident = [id1]
                else:
                    ident = [id1]

                for i in ident:
                    if i not in filters:
                        aux = pd.DataFrame(data={"Pubmedids": list(pubmedids)})
                        aux["START_ID"] = i
                        aux["TYPE"] = "MENTIONED_IN_PUBLICATION"
                        aux.to_csv(path_or_buf=f, sep='\t', header=False, index=False, quotechar='"', line_terminator='\n', escapechar='\\')
                        aux = None

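# Note on parse_mentions: pandas 1.5 renamed DataFrame.to_csv's 'line_terminator'
# keyword to 'lineterminator' and pandas 2.0 removed the old spelling, so the call
# above ties this module to pandas < 2.0 unless the keyword is updated.
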
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)

    config = builder_utils.get_config(config_name="foodbConfig.yml", data_type='databases')

    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)

    contents = {}
    food = set()
    compounds = {}
    try:
        tf = tarfile.open(tar_fileName, 'r')
        file_content = tf.getnames()
        tar_dir = file_content[1]
        tf.extractall(path=directory)
        tf.close()
        for file_name in config['files']:
            path = os.path.join(directory, os.path.join(tar_dir, file_name))
            with open(path, 'r', encoding="utf-8", errors='replace') as f:
                if file_name == "Content.csv":
                    contents = parseContents(f)
                elif file_name == "Food.csv":
                    food, mapping = parseFood(f)
                elif file_name == "Compound.csv":
                    compounds = parseCompounds(f)
        for food_id, compound_id in contents:
            if compound_id in compounds:
                compound_code = compounds[compound_id].replace("HMDB", "HMDB00")
                relationships[("food", "has_content")].add((food_id, compound_code, "HAS_CONTENT") + contents[(food_id, compound_id)])
        mp.reset_mapping(entity="Food")
        with open(os.path.join(directory, "mapping.tsv"), 'w', encoding='utf-8') as out:
            for food_id in mapping:
                for alias in mapping[food_id]:
                    out.write(str(food_id) + "\t" + str(alias) + "\n")

        mp.mark_complete_mapping(entity="Food")
    except tarfile.ReadError as err:
        raise Exception("Error importing database FooDB.\n {}".format(err))

    builder_utils.remove_directory(directory)

    return food, relationships, entities_header, relationships_headers

def parser(databases_directory, download=True): config = builder_utils.get_config(config_name="gwasCatalogConfig.yml", data_type='databases') url = config['GWASCat_url'] entities_header = config['entities_header'] relationships_header = config['relationships_header'] entities = set() relationships = defaultdict(set) directory = os.path.join(databases_directory, "GWAScatalog") builder_utils.checkDirectory(directory) fileName = os.path.join(directory, url.split('/')[-1]) if download: builder_utils.downloadDB(url, directory) with open(fileName, 'r', encoding="utf-8") as catalog: for line in catalog: data = line.rstrip("\r\n").split("\t") if len(data) > 36: pubmedid = data[1] date = data[3] title = data[6] sample_size = data[8] replication_size = data[9] #chromosome = data[11] #position = data[12] #genes_mapped = data[14].split(" - ") snp_id = data[20].split('-')[0] freq = data[26] pval = data[27] odds_ratio = data[30] trait = data[34] exp_factor = data[35] study = data[36] entities.add((study, "GWAS_study", title, date, sample_size, replication_size, trait)) if pubmedid != "": relationships["published_in_publication"].add( (study, pubmedid, "PUBLISHED_IN", "GWAS Catalog")) if snp_id != "": relationships["variant_found_in_gwas"].add( (re.sub(r"^\W+|\W+$", "", snp_id), study, "VARIANT_FOUND_IN_GWAS", freq, pval, odds_ratio, trait, "GWAS Catalog")) if exp_factor != "": exp_factor = exp_factor.split('/')[-1].replace('_', ':') relationships["studies_trait"].add( (study, exp_factor, "STUDIES_TRAIT", "GWAS Catalog")) builder_utils.remove_directory(directory) return (entities, relationships, entities_header, relationships_header)
def parse_fasta(databases_directory, config, import_directory, download=True, updated_on=None):
    stats = set()
    url = config['uniprot_fasta_file']
    entities_output_file = os.path.join(import_directory, "Amino_acid_sequence.tsv")
    rel_output_file = os.path.join(import_directory, "Protein_HAS_Sequence_Amino_acid_sequence.tsv")
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    ff = builder_utils.read_gzipped_file(file_name)
    records = builder_utils.parse_fasta(ff)
    num_entities = 0
    with open(entities_output_file, 'w', encoding='utf-8') as ef:
        ef.write('ID\theader\tsequence\tsize\tsource\n')
        with open(rel_output_file, 'w', encoding='utf-8') as rf:
            rf.write('START_ID\tEND_ID\tTYPE\tsource\n')
            for i, batch in enumerate(builder_utils.batch_iterator(records, 1000)):
                for record in batch:
                    identifier = record.id.split('|')[1]
                    header = record.id
                    sequence = str(record.seq)
                    sequence_len = len(sequence)
                    ef.write(identifier + "\t" + header + '\t' + sequence + '\t' + str(sequence_len) + '\tUniProt\n')
                    rf.write(identifier + '\t' + identifier + '\tHAS_SEQUENCE\tUniProt\n')
                    num_entities += 1

    stats.add(builder_utils.buildStats(num_entities, "entity", "Amino_acid_sequence", "UniProt", entities_output_file, updated_on))
    stats.add(builder_utils.buildStats(num_entities, "relationships", "HAS_SEQUENCE", "UniProt", rel_output_file, updated_on))

    return stats

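# builder_utils.batch_iterator is assumed to yield fixed-size batches from the FASTA
# record stream so the two output files are written in chunks. A minimal sketch of
# such a helper (not necessarily the CKG implementation):
def _batch_iterator_sketch(iterator, batch_size):
    import itertools
    iterator = iter(iterator)
    while True:
        batch = list(itertools.islice(iterator, batch_size))
        if not batch:
            break
        yield batch
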
def getSTRINGMapping(source="BLAST_UniProt_AC", download=True, db="STRING"):
    """
    Parses the database (db) alias file and extracts mappings from its identifiers to other databases (source).

    :param str source: name of the source database for selecting aliases.
    :param bool download: whether to download the file or not.
    :param str db: name of the database to be parsed.
    :return: Dictionary of database identifiers (keys) and sets of unique aliases to other databases (values).
    """
    url = get_STRING_mapping_url(db=db)
    mapping = defaultdict(set)
    directory = os.path.join(dbconfig["databasesDir"], db)
    file_name = os.path.join(directory, url.split('/')[-1])
    builder_utils.checkDirectory(directory)
    if download:
        print("Downloading", url, directory)
        builder_utils.downloadDB(url, directory)

    f = file_name  # file_name already contains the directory; joining it again would duplicate the prefix
    first = True
    with gzip.open(f, 'rb') as mf:
        for line in mf:
            if first:
                first = False
                continue
            data = line.decode('utf-8').rstrip("\r\n").split("\t")
            if db == "STRING":
                stringID = data[0]
                alias = data[1]
                sources = data[2].split(' ')
            else:
                stringID = data[0]
                alias = data[2]
                sources = data[3].split(' ')
                # for STITCH, keep only DrugBank-style ('DB...') aliases
                if not alias.startswith('DB'):
                    continue

            if source in sources:
                mapping[stringID].add(alias)

    return mapping

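# A minimal usage sketch for getSTRINGMapping; the identifiers below are hypothetical
# and download=False assumes the alias file is already under <databasesDir>/STITCH:
#   drug_mapping = getSTRINGMapping(source="BLAST_UniProt_AC", download=False, db="STITCH")
#   drug_mapping.get('CIDm00000085', set())  # e.g. {'DB00316'}
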
def parser(databases_dir, download=True):
    config = builder_utils.get_config(config_name="goaConfig.yml", data_type='databases')
    url = config['url']
    rel_header = config['header']

    protein_mapping = mp.getMappingForEntity(entity="Protein")
    valid_proteins = list(set(protein_mapping.values()))

    directory = os.path.join(databases_dir, "GOA")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    annotations = parse_annotations_with_pandas(file_name, valid_proteins)

    builder_utils.remove_directory(directory)

    return annotations, rel_header

def parser(databases_directory, download=True): config = builder_utils.get_config(config_name="reactomeConfig.yml", data_type='databases') urls = config['reactome_urls'] entities = set() relationships = defaultdict(set) entities_header = config['pathway_header'] relationships_headers = config['relationships_header'] directory = os.path.join(databases_directory, "Reactome") builder_utils.checkDirectory(directory) metabolite_mapping = mp.getMappingForEntity("Metabolite") #drug_mapping = mp.getMappingForEntity("Drug") for dataset in urls: url = urls[dataset] file_name = url.split('/')[-1] if download: builder_utils.downloadDB(url, directory) f = os.path.join(directory, file_name) with open(f, 'r') as rf: if dataset == "pathway": entities = parsePathways(config, databases_directory, rf) elif dataset == "hierarchy": relationships[("pathway", "has_parent")] = parsePathwayHierarchy(rf) elif dataset == "protein": relationships[( dataset, "annotated_to_pathway")] = parsePathwayRelationships( config, rf) elif dataset == "metabolite": relationships[( dataset, "annotated_to_pathway")] = parsePathwayRelationships( config, rf, metabolite_mapping) #elif dataset == "drug": #relationships[(dataset, "annotated_to_pathway")] = set() builder_utils.remove_directory(directory) return entities, relationships, entities_header, relationships_headers
def parser(databases_directory, download=True): relationships = set() config = builder_utils.get_config(config_name="mutationDsConfig.yml", data_type='databases') header = config['header'] output_file_name = "mutation_curated_affects_interaction_with.tsv" regex = r":(\w+)\(" url = config['mutations_url'] directory = os.path.join(databases_directory, "MutationDs") builder_utils.checkDirectory(directory) file_name = os.path.join(directory, url.split('/')[-1]) if download: builder_utils.downloadDB(url, directory) with open(file_name, 'r') as mf: first = True for line in mf: if first: first = False continue data = line.rstrip("\r\n").split("\t") if len(data) > 12: internal_id = data[0] pvariant = '_'.join(data[1].split(':')) effect = data[5] organism = data[10] interaction = data[11] evidence = data[12] if organism.startswith("9606"): matches = re.finditer(regex, interaction) for matchNum, match in enumerate(matches, start=1): interactor = match.group(1) relationships.add((pvariant, interactor, "CURATED_AFFECTS_INTERACTION_WITH", effect, interaction, evidence, internal_id, "Intact-MutationDs")) builder_utils.remove_directory(directory) return (relationships, header, output_file_name)
def parser(databases_directory, download=True): config = builder_utils.get_config( config_name="drugGeneInteractionDBConfig.yml", data_type='databases') url = config['DGIdb_url'] header = config['header'] output_file = "dgidb_targets.tsv" drugmapping = mp.getMappingForEntity("Drug") relationships = set() directory = os.path.join(databases_directory, "DGIdb") builder_utils.checkDirectory(directory) fileName = os.path.join(directory, url.split('/')[-1]) if download: builder_utils.downloadDB(url, directory) with open(fileName, 'r', encoding='utf-8') as associations: first = True for line in associations: if first: first = False continue data = line.rstrip("\r\n").split("\t") gene = data[0] source = data[3] interactionType = data[4] if data[4] != '' else 'unknown' drug = data[8].lower() if drug == "": drug = data[7] if drug == "" and data[6] != "": drug = data[6] else: continue if gene != "": if drug in drugmapping: drug = drugmapping[drug] relationships.add((drug, gene, "TARGETS", "NA", "NA", "NA", interactionType, "DGIdb: " + source)) builder_utils.remove_directory(directory) return (relationships, header, output_file)
def extract_metabolites(config, directory, download=True):
    metabolites = defaultdict()
    prefix = "{http://www.hmdb.ca}"
    url = config['HMDB_url']
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    fields = config['HMDB_fields']
    parentFields = config['HMDB_parentFields']
    structuredFields = config['HMDB_structures']
    with zipfile.ZipFile(fileName, 'r') as zipped:
        for zfile in zipped.namelist():
            zipped.extract(member=zfile, path=directory)
            xfile = os.path.join(directory, zfile)
            with open(xfile, 'rb') as f:
                context = etree.iterparse(f, events=("end",), tag=prefix + "metabolite")
                for _, elem in context:
                    values = {child.tag.replace(prefix, ''): child.text
                              for child in elem.iterchildren()
                              if child.tag.replace(prefix, '') in fields and child.text is not None}
                    for child in elem.iterchildren():
                        if child.tag.replace(prefix, '') in parentFields:
                            label = child.tag.replace(prefix, '')
                            values[label] = set()
                            for intchild in child.iter():
                                if intchild.text is not None:
                                    text = intchild.text
                                    if text.strip() != "":
                                        if label in structuredFields:
                                            if intchild.tag.replace(prefix, '') in structuredFields[label]:
                                                if len(structuredFields[label]) > 1:
                                                    values[intchild.tag.replace(prefix, '')] = text
                                                else:
                                                    values[label].add(text)
                                        elif intchild.tag.replace(prefix, '') in fields and text:
                                            values[label].add(text)
                    if "accession" in values:
                        metabolites[values["accession"]] = values

    return metabolites

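# The zipped HMDB metabolites XML is several GB once extracted, and lxml's iterparse
# keeps completed elements in memory unless they are cleared. If memory becomes a
# concern, a common idiom (a sketch, not part of the original parser) is:
def _iterparse_and_clear(path, tag):
    from lxml import etree
    for _, elem in etree.iterparse(path, events=("end",), tag=tag):
        yield elem
        # free the element and any preceding siblings that were already processed
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
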
def parser(databases_directory, download=True): directory = os.path.join(databases_directory, "ExposomeExplorer") builder_utils.checkDirectory(directory) config = builder_utils.get_config(config_name="exposomeConfig.yml", data_type='databases') database_urls = config['database_urls'] relationships_header = config['relationships_header'] mapping = mp.getMappingForEntity("Food") correlations = {} for url in database_urls: zipped_fileName = os.path.join(directory, url.split('/')[-1]) file_name = '.'.join(url.split('/')[-1].split('.')[0:2]) if download: builder_utils.downloadDB(url, directory) with zipfile.ZipFile(zipped_fileName) as z: if file_name == "biomarkers.csv": biomarkers = parseBiomarkersFile(z, file_name) elif file_name == "correlations.csv": correlations = parseCorrelationsFile(z, file_name, biomarkers, mapping) builder_utils.remove_directory(directory) return correlations, relationships_header
def parser(databases_directory, download=True): config = builder_utils.get_config(config_name="hgncConfig.yml", data_type='databases') url = config['hgnc_url'] entities = set() directory = os.path.join(databases_directory, "HGNC") builder_utils.checkDirectory(directory) fileName = os.path.join(directory, url.split('/')[-1]) taxid = 9606 entities_header = config['header'] if download: builder_utils.downloadDB(url, directory) with open(fileName, 'r', encoding="utf-8") as df: first = True for line in df: if first: first = False continue data = line.rstrip("\r\n").split("\t") geneSymbol = data[1] geneName = data[2] status = data[5] geneFamily = data[12] synonyms = data[18:23] transcript = data[23] if status != "Approved": continue entities.add((geneSymbol, "Gene", geneName, geneFamily, ",".join(synonyms), taxid)) #relationships.add((geneSymbol, transcript, "TRANSCRIBED_INTO")) builder_utils.remove_directory(directory) return entities, entities_header
def parse_PMC_list(config, directory, download=True, valid_pubs=None):
    url = config['PMC_db_url']
    plinkout = config['pubmed_linkout']
    entities = set()
    fileName = os.path.join(directory, url.split('/')[-1])

    if download:
        builder_utils.downloadDB(url, directory)

    entities = pd.read_csv(fileName, sep=',', dtype=str, compression='gzip', low_memory=False)
    entities = entities[config['PMC_fields']]
    entities = entities[entities.iloc[:, 0].notnull()]
    entities = entities.set_index(list(entities.columns)[0])
    if valid_pubs is not None:
        valid_pubs = set(entities.index).intersection(valid_pubs)
        entities = entities.loc[list(valid_pubs)]

    entities['linkout'] = [plinkout.replace("PUBMEDID", str(int(pubmedid))) for pubmedid in list(entities.index)]
    entities.index.names = ['ID']
    entities['TYPE'] = 'Publication'
    entities = entities.reset_index()
    header = [c.replace(' ', '_').lower() if c not in ['ID', 'TYPE'] else c for c in list(entities.columns)]
    entities = entities.replace('\\\\', '', regex=True)
    entities = list(entities.itertuples(index=False))

    return entities, header

def parser(databases_directory, download=True): config = builder_utils.get_config(config_name="hpaConfig.yml", data_type='databases') url = config['hpa_pathology_url'] disease_mapping = mp.getMappingFromOntology(ontology="Disease", source=None) protein_mapping = mp.getMultipleMappingForEntity("Protein") directory = os.path.join(databases_directory, "HPA") builder_utils.checkDirectory(directory) compressed_fileName = os.path.join(directory, url.split('/')[-1]) file_name = '.'.join(url.split('/')[-1].split('.')[0:2]) relationships_headers = config['relationships_headers'] if download: builder_utils.downloadDB(url, directory) with zipfile.ZipFile(compressed_fileName) as z: if file_name == "pathology.tsv": pathology = parsePathologyFile(config, z, file_name, protein_mapping, disease_mapping) builder_utils.remove_directory(directory) return (pathology, relationships_headers)
def parser(databases_directory, download=True): config = builder_utils.get_config(config_name="pathwayCommonsConfig.yml", data_type='databases') url = config['pathwayCommons_pathways_url'] entities = set() relationships = set() directory = os.path.join(databases_directory, "PathwayCommons") builder_utils.checkDirectory(directory) fileName = url.split('/')[-1] entities_header = config['pathways_header'] relationships_header = config['relationships_header'] if download: builder_utils.downloadDB(url, directory) f = os.path.join(directory, fileName) associations = gzip.open(f, 'r') for line in associations: data = line.decode('utf-8').rstrip("\r\n").split("\t") linkout = data[0] code = data[0].split("/")[-1] ptw_dict = dict([item.split(": ")[0], ":".join(item.split(": ")[1:])] for item in data[1].split("; ")) proteins = data[2:] if "organism" in ptw_dict and ptw_dict["organism"] == "9606": name = ptw_dict["name"] source = ptw_dict["datasource"] else: continue entities.add((code, "Pathway", name, name, ptw_dict["organism"], source, linkout)) for protein in proteins: relationships.add((protein, code, "ANNOTATED_IN_PATHWAY", linkout, "PathwayCommons: "+source)) associations.close() builder_utils.remove_directory(directory) return (entities, relationships, entities_header, relationships_header)
def parser(databases_directory, download=True): config = builder_utils.get_config(config_name="refseqConfig.yml", data_type='databases') url = config['refseq_url'] ftp_dir = config['refseq_ftp_dir'] entities = defaultdict(set) relationships = defaultdict(set) directory = os.path.join(databases_directory, "RefSeq") builder_utils.checkDirectory(directory) fileName = os.path.join(directory, url.split('/')[-1]) headers = config['headerEntities'] taxid = 9606 if download: file_dir = builder_utils.list_ftp_directory(ftp_dir)[0] new_file = file_dir.split('/')[-1] + "_feature_table.txt.gz" url = ftp_dir + file_dir.split('/')[-1] + "/" + new_file builder_utils.downloadDB(url, directory) fileName = os.path.join(directory, new_file) if os.path.isfile(fileName): df = builder_utils.read_gzipped_file(fileName) first = True for line in df: if first: first = False continue data = line.rstrip("\r\n").split("\t") tclass = data[1] assembly = data[2] chrom = data[5] geneAcc = data[6] start = data[7] end = data[8] strand = data[9] protAcc = data[10] name = data[13] symbol = data[14] if protAcc != "": entities["Transcript"].add( (protAcc, "Transcript", name, tclass, assembly, taxid)) if chrom != "": entities["Chromosome"].add( (chrom, "Chromosome", chrom, taxid)) relationships["LOCATED_IN"].add( (protAcc, chrom, "LOCATED_IN", start, end, strand, "RefSeq")) if symbol != "": relationships["TRANSCRIBED_INTO"].add( (symbol, protAcc, "TRANSCRIBED_INTO", "RefSeq")) elif geneAcc != "": entities["Transcript"].add( (geneAcc, "Transcript", name, tclass, assembly, taxid)) if chrom != "": entities["Chromosome"].add( (chrom, "Chromosome", chrom, taxid)) relationships["LOCATED_IN"].add( (protAcc, chrom, "LOCATED_IN", start, end, strand, "RefSeq")) df.close() builder_utils.remove_directory(directory) return (entities, relationships, headers)
def parseUniProtVariants(config, databases_directory, import_directory, download=True, updated_on=None):
    variant_regex = r"(g\.\w+>\w)"
    chromosome_regex = r"(\w+)[p|q]"
    url = config['uniprot_variant_file']
    aa = config['amino_acids']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    vf = builder_utils.read_gzipped_file(fileName)
    din = False
    i = 0
    stats = set()
    is_first = True
    for line in vf:
        if not line.startswith('#') and not din:
            continue
        elif i <= 2:
            din = True
            i += 1
            continue
        data = line.rstrip("\r\n").split("\t")
        if len(data) > 9:
            gene = data[0]
            protein = data[1]
            pvariant = data[2]
            externalID = data[3]
            impact = data[4]
            clin_relevance = data[5]
            disease = data[6]
            chromosome_coord = data[8]
            original_source = data[13]
            ref = pvariant[2:5]
            pos = pvariant[5:-3]
            alt = pvariant[-3:]
            var_matches = re.search(variant_regex, data[9])
            chr_matches = re.search(chromosome_regex, chromosome_coord)
            if var_matches and chr_matches:
                chromosome = 'chr' + chr_matches.group(1)
                ident = chromosome + ":" + var_matches.group(1)
                altName = [externalID, clin_relevance, pvariant, chromosome_coord]
                if ref in aa and alt in aa:
                    altName.append(aa[ref] + pos + aa[alt])
                pvariant = protein + "_" + pvariant
                entities.add((ident, "Known_variant", pvariant, externalID, ",".join(altName), impact, clin_relevance, disease, original_source, "UniProt"))
                if chromosome != 'chr-':
                    relationships[('Chromosome', 'known_variant_found_in_chromosome')].add((ident, chromosome.replace('chr', ''), "VARIANT_FOUND_IN_CHROMOSOME", "UniProt"))
                if gene != "":
                    relationships[('Gene', 'known_variant_found_in_gene')].add((ident, gene, "VARIANT_FOUND_IN_GENE", "UniProt"))
                if protein != "":
                    relationships[('Protein', 'known_variant_found_in_protein')].add((ident, protein, "VARIANT_FOUND_IN_PROTEIN", "UniProt"))

                if len(entities) >= 1000:
                    stats.update(print_single_file(entities, config['variants_header'], os.path.join(import_directory, "Known_variant.tsv"), "entity", "Known_variant", is_first, updated_on))
                    stats.update(print_multiple_relationships_files(relationships, config['relationships_header'], import_directory, is_first, updated_on))
                    entities = set()
                    relationships = defaultdict(set)
                    is_first = False

    if len(entities) > 0:
        stats.update(print_single_file(entities, config['variants_header'], os.path.join(import_directory, "Known_variant.tsv"), "entity", "Known_variant", is_first, updated_on))
        stats.update(print_multiple_relationships_files(relationships, config['relationships_header'], import_directory, is_first, updated_on))

    return stats

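# parseUniProtVariants streams its output: entities and relationships are flushed to
# the import files every 1000 variants, with is_first controlling whether headers are
# written, which keeps memory bounded on the large variant file.
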
def parser(databases_directory, import_directory, download=True, updated_on=None):
    config = builder_utils.get_config(config_name="pfamConfig.yml", data_type='databases')
    entity_header = config['entity_header']
    relationship_headers = config['relationship_headers']

    directory = os.path.join(databases_directory, 'Pfam')
    builder_utils.checkDirectory(directory)
    protein_mapping = mp.getMappingForEntity(entity="Protein")
    valid_proteins = list(set(protein_mapping.values()))

    ftp_url = config['ftp_url']
    filename = config['full_uniprot_file']
    # url = config['test']

    if not os.path.exists(os.path.join(directory, filename)):
        if download:
            builder_utils.downloadDB(ftp_url + filename, directory)

    stats = set()
    if os.path.exists(os.path.join(directory, filename)):
        fhandler = builder_utils.read_gzipped_file(os.path.join(directory, filename))
        identifier = None
        description = []
        lines = []
        missed = 0
        entities = set()
        relationships = defaultdict(set)
        is_first = True
        i = 0
        read_lines = 0
        num_entities = 0
        num_relationships = {}
        try:
            for line in fhandler:
                i += 1
                read_lines += 1
                if line.startswith("# STOCKHOLM"):
                    if identifier is not None:
                        entities.add((identifier, 'Functional_region', name, " ".join(description), "PFam"))
                        if len(entities) == 100:
                            print_files(entities, entity_header, outputfile=os.path.join(import_directory, 'Functional_region.tsv'), is_first=is_first)
                            num_entities += len(entities)
                            if 'mentioned_in_publication' in relationships:
                                print_files(relationships['mentioned_in_publication'], relationship_headers['mentioned_in_publication'], outputfile=os.path.join(import_directory, 'Functional_region_mentioned_in_publication.tsv'), is_first=is_first)
                                if 'mentioned_in_publication' not in num_relationships:
                                    num_relationships['mentioned_in_publication'] = 0
                                num_relationships['mentioned_in_publication'] += len(relationships['mentioned_in_publication'])
                            if 'found_in_protein' in relationships:
                                print_files(relationships['found_in_protein'], relationship_headers['found_in_protein'], outputfile=os.path.join(import_directory, 'Functional_region_found_in_protein.tsv'), is_first=is_first, filter_for=('END_ID', valid_proteins))
                                if 'found_in_protein' not in num_relationships:
                                    num_relationships['found_in_protein'] = 0
                                num_relationships['found_in_protein'] += len(relationships['found_in_protein'])
                            entities = set()
                            relationships = defaultdict(set)
                            is_first = False
                    identifier = None
                    description = []
                elif line.startswith("#=GF"):
                    data = line.rstrip('\r\n').split()
                    if 'AC' in data:
                        identifier = data[2].split('.')[0]
                    elif 'DE' in data:
                        name = " ".join(data[2:])
                    elif 'RM' in data:
                        relationships['mentioned_in_publication'].add((identifier, data[2], "MENTIONED_IN_PUBLICATION", "PFam"))
                    elif 'CC' in data:
                        description.append(" ".join(data[2:]))
                elif not line.startswith('//'):
                    data = line.rstrip('\r\n').split()
                    protein, positions = data[0].split('/')
                    protein = protein.replace('.', '-')
                    start, end = positions.split('-')
                    sequence = data[1]
                    relationships['found_in_protein'].add((identifier, protein, "FOUND_IN_PROTEIN", start, end, sequence, "PFam"))
                    if protein.split('-')[0] != protein:
                        relationships['found_in_protein'].add((identifier, protein.split('-')[0], "FOUND_IN_PROTEIN", start, end, sequence, "PFam"))
        except UnicodeDecodeError:
            lines.append(i)
            missed += 1

        fhandler.close()

        if len(entities) > 0:
            print_files(entities, entity_header, outputfile=os.path.join(import_directory, 'Functional_region.tsv'), is_first=is_first)
            num_entities += len(entities)
            print_files(relationships['mentioned_in_publication'], relationship_headers['mentioned_in_publication'], outputfile=os.path.join(import_directory, 'Functional_region_mentioned_in_publication.tsv'), is_first=is_first)
            num_relationships['mentioned_in_publication'] += len(relationships['mentioned_in_publication'])
            print_files(relationships['found_in_protein'], relationship_headers['found_in_protein'], outputfile=os.path.join(import_directory, 'Functional_region_found_in_protein.tsv'), is_first=is_first)
            num_relationships['found_in_protein'] += len(relationships['found_in_protein'])

        stats.add(builder_utils.buildStats(num_entities, "entity", "Functional_region", "Pfam", 'Functional_region.tsv', updated_on))
        for rel in num_relationships:
            stats.add(builder_utils.buildStats(num_relationships[rel], "relationship", rel.upper(), "Pfam", 'Functional_region_' + rel + '.tsv', updated_on))

    builder_utils.remove_directory(directory)

    return stats

def parse_release_notes(databases_directory, config, download=True):
    release_notes_url = config['release_notes']
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    if download:
        builder_utils.downloadDB(release_notes_url, directory)

def parser(databases_directory, download=True):
    variant_regex = r"(\D\d+\D)$"
    regex = r"(chr\d+)\:g\.(\d+)(\w)>(\w)"
    config = builder_utils.get_config(config_name="cancerGenomeInterpreterConfig.yml", data_type='databases')
    url = config['cancerBiomarkers_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    amino_acids = config['amino_acids']
    mapping = mp.getMappingFromOntology(ontology="Disease", source=None)
    drugmapping = mp.getMappingForEntity("Drug")
    protein_mapping = mp.getMultipleMappingForEntity("Protein")
    fileName = config['cancerBiomarkers_variant_file']
    relationships = defaultdict(set)
    entities = set()
    directory = os.path.join(databases_directory, "CancerGenomeInterpreter")
    builder_utils.checkDirectory(directory)
    zipFile = os.path.join(directory, url.split('/')[-1])

    if download:
        builder_utils.downloadDB(url, directory)
    with zipfile.ZipFile(zipFile) as z:
        if fileName in z.namelist():
            with z.open(fileName, 'r') as responses:
                first = True
                for line in responses:
                    if first:
                        first = False
                        continue
                    data = line.decode('utf-8').rstrip("\r\n").split("\t")
                    gene_variant = data[0].split(':')
                    if len(gene_variant) < 2:
                        continue
                    gene = gene_variant[0]
                    variants = gene_variant[1].split(',')
                    #alterationType = data[1]
                    response = data[3]
                    drugs = data[10].split(';')
                    #status = data[11].split(';')
                    evidence = data[12]
                    tumors = data[16].split(';')
                    publications = data[17].split(';')
                    identifier = data[21]
                    prot_variant = data[22]
                    matches = re.match(regex, identifier)
                    alternative_names = [identifier]
                    if matches is not None:
                        cpra = matches.groups()
                        chromosome, position, reference, alternative = cpra
                        variant = chromosome + ":g." + position + reference + ">" + alternative
                        if prot_variant != "":
                            prot_variant = prot_variant.split(':')[1]
                            alternative_names.append(prot_variant)

                    valid_variants = []
                    if gene in protein_mapping:
                        for protein in protein_mapping[gene]:
                            for variant in variants:
                                match = re.search(variant_regex, variant)
                                if match:
                                    if variant[0] in amino_acids and variant[-1] in amino_acids:
                                        valid_variant = protein + '_p.' + amino_acids[variant[0]] + ''.join(variant[1:-1]) + amino_acids[variant[-1]]
                                        valid_variants.append(valid_variant)
                                        entities.add((valid_variant, "Clinically_relevant_variant", ",".join(alternative_names), chromosome, position, reference, alternative, "", "", "CGI"))
                                        relationships["known_variant_is_clinically_relevant"].add((valid_variant, valid_variant, "KNOWN_VARIANT_IS_CLINICALLY_RELEVANT", "CGI"))

                    for drug in drugs:
                        if drug.lower() in drugmapping:
                            drug = drugmapping[drug.lower()]
                        elif drug.split(" ")[0].lower() in drugmapping:
                            drug = drugmapping[drug.split(" ")[0].lower()]
                        elif " ".join(drug.split(" ")[1:]).lower() in drugmapping:
                            drug = drugmapping[" ".join(drug.split(" ")[1:]).lower()]
                        relationships["targets"].add((drug, gene, "CURATED_TARGETS", evidence, response, ",".join(tumors), "curated", "CGI"))
                        for valid_variant in valid_variants:
                            relationships["targets_clinically_relevant_variant"].add((drug, valid_variant, "TARGETS_CLINICALLY_RELEVANT_VARIANT", evidence, response, "".join(tumors), "curated", "CGI"))

                    for tumor in tumors:
                        if tumor.lower() in mapping:
                            tumor = mapping[tumor.lower()]
                        for valid_variant in valid_variants:
                            relationships["associated_with"].add((valid_variant, tumor, "ASSOCIATED_WITH", "curated", "curated", "CGI", len(publications)))

    builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_headers)

def parse_ontology(ontology, download=True):
    """
    Parses and extracts data from a given ontology file(s), and returns a tuple with multiple dictionaries.

    :param str ontology: acronym of the ontology to be parsed (e.g. Disease Ontology: 'DO').
    :param bool download: whether the ontology is to be downloaded.
    :return: Tuple with three nested dictionaries: terms, relationships between terms, and definitions of the terms.\
        For more information on the returned dictionaries, see the documentation for any ontology parser.
    """
    directory = ckg_config['ontologies_directory']
    ontology_directory = os.path.join(directory, ontology)
    builder_utils.checkDirectory(ontology_directory)
    ontology_files = []
    ontologyData = None
    mappings = None
    extra_entities = set()
    extra_rels = set()
    if ontology in config["ontology_types"]:
        otype = config["ontology_types"][ontology]
        if 'urls' in config:
            if otype in config['urls']:
                urls = config['urls'][otype]
                for url in urls:
                    f = url.split('/')[-1].replace('?', '_').replace('=', '_')
                    ontology_files.append(os.path.join(ontology_directory, f))
                    if download:
                        builder_utils.downloadDB(url, directory=ontology_directory, file_name=f)
            elif otype in config["files"]:
                ofiles = config["files"][otype]
                for f in ofiles:
                    if '*' not in f:
                        if os.path.isfile(os.path.join(directory, f)):
                            ontology_files.append(os.path.join(directory, f))
                        else:
                            logger.error("Error: file {} is not in the directory {}".format(f, directory))
                    else:
                        ontology_files.append(os.path.join(directory, f))

        filters = None
        if otype in config["parser_filters"]:
            filters = config["parser_filters"][otype]
        extra_entities, extra_rels = get_extra_entities_rels(ontology_directory)
    if len(ontology_files) > 0:
        if ontology == "SNOMED-CT":
            ontologyData = snomedParser.parser(ontology_files, filters)
        elif ontology == "ICD":
            ontologyData = icdParser.parser(ontology_files)
        elif ontology == 'EFO':
            ontologyData, mappings = efoParser.parser(ontology_files)
        else:
            ontologyData = oboParser.parser(ontology, ontology_files)
            mp.buildMappingFromOBO(ontology_files[0], ontology, ontology_directory)
    else:
        if ontology == "SNOMED-CT":
            logger.info("WARNING: SNOMED-CT terminology needs to be downloaded manually since it requires UMLS License. More information available here: https://www.nlm.nih.gov/databases/umls.html")
        else:
            logger.info("WARNING: Ontology {} could not be downloaded. Check that the link in the configuration works.".format(ontology))

    return ontologyData, mappings, extra_entities, extra_rels

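# A minimal usage sketch; the acronym must be a key of 'ontology_types' in the
# ontologies configuration ('DO' for Disease Ontology is shown as an assumed example):
#   ontology_data, mappings, extra_entities, extra_rels = parse_ontology('DO', download=False)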