import os
import re
import tarfile
from collections import defaultdict

# Import paths assumed from the CKG project layout; adjust if this module lives
# elsewhere. Helpers such as parseContents, parseFood, parseCompounds,
# format_output, print_single_file and print_multiple_relationships_files are
# assumed to be defined elsewhere in this package.
from ckg.graphdb_builder import builder_utils, mapping as mp


def parser(databases_directory, download=True):
    """Parse the FooDB dump into food entities and food-compound relationships."""
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="foodbConfig.yml", data_type='databases')
    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)

    contents = {}
    food = set()
    compounds = {}
    mapping = {}  # initialized here so a missing Food.csv cannot raise NameError below
    try:
        tf = tarfile.open(tar_fileName, 'r')
        file_content = tf.getnames()
        tar_dir = file_content[1]
        tf.extractall(path=directory)
        tf.close()
        for file_name in config['files']:
            path = os.path.join(directory, os.path.join(tar_dir, file_name))
            with open(path, 'r', encoding="utf-8", errors='replace') as f:
                if file_name == "Content.csv":
                    contents = parseContents(f)
                elif file_name == "Food.csv":
                    food, mapping = parseFood(f)
                elif file_name == "Compound.csv":
                    compounds = parseCompounds(f)
        # Link each food to its compounds, rewriting old 5-digit HMDB
        # accessions ("HMDB12345") to the 7-digit form ("HMDB0012345").
        for food_id, compound_id in contents:
            if compound_id in compounds:
                compound_code = compounds[compound_id].replace("HMDB", "HMDB00")
                relationships[("food", "has_content")].add(
                    (food_id, compound_code, "HAS_CONTENT") + contents[(food_id, compound_id)])
        # Persist the food identifier-to-alias mapping.
        mp.reset_mapping(entity="Food")
        with open(os.path.join(directory, "mapping.tsv"), 'w', encoding='utf-8') as out:
            for food_id in mapping:
                for alias in mapping[food_id]:
                    out.write(str(food_id) + "\t" + str(alias) + "\n")
        mp.mark_complete_mapping(entity="Food")
    except tarfile.ReadError as err:
        raise Exception("Error importing database FooDB.\n {}".format(err))

    builder_utils.remove_directory(directory)

    return food, relationships, entities_header, relationships_headers
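# Example usage (a sketch, not part of the original module). Assumes the FooDB
# archive already sits under <databases_directory>/FooDB so the download can be
# skipped; the "/data/databases" path is illustrative.
#
#     food, relationships, entities_header, relationships_headers = parser(
#         "/data/databases", download=False)
#     print(len(food), "food entities;",
#           len(relationships[("food", "has_content")]), "HAS_CONTENT rows")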
def build_DrugBank_dictionary(config, directory, drugs):
    """Write a DrugBank id-to-name/synonym dictionary used for entity mapping."""
    filename = config['DrugBank_dictionary_file']
    outputfile = os.path.join(directory, filename)
    mp.reset_mapping(entity="Drug")
    with open(outputfile, 'w', encoding='utf-8') as out:
        for did in drugs:
            if "name" in drugs[did]:
                name = drugs[did]["name"]
                out.write(did + "\t" + name.lower() + "\n")
            if "synonyms" in drugs[did]:
                for synonym in drugs[did]["synonyms"]:
                    out.write(did + "\t" + synonym.lower() + "\n")
    mp.mark_complete_mapping(entity="Drug")
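# Example usage (a sketch): `drugs` maps DrugBank accessions to dicts with
# optional "name" and "synonyms" keys, as produced by the DrugBank parser.
# The config value and path below are illustrative assumptions.
#
#     config = {"DrugBank_dictionary_file": "drugbank_dictionary.tsv"}
#     drugs = {"DB00001": {"name": "Lepirudin",
#                          "synonyms": ["Hirudin variant-1"]}}
#     build_DrugBank_dictionary(config, "/data/databases/DrugBank", drugs)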
def build_HMDB_dictionary(directory, metabolites):
    """Write a metabolite id-to-name/synonym/ChEBI dictionary used for entity mapping."""
    filename = "mapping.tsv"
    outputfile = os.path.join(directory, filename)
    mp.reset_mapping(entity="Metabolite")
    with open(outputfile, 'w', encoding='utf-8') as out:
        for metid in metabolites:
            if "name" in metabolites[metid]:
                name = metabolites[metid]["name"]
                out.write(metid + "\t" + name.lower() + "\n")
            if "synonyms" in metabolites[metid]:
                for synonym in metabolites[metid]["synonyms"]:
                    out.write(metid + "\t" + synonym.lower() + "\n")
            if "chebi_id" in metabolites[metid]:
                chebi_id = metabolites[metid]["chebi_id"]
                out.write(metid + "\t" + chebi_id + "\n")
    mp.mark_complete_mapping(entity="Metabolite")
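# Example usage (a sketch; the metabolite entry is illustrative):
#
#     metabolites = {"HMDB0000001": {"name": "1-Methylhistidine",
#                                    "synonyms": ["1-MHis"],
#                                    "chebi_id": "50599"}}
#     build_HMDB_dictionary("/data/databases/HMDB", metabolites)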
def parsePathways(config, databases_directory, fhandler):
    """Parse Reactome pathway records into entity tuples and write the id-name mapping."""
    entities = set()
    organisms = config['organisms']
    url = config['linkout_url']
    directory = os.path.join(databases_directory, "Reactome")
    mapping_file = os.path.join(directory, "mapping.tsv")

    mp.reset_mapping(entity="Pathway")
    with open(mapping_file, 'w', encoding='utf-8') as mf:
        for line in fhandler:
            data = line.rstrip("\r\n").split("\t")
            identifier = data[0]
            name = data[1]
            organism = data[2]
            linkout = url.replace("PATHWAY", identifier)
            # Keep only pathways from the configured organisms.
            if organism in organisms:
                organism = organisms[organism]
                entities.add((identifier, "Pathway", name, name, organism, linkout, "Reactome"))
                mf.write(identifier + "\t" + name + "\n")
    mp.mark_complete_mapping(entity="Pathway")

    return entities
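# Example usage (a sketch): `fhandler` is any iterable of tab-separated lines
# with (identifier, name, organism), the shape of Reactome's pathway list.
# The config values are illustrative assumptions, and the Reactome directory
# must already exist since the function writes mapping.tsv into it.
#
#     import io
#     config = {"organisms": {"Homo sapiens": "9606"},
#               "linkout_url": "https://reactome.org/content/detail/PATHWAY"}
#     fh = io.StringIO("R-HSA-164843\t2-LTR circle formation\tHomo sapiens\n")
#     entities = parsePathways(config, "/data/databases", fh)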
def parse_idmapping_file(databases_directory, config, import_directory, download=True, updated_on=None):
    """Stream UniProt's idmapping dump and write protein entities, structures and relationships."""
    regex_transcript = r"(-\d+$)"
    taxids = config['species']
    proteins_output_file = os.path.join(import_directory, "Protein.tsv")
    pdbs_output_file = os.path.join(import_directory, "Protein_structures.tsv")
    proteins = {}
    url = config['uniprot_id_url']
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    mapping_file = os.path.join(directory, 'mapping.tsv')
    if download:
        builder_utils.downloadDB(url, directory)

    fields = config['uniprot_ids']
    synonymFields = config['uniprot_synonyms']
    identifier = None
    transcripts = set()
    skip = set()
    is_first = True
    uf = builder_utils.read_gzipped_file(file_name)
    aux = {}
    stats = set()
    mp.reset_mapping(entity="Protein")
    with open(mapping_file, 'w', encoding='utf-8') as out:
        for line in uf:
            data = line.rstrip("\r\n").split("\t")
            iid = data[0]
            # UniParc-only entries are not imported; isoform ids are still tracked.
            if 'UniParc' in data:
                if re.search(regex_transcript, iid):
                    transcripts.add(iid)
                continue
            field = data[1]
            alias = data[2]
            if iid not in skip:
                skip = set()
                if re.search(regex_transcript, iid):
                    transcripts.add(iid)
                # A new accession (not an isoform of the current one) closes the
                # previous protein record and flushes it if complete.
                if iid not in aux and iid.split('-')[0] not in aux:
                    if identifier is not None:
                        prot_info["synonyms"] = synonyms
                        aux[identifier].update(prot_info)
                        if "UniProtKB-ID" in aux[identifier] and "NCBI_TaxID" in aux[identifier]:
                            for synonym in synonyms:
                                out.write(identifier + "\t" + synonym + "\n")
                            proteins[identifier] = aux[identifier]
                            for t in transcripts:
                                proteins[t] = aux[identifier]
                                for synonym in synonyms:
                                    out.write(t + "\t" + synonym + "\n")
                            if len(transcripts) > 0:
                                proteins[identifier].update({"isoforms": transcripts})
                            transcripts = set()
                        aux.pop(identifier, None)
                        # Write out in batches of 1000 proteins to bound memory use.
                        if len(proteins) >= 1000:
                            entities, relationships, pdb_entities = format_output(proteins)
                            stats.update(print_single_file(entities, config['proteins_header'],
                                                           proteins_output_file, "entity", "Protein",
                                                           is_first, updated_on))
                            stats.update(print_single_file(pdb_entities, config['pdb_header'],
                                                           pdbs_output_file, "entity", "Protein_structure",
                                                           is_first, updated_on))
                            stats.update(print_multiple_relationships_files(relationships,
                                                                            config['relationships_header'],
                                                                            import_directory,
                                                                            is_first, updated_on))
                            is_first = False
                            proteins = {}
                    identifier = iid
                    transcripts = set()
                    aux[identifier] = {}
                    prot_info = {}
                    synonyms = []
                if field in fields:
                    # Drop proteins from species outside the configured taxonomy ids.
                    if field == 'NCBI_TaxID':
                        if int(alias) not in taxids:
                            skip.add(identifier)
                            aux.pop(identifier, None)
                            identifier = None
                    if field in synonymFields:
                        synonyms.append(alias)
                    prot_info.setdefault(field, [])
                    prot_info[field].append(alias)
    uf.close()

    # Flush whatever remains after the last line of the dump.
    if len(proteins) > 0:
        entities, relationships, pdb_entities = format_output(proteins)
        stats.update(print_single_file(entities, config['proteins_header'], proteins_output_file,
                                       "entity", "Protein", is_first, updated_on))
        stats.update(print_single_file(pdb_entities, config['pdb_header'], pdbs_output_file,
                                       "entity", "Protein_structure", is_first, updated_on))
        stats.update(print_multiple_relationships_files(relationships, config['relationships_header'],
                                                        import_directory, is_first, updated_on))

    mp.mark_complete_mapping(entity="Protein")

    return stats
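# Example usage (a sketch): streams the gzipped idmapping dump referenced by
# config['uniprot_id_url'] and flushes Protein / Protein_structure /
# relationship files in batches of 1000 accessions. Paths are illustrative;
# `config` is the UniProt configuration loaded elsewhere in the builder.
#
#     stats = parse_idmapping_file("/data/databases", config,
#                                  "/data/imports", download=False)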