Example #1
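Parses the FooDB database dump: downloads the release archive, extracts Content.csv, Food.csv and Compound.csv, builds food-compound HAS_CONTENT relationships, and writes out a food-name mapping file. All of the snippets on this page assume module-level imports of os, re, tarfile and collections.defaultdict, and that builder_utils and mp are helper modules provided by the surrounding graph-builder project.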
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="foodbConfig.yml",
                                      data_type='databases')

    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)

    contents = {}
    food = set()
    compounds = {}
    mapping = {}  # filled by parseFood; pre-initialized in case Food.csv is absent
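    # unpack the archive and parse each of the configured CSV files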
    try:
        with tarfile.open(tar_fileName, 'r') as tf:
            tar_dir = tf.getnames()[1]  # top-level directory inside the archive
            tf.extractall(path=directory)
        for file_name in config['files']:
            path = os.path.join(directory, os.path.join(tar_dir, file_name))
            with open(path, 'r', encoding="utf-8", errors='replace') as f:
                if file_name == "Content.csv":
                    contents = parseContents(f)
                elif file_name == "Food.csv":
                    food, mapping = parseFood(f)
                elif file_name == "Compound.csv":
                    compounds = parseCompounds(f)
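        # pair each food with its compounds; old 5-digit HMDB accessions are
        # padded to the newer 7-digit format (HMDB12345 -> HMDB0012345)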
        for (food_id, compound_id), content in contents.items():
            if compound_id in compounds:
                compound_code = compounds[compound_id].replace("HMDB", "HMDB00")
                relationships[("food", "has_content")].add(
                    (food_id, compound_code, "HAS_CONTENT") + content)
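        # rewrite the food-name mapping used later to resolve identifiers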
        mp.reset_mapping(entity="Food")
        with open(os.path.join(directory, "mapping.tsv"), 'w', encoding='utf-8') as out:
            for food_id, aliases in mapping.items():
                for alias in aliases:
                    out.write("{}\t{}\n".format(food_id, alias))

        mp.mark_complete_mapping(entity="Food")
    except tarfile.ReadError as err:
        raise Exception("Error importing database FooDB.\n {}".format(err))

    builder_utils.remove_directory(directory)

    return food, relationships, entities_header, relationships_headers
Example #2
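Builds a DrugBank name dictionary: one tab-separated identifier/alias line per drug name and synonym, lower-cased, bracketed by mp.reset_mapping and mp.mark_complete_mapping.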
def build_DrugBank_dictionary(config, directory, drugs):
    filename = config['DrugBank_dictionary_file']
    outputfile = os.path.join(directory, filename)
    mp.reset_mapping(entity="Drug")
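    # one "identifier<TAB>alias" line per drug name and per synonym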
    with open(outputfile, 'w', encoding='utf-8') as out:
        for did, drug in drugs.items():
            if "name" in drug:
                out.write(did + "\t" + drug["name"].lower() + "\n")
            if "synonyms" in drug:
                for synonym in drug["synonyms"]:
                    out.write(did + "\t" + synonym.lower() + "\n")

    mp.mark_complete_mapping(entity="Drug")
Example #3
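Builds the analogous HMDB dictionary for metabolites, additionally writing the ChEBI cross-reference when one is present.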
def build_HMDB_dictionary(directory, metabolites):
    filename = "mapping.tsv"
    outputfile = os.path.join(directory, filename)
    mp.reset_mapping(entity="Metabolite")
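    # names, synonyms and ChEBI ids all map back to the HMDB identifier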
    with open(outputfile, 'w', encoding='utf-8') as out:
        for metid, met in metabolites.items():
            if "name" in met:
                out.write(metid + "\t" + met["name"].lower() + "\n")
            if "synonyms" in met:
                for synonym in met["synonyms"]:
                    out.write(metid + "\t" + synonym.lower() + "\n")
            if "chebi_id" in met:
                out.write(metid + "\t" + met["chebi_id"] + "\n")

    mp.mark_complete_mapping(entity="Metabolite")
Example #4
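Parses the Reactome pathway list into Pathway entity tuples, keeping only the organisms configured for the build, and records an identifier-to-name mapping file alongside.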
def parsePathways(config, databases_directory, fhandler):
    entities = set()
    organisms = config['organisms']
    url = config['linkout_url']
    directory = os.path.join(databases_directory, "Reactome")
    mapping_file = os.path.join(directory, "mapping.tsv")

    mp.reset_mapping(entity="Pathway")
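    # each input line is: identifier <TAB> name <TAB> organism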
    with open(mapping_file, 'w', encoding='utf-8') as mf:
        for line in fhandler:
            data = line.rstrip("\r\n").split("\t")
            identifier = data[0]
            name = data[1]
            organism = data[2]
            linkout = url.replace("PATHWAY", identifier)
            if organism in organisms:
                organism = organisms[organism]
                entities.add((identifier, "Pathway", name, name, organism,
                              linkout, "Reactome"))
                mf.write(identifier + "\t" + name + "\n")

    mp.mark_complete_mapping(entity="Pathway")

    return entities
Example #5
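Streams UniProt's gzipped idmapping file: lines are grouped per accession, proteins from non-configured taxa are dropped, and parsed proteins are flushed to the import files in batches of 1000.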
def parse_idmapping_file(databases_directory,
                         config,
                         import_directory,
                         download=True,
                         updated_on=None):
    regex_transcript = r"(-\d+$)"
    taxids = config['species']

    proteins_output_file = os.path.join(import_directory, "Protein.tsv")
    pdbs_output_file = os.path.join(import_directory, "Protein_structures.tsv")
    proteins = {}

    url = config['uniprot_id_url']
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    mapping_file = os.path.join(directory, 'mapping.tsv')
    if download:
        builder_utils.downloadDB(url, directory)

    fields = config['uniprot_ids']
    synonymFields = config['uniprot_synonyms']

    identifier = None
    transcripts = set()
    skip = set()
    is_first = True
    uf = builder_utils.read_gzipped_file(file_name)
    aux = {}
    stats = set()
    mp.reset_mapping(entity="Protein")
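    # idmapping lines for one accession are consecutive; a protein is finalized
    # when the next accession appears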
    with open(mapping_file, 'w', encoding='utf-8') as out:
        for line in uf:
            data = line.rstrip("\r\n").split("\t")
            iid = data[0]
            if 'UniParc' in data:
                if re.search(regex_transcript, iid):
                    transcripts.add(iid)
                continue
            field = data[1]
            alias = data[2]

            if iid not in skip:
                skip = set()
                if re.search(regex_transcript, iid):
                    transcripts.add(iid)
                if iid not in aux and iid.split('-')[0] not in aux:
                    if identifier is not None:
                        prot_info["synonyms"] = synonyms
                        aux[identifier].update(prot_info)
                        if ("UniProtKB-ID" in aux[identifier]
                                and "NCBI_TaxID" in aux[identifier]):
                            for synonym in synonyms:
                                out.write(identifier + "\t" + synonym + "\n")
                            proteins[identifier] = aux[identifier]
                            for t in transcripts:
                                proteins[t] = aux[identifier]
                                for synonym in synonyms:
                                    out.write(t + "\t" + synonym + "\n")
                            if len(transcripts) > 0:
                                proteins[identifier].update(
                                    {"isoforms": transcripts})
                                transcripts = set()
                            aux.pop(identifier, None)
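                            # a full batch: dump entities, structures and
                            # relationships, then start an empty protein buffer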
                            if len(proteins) >= 1000:
                                entities, relationships, pdb_entities = format_output(
                                    proteins)
                                stats.update(
                                    print_single_file(
                                        entities, config['proteins_header'],
                                        proteins_output_file, "entity",
                                        "Protein", is_first, updated_on))
                                stats.update(
                                    print_single_file(pdb_entities,
                                                      config['pdb_header'],
                                                      pdbs_output_file,
                                                      "entity",
                                                      "Protein_structure",
                                                      is_first, updated_on))
                                stats.update(
                                    print_multiple_relationships_files(
                                        relationships,
                                        config['relationships_header'],
                                        import_directory, is_first,
                                        updated_on))
                                is_first = False
                                proteins = {}
                    identifier = iid
                    transcripts = set()
                    aux[identifier] = {}
                    prot_info = {}
                    synonyms = []
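                # record the requested id fields; proteins from non-configured
                # taxa are flagged in 'skip' and dropped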
                if field in fields:
                    if field == 'NCBI_TaxID':
                        if int(alias) not in taxids:
                            skip.add(identifier)
                            aux.pop(identifier, None)
                            identifier = None
                    if field in synonymFields:
                        synonyms.append(alias)
                    prot_info.setdefault(field, [])
                    prot_info[field].append(alias)

        uf.close()

    # write out any proteins left over after the last full batch
    if len(proteins) > 0:
        entities, relationships, pdb_entities = format_output(proteins)
        stats.update(
            print_single_file(entities, config['proteins_header'],
                              proteins_output_file, "entity", "Protein",
                              is_first, updated_on))
        stats.update(
            print_single_file(pdb_entities, config['pdb_header'],
                              pdbs_output_file, "entity", "Protein_structure",
                              is_first, updated_on))
        stats.update(
            print_multiple_relationships_files(relationships,
                                               config['relationships_header'],
                                               import_directory, is_first,
                                               updated_on))

    mp.mark_complete_mapping(entity="Protein")

    return stats