Example #1
def create_user_from_command_line(args, expiration):
    """
    Creates new user in the graph database and corresponding node, from a terminal window (command line), \
    and adds the new user information to the users excel and import files. Arguments as in set_arguments().

    :param args: object. Contains all the parameters neccessary to create a user ('username', 'name', 'email', \
                'secondary_email', 'phone_number' and 'affiliation').
    :type args: any object with __dict__ attribute
    :param int expiration: number of days users is given access.

    .. note:: This function can be used directly with *python create_user_from_command_line.py -u username \
                -n user_name -e email -s secondary_email -p phone_number -a affiliation* .
    """
    usersImportDirectory = ckg_config['imports_users_directory']
    usersFile = os.path.join(usersImportDirectory, uconfig['usersFile'])

    builder_utils.checkDirectory(usersImportDirectory)
    import_file = os.path.join(usersImportDirectory, uconfig['import_file'])

    data = vars(args)
    df = pd.DataFrame.from_dict(data, orient='index').T.drop('file', axis=1)
    create_user(df, import_file, expiration)

    if os.path.exists(usersFile):
        excel = pd.read_excel(usersFile)
        excel = excel.append(df, ignore_index=True)
        excel.to_excel(usersFile, index=False)
    else:
        df.to_excel(usersFile, index=False)
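
A minimal usage sketch, assuming the CKG configuration (ckg_config, uconfig) is already loaded and that set_arguments() defines the flags listed in the docstring; the Namespace field values below are illustrative only:

# Hypothetical caller: build an argparse-style Namespace with the same fields that
# set_arguments() would produce, including the 'file' attribute the function drops.
from argparse import Namespace

args = Namespace(username='jdoe', name='Jane Doe', email='jdoe@example.org',
                 secondary_email='', phone_number='555-0100',
                 affiliation='Example Lab', file=None)
create_user_from_command_line(args, expiration=365)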
Example #2
def parser(databases_directory, drug_source, download=True):
    config = builder_utils.get_config(config_name="siderConfig.yml", data_type='databases')
    url = config['SIDER_url']
    header = config['header']

    output_file = 'sider_has_side_effect.tsv'

    drugmapping = mp.getSTRINGMapping(source=drug_source, download=download, db="STITCH")
    phenotypemapping = mp.getMappingFromOntology(ontology="Phenotype", source=config['SIDER_source'])

    relationships = set()
    directory = os.path.join(databases_directory, "SIDER")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    associations = gzip.open(fileName, 'r')
    for line in associations:
        data = line.decode('utf-8').rstrip("\r\n").split("\t")
        drug = re.sub(r'CID\d', 'CIDm', data[0])
        se = data[2]
        evidence_from = str(data[3])
        #freq = data[4]
        #lower_bound = data[5]
        #upper_bound = data[6]
        if se.lower() in phenotypemapping and drug in drugmapping:
            for d in drugmapping[drug]:
                p = phenotypemapping[se.lower()]
                relationships.add((d, p, "HAS_SIDE_EFFECT", "SIDER", se, evidence_from))
    associations.close()

    return (relationships, header, output_file, drugmapping, phenotypemapping)
Example #3
def parser(databases_directory, importDirectory, download=True):
    config = builder_utils.get_config(config_name="jensenlabConfig.yml",
                                      data_type='databases')
    outputfileName = "Publications.tsv"
    url = config['db_url']
    ifile = config['organisms_file']
    organisms = str(config['organisms'])
    directory = os.path.join(databases_directory, "Jensenlab")
    builder_utils.checkDirectory(os.path.join(directory, "textmining"))

    if download:
        builder_utils.downloadDB(url.replace("FILE", ifile),
                                 os.path.join(directory, "textmining"))

    ifile = os.path.join(directory, os.path.join("textmining", ifile))
    valid_pubs = read_valid_pubs(organisms, ifile)
    entities, header = parse_PMC_list(config,
                                      os.path.join(directory, "textmining"),
                                      download=download,
                                      valid_pubs=valid_pubs)
    num_entities = len(entities)
    outputfile = os.path.join(importDirectory, outputfileName)
    builder_utils.write_entities(entities, header, outputfile)
    entities = None

    for qtype in config['db_mentions_types']:
        parse_mentions(config, directory, qtype, importDirectory, download)

    builder_utils.remove_directory(os.path.join(directory, "textmining"))

    return (num_entities, outputfile)
Example #4
def create_user_from_file(filepath, expiration):
    """
    Creates new user in the graph database and corresponding node, from an excel file. \
    Rows in the file must be users, and columns must follow set_arguments() fields.

    :param str filepath: filepath and filename containing users information.
    :param str output_file: path to output csv file.
    :param int expiration: number of days users is given access.

    .. note:: This function can be used directly with *python create_user_from_file.py -f path_to_file* .
    """
    usersImportDirectory = ckg_config['imports_users_directory']
    usersFile = os.path.join(usersImportDirectory, uconfig['usersFile'])

    builder_utils.checkDirectory(usersImportDirectory)
    import_file = os.path.join(usersImportDirectory, uconfig['import_file'])

    data = pd.read_excel(filepath).applymap(str)
    create_user(data, import_file, expiration)

    if os.path.exists(usersFile):
        excel = pd.read_excel(usersFile)
        excel = excel.append(data.drop('file', axis=1), ignore_index=True)
        excel.to_excel(usersFile, index=False)
    else:
        data.to_excel(usersFile, index=False)
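
An illustrative call, assuming 'new_users.xlsx' is a spreadsheet whose columns follow the set_arguments() fields (the file name is a placeholder):

# Hypothetical invocation: register every row of the spreadsheet as a new user
# with one year of access.
create_user_from_file('new_users.xlsx', expiration=365)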
Example #5
def parseUniProtAnnotations(config, databases_directory, download=True):
    roots = {
        'F': 'Molecular_function',
        'C': 'Cellular_component',
        'P': 'Biological_process'
    }
    url = config['uniprot_go_annotations']
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    af = builder_utils.read_gzipped_file(fileName)
    for line in af:
        if line.startswith('!'):
            continue
        data = line.rstrip("\r\n").split("\t")
        identifier = data[1]
        go = data[4]
        evidence = data[6]
        root = data[8]
        if root in roots:
            root = roots[root]
            relationships[(root, 'associated_with')].add(
                (identifier, go, "ASSOCIATED_WITH", evidence, 5, "UniProt"))

    return relationships
Example #6
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="smpdbConfig.yml",
                                      data_type='databases')
    urls = config['smpdb_urls']
    entities = set()
    relationships = defaultdict(set)
    entities_header = config['pathway_header']
    relationships_headers = config['relationships_header']
    directory = os.path.join(databases_directory, "SMPDB")
    builder_utils.checkDirectory(directory)

    for dataset in urls:
        url = urls[dataset]
        file_name = url.split('/')[-1]
        if download:
            builder_utils.downloadDB(url, directory)
        zipped_file = os.path.join(directory, file_name)
        with zipfile.ZipFile(zipped_file) as rf:
            if dataset == "pathway":
                entities = parsePathways(config, rf)
            elif dataset == "protein":
                relationships.update(parsePathwayProteinRelationships(rf))
            elif dataset == "metabolite":
                relationships.update(
                    parsePathwayMetaboliteDrugRelationships(rf))

    builder_utils.remove_directory(directory)

    return entities, relationships, entities_header, relationships_headers
Example #7
def parsePairs(config, databases_directory, qtype, mapping, download=True):
    url = config['db_url']
    ifile = config['db_files'][qtype]
    source = config['db_sources'][qtype]
    relationships = set()

    directory = os.path.join(databases_directory, "Jensenlab")
    builder_utils.checkDirectory(os.path.join(directory, "integration"))

    if download:
        builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "integration"))
    ifile = os.path.join(directory, os.path.join("integration", ifile))

    with open(ifile, 'r') as idbf:
        for line in idbf:
            data = line.rstrip("\r\n").split('\t')
            id1 = "9606."+data[0]
            id2 = data[2]
            score = float(data[4])

            if id1 in mapping:
                for ident in mapping[id1]:
                    relationships.add((ident, id2, "ASSOCIATED_WITH_INTEGRATED", source, score, "compiled"))
            else:
                continue

    return relationships
Example #8
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    config = builder_utils.get_config(config_name="disgenetConfig.yml",
                                      data_type='databases')

    files = config['disgenet_files']
    mapping_files = config['disgenet_mapping_files']
    url = config['disgenet_url']
    directory = os.path.join(databases_directory, "disgenet")
    builder_utils.checkDirectory(directory)
    header = config['disgenet_header']
    output_file = 'disgenet_associated_with.tsv'

    if download:
        for f in files:
            builder_utils.downloadDB(url + files[f], directory)
        for f in mapping_files:
            builder_utils.downloadDB(url + mapping_files[f], directory)

    proteinMapping = readDisGeNetProteinMapping(config, directory)
    diseaseMapping = readDisGeNetDiseaseMapping(config, directory)
    for f in files:
        first = True
        associations = gzip.open(os.path.join(directory, files[f]), 'r')
        dtype, atype = f.split('_')
        if dtype == 'gene':
            idType = "Protein"
            scorePos = 9
        elif dtype == 'variant':
            idType = "Transcript"
            scorePos = 5
        for line in associations:
            if first:
                first = False
                continue
            try:
                data = line.decode('utf-8').rstrip("\r\n").split("\t")
                geneId = str(int(data[0]))
                #disease_specificity_index =  data[2]
                #disease_pleiotropy_index = data[3]
                diseaseId = data[4]
                score = float(data[scorePos])
                pmids = data[13]
                source = data[-1]
                if geneId in proteinMapping:
                    for identifier in proteinMapping[geneId]:
                        if diseaseId in diseaseMapping:
                            for code in diseaseMapping[diseaseId]:
                                code = "DOID:" + code
                                relationships[idType].add(
                                    (identifier, code, "ASSOCIATED_WITH",
                                     score, atype, "DisGeNet: " + source,
                                     pmids))
            except UnicodeDecodeError:
                continue
        associations.close()

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
Example #9
def parserIndications(databases_directory, drugMapping, phenotypeMapping, download=True):
    config = builder_utils.get_config(config_name="siderConfig.yml", data_type='databases')
    url = config['SIDER_indications']
    header = config['indications_header']
    output_file = 'sider_is_indicated_for.tsv'

    relationships = set()
    directory = os.path.join(databases_directory, "SIDER")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    associations = gzip.open(fileName, 'r')
    for line in associations:
        data = line.decode('utf-8').rstrip("\r\n").split("\t")
        drug = re.sub(r'CID\d', 'CIDm', data[0])
        se = data[1]
        evidence = data[2]
        if se.lower() in phenotypeMapping and drug in drugMapping:
            for d in drugMapping[drug]:
                p = phenotypeMapping[se.lower()]
                relationships.add((d, p, "IS_INDICATED_FOR", evidence, "SIDER", se))

    associations.close()

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
Example #10
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="signorConfig.yml",
                                      data_type='databases')

    directory = os.path.join(databases_directory, "SIGNOR")
    builder_utils.checkDirectory(directory)

    url = config['url']
    modifications = config['modifications']
    amino_acids = config['amino_acids']
    accronyms = config['accronyms']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']

    entities = set()
    relationships = defaultdict(set)

    filename = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    entities, relationships = parse_substrates(filename, modifications,
                                               accronyms, amino_acids)

    return entities, relationships, entities_header, relationships_headers
Example #11
def parseUniProtPeptides(config, databases_directory, download=True):
    file_urls = config['uniprot_peptides_files']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    for url in file_urls:
        fileName = os.path.join(directory, url.split('/')[-1])
        if download:
            builder_utils.downloadDB(url, directory)
        first = True
        with open(fileName, 'r', encoding='utf-8') as f:
            for line in f:
                if first:
                    first = False
                    continue

                data = line.rstrip("\r\n").split("\t")
                peptide = data[0]
                accs = data[6].split(",")
                is_unique = True
                if len(accs) > 1:
                    is_unique = False
                entities.add(
                    (peptide, "Peptide", "tryptic peptide", is_unique))
                for protein in accs:
                    relationships[("Peptide", 'belongs_to_protein')].add(
                        (peptide, protein, "BELONGS_TO_PROTEIN", "UniProt"))
    return entities, relationships
Example #12
def save_files_in_tmp(content, dataset, prot_tool, prot_file, projectid,
                      uploaded_file):
    if dataset is not None:
        session_cookie = flask.request.cookies.get('custom-auth-session')
        temporaryDirectory = os.path.join(ckg_config['tmp_directory'],
                                          session_cookie + "upload")
        if not os.path.exists(ckg_config['tmp_directory']):
            os.makedirs(ckg_config['tmp_directory'])
        elif not os.path.exists(temporaryDirectory):
            os.makedirs(temporaryDirectory)

        directory = os.path.join(temporaryDirectory, dataset)
        if os.path.exists(directory) and uploaded_file is not None:
            if os.path.exists(os.path.join(directory, uploaded_file)):
                shutil.rmtree(directory)

        builder_utils.checkDirectory(directory)
        if dataset in [
                'proteomics', 'interactomics', 'phosphoproteomics'
        ] and prot_tool != '' and (prot_file != '' or prot_tool == 'mzTab'):
            selected_file = prot_tool.lower() + "-" + prot_file.lower()
            if selected_file in config['file_proteomics']:
                filename = config['file_proteomics'][selected_file]
            else:
                if prot_tool == 'mzTab':
                    filename = dataset + '_' + prot_tool.lower() + '.mztab'
                else:
                    filename = dataset + '_' + prot_tool.lower(
                    ) + '_' + prot_file.replace(
                        ' ', '').lower() + '.' + uploaded_file.split('.')[-1]
            directory = os.path.join(directory, prot_tool.lower())
            if os.path.exists(directory):
                if os.path.exists(os.path.join(directory, filename)):
                    os.remove(os.path.join(directory, filename))
            builder_utils.checkDirectory(directory)
        elif dataset == 'experimental_design':
            filename = config['file_design'].split(
                '_')[0] + '_' + projectid + '.' + uploaded_file.split('.')[-1]
        elif dataset == 'clinical':
            filename = config['file_clinical'].split(
                '_')[0] + '_' + projectid + '.' + uploaded_file.split('.')[-1]

        if uploaded_file is None:
            content = None
        if content is not None:
            data = builder_utils.parse_contents(content, filename)
            builder_utils.export_contents(data, directory, filename)

            uploaded = uploaded_file
            uploaded_file = None
            return uploaded, uploaded_file, '', ''
        else:
            raise PreventUpdate

    return '', None, '', ''
Example #13
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="foodbConfig.yml",
                                      data_type='databases')

    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)

    contents = {}
    food = set()
    compounds = {}
    try:
        tf = tarfile.open(tar_fileName, 'r')
        file_content = tf.getnames()
        tar_dir = file_content[1]
        tf.extractall(path=directory)
        tf.close()
        for file_name in config['files']:
            path = os.path.join(directory, os.path.join(tar_dir, file_name))
            with open(path, 'r', encoding="utf-8", errors='replace') as f:
                if file_name == "Content.csv":
                    contents = parseContents(f)
                elif file_name == "Food.csv":
                    food, mapping = parseFood(f)
                elif file_name == "Compound.csv":
                    compounds = parseCompounds(f)
        for food_id, compound_id in contents:
            if compound_id in compounds:
                compound_code = compounds[compound_id].replace(
                    "HMDB", "HMDB00")
                relationships[("food", "has_content"
                               )].add((food_id, compound_code, "HAS_CONTENT") +
                                      contents[(food_id, compound_id)])
        mp.reset_mapping(entity="Food")
        with open(os.path.join(directory, "mapping.tsv"),
                  'w',
                  encoding='utf-8') as out:
            for food_id in mapping:
                for alias in mapping[food_id]:
                    out.write(str(food_id) + "\t" + str(alias) + "\n")

        mp.mark_complete_mapping(entity="Food")
    except tarfile.ReadError as err:
        raise Exception("Error importing database FooDB.\n {}".format(err))

    builder_utils.remove_directory(directory)

    return food, relationships, entities_header, relationships_headers
Example #14
def parser(databases_directory):
    config = builder_utils.get_config(config_name="drugBankConfig.yml", data_type='databases')
    directory = os.path.join(databases_directory, "DrugBank")
    builder_utils.checkDirectory(directory)
    drugs = extract_drugs(config, directory)
    build_DrugBank_dictionary(config, directory, drugs)
    relationships = build_relationships_from_DrugBank(config, drugs)
    entities, attributes = build_drug_entity(config, drugs)
    entities_header = ['ID'] + attributes
    relationships_headers = config['relationships_headers']

    return (entities, relationships, entities_header, relationships_headers)
Example #15
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="gwasCatalogConfig.yml",
                                      data_type='databases')
    url = config['GWASCat_url']
    entities_header = config['entities_header']
    relationships_header = config['relationships_header']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "GWAScatalog")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding="utf-8") as catalog:
        for line in catalog:
            data = line.rstrip("\r\n").split("\t")
            if len(data) > 36:
                pubmedid = data[1]
                date = data[3]
                title = data[6]
                sample_size = data[8]
                replication_size = data[9]
                #chromosome = data[11]
                #position = data[12]
                #genes_mapped = data[14].split(" - ")
                snp_id = data[20].split('-')[0]
                freq = data[26]
                pval = data[27]
                odds_ratio = data[30]
                trait = data[34]
                exp_factor = data[35]
                study = data[36]

                entities.add((study, "GWAS_study", title, date, sample_size,
                              replication_size, trait))
                if pubmedid != "":
                    relationships["published_in_publication"].add(
                        (study, pubmedid, "PUBLISHED_IN", "GWAS Catalog"))
                if snp_id != "":
                    relationships["variant_found_in_gwas"].add(
                        (re.sub(r"^\W+|\W+$", "",
                                snp_id), study, "VARIANT_FOUND_IN_GWAS", freq,
                         pval, odds_ratio, trait, "GWAS Catalog"))
                if exp_factor != "":
                    exp_factor = exp_factor.split('/')[-1].replace('_', ':')
                    relationships["studies_trait"].add(
                        (study, exp_factor, "STUDIES_TRAIT", "GWAS Catalog"))

    builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_header)
Example #16
def experimentImport(importDirectory, experimentsDirectory, project):
    """
    Generates all the entities and relationships from the specified Project. Called from function experimentsImport.

    :param str importDirectory: path to the directory where all the import files are generated.
    :param str experimentsDirectory: path to the directory where all the experiments are located.
    :param str project: identifier of the project to be imported.
    """
    projectPath = os.path.join(importDirectory, project)
    builder_utils.checkDirectory(projectPath)
    projectDirectory = os.path.join(experimentsDirectory, project)
    datasets = builder_utils.listDirectoryFolders(projectDirectory)
    if 'project' in datasets:
        dataset = 'project'
        datasetPath = os.path.join(projectPath, dataset)
        builder_utils.checkDirectory(datasetPath)
        eh.generate_dataset_imports(project, dataset, datasetPath)
        datasets.remove(dataset)
        if 'experimental_design' in datasets:
            dataset = 'experimental_design'
            datasetPath = os.path.join(projectPath, dataset)
            builder_utils.checkDirectory(datasetPath)
            eh.generate_dataset_imports(project, dataset, datasetPath)
            datasets.remove(dataset)
            for dataset in datasets:
                datasetPath = os.path.join(projectPath, dataset)
                builder_utils.checkDirectory(datasetPath)
                eh.generate_dataset_imports(project, dataset, datasetPath)
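
A sketch of a direct call for a single project (normally experimentsImport() drives this in parallel); the directory paths and project identifier below are placeholders and would come from ckg_config in practice:

# Generate the import files for one project only.
experimentImport(importDirectory='/data/imports/experiments',
                 experimentsDirectory='/data/experiments',
                 project='P0000001')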
Example #17
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="hmdbConfig.yml", data_type='databases')
    directory = os.path.join(databases_directory, "HMDB")
    builder_utils.checkDirectory(directory)
    metabolites = extract_metabolites(config, directory, download)
    mapping = mp.getMappingFromOntology(ontology="Disease", source=config['HMDB_DO_source'])
    mapping.update(mp.getMappingFromOntology(ontology="Tissue", source=None))
    entities, attributes = build_metabolite_entity(config, directory, metabolites)
    relationships = build_relationships_from_HMDB(config, metabolites, mapping)
    entities_header = ['ID'] + attributes
    relationships_header = config['relationships_header']

    #builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_header)
Example #18
def parse_fasta(databases_directory,
                config,
                import_directory,
                download=True,
                updated_on=None):
    stats = set()
    url = config['uniprot_fasta_file']
    entities_output_file = os.path.join(import_directory,
                                        "Amino_acid_sequence.tsv")
    rel_output_file = os.path.join(
        import_directory, "Protein_HAS_Sequence_Amino_acid_sequence.tsv")

    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])

    if download:
        builder_utils.downloadDB(url, directory)

    ff = builder_utils.read_gzipped_file(file_name)
    records = builder_utils.parse_fasta(ff)
    num_entities = 0
    with open(entities_output_file, 'w', encoding='utf-8') as ef:
        ef.write('ID\theader\tsequence\tsize\tsource\n')
        with open(rel_output_file, 'w', encoding='utf-8') as rf:
            rf.write('START_ID\tEND_ID\tTYPE\tsource\n')
            for i, batch in enumerate(
                    builder_utils.batch_iterator(records, 1000)):
                for record in batch:
                    identifier = record.id.split('|')[1]
                    header = record.id
                    sequence = str(record.seq)
                    sequence_len = len(str(sequence))
                    ef.write(identifier + "\t" + header + '\t' + sequence +
                             '\t' + str(sequence_len) + '\tUniProt\n')
                    rf.write(identifier + '\t' + identifier +
                             '\tHAS_SEQUENCE\tUniProt\n')
                    num_entities += 1

    stats.add(
        builder_utils.buildStats(num_entities, "entity", "Amino_acid_sequence",
                                 "UniProt", entities_output_file, updated_on))
    stats.add(
        builder_utils.buildStats(num_entities, "relationships", "HAS_SEQUENCE",
                                 "UniProt", rel_output_file, updated_on))

    return stats
Example #19
def ontologiesImport(ontologies=None, download=True, import_type="partial"):
    """
    Generates all the entities and relationships from the provided ontologies. If the ontologies list is\
    not provided, then all the ontologies listed in the configuration will be imported (full_import). \
    This function also updates the stats object with numbers from the imported ontologies.

    :param list ontologies: a list of ontology names to be imported.
    :param bool download: whether the ontology source files should be downloaded.
    :param str import_type: type of import ('full' or 'partial').
    """
    ontologiesImportDirectory = ckg_config['imports_ontologies_directory']
    builder_utils.checkDirectory(ontologiesImportDirectory)
    stats = oh.generate_graphFiles(ontologiesImportDirectory, ontologies,
                                   download)
    statsDf = generateStatsDataFrame(stats)
    setupStats(import_type=import_type)
    writeStats(statsDf, import_type)
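
Two hedged usage sketches, assuming 'Disease' and 'Tissue' are among the ontologies listed in the configuration:

# Partial update of two ontologies, reusing files already on disk.
ontologiesImport(ontologies=['Disease', 'Tissue'], download=False, import_type='partial')

# Full import of every configured ontology.
ontologiesImport(ontologies=None, download=True, import_type='full')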
Example #20
def archiveImportDirectory(archive_type="full"):
    """
    This function creates the compressed backup imports folder with either the whole folder \
    (full update) or with only the files uploaded (partial update). The folder or files are \
    compressed into a gzipped tarball file and stored in the archive/ folder defined in the \
    configuration.

    :param str archive_type: whether it is a full update or a partial update.
    """
    dest_folder = ckg_config["archive_directory"]
    builder_utils.checkDirectory(dest_folder)
    folder_to_backup = ckg_config["imports_directory"]
    date, time = builder_utils.getCurrentTime()
    file_name = "{}_{}_{}".format(archive_type, date.replace('-', ''),
                                  time.replace(':', ''))
    logger.info("Archiving {} to file: {}".format(folder_to_backup, file_name))
    builder_utils.compress_directory(folder_to_backup, dest_folder, file_name)
    logger.info("New backup created: {}".format(file_name))
Example #21
def experimentsImport(projects=None, n_jobs=1, import_type="partial"):
    """
    Generates all the entities and relationships from the specified projects. If the projects list is\
    not provided, then all the projects in the experiments directory will be imported (full_import). \
    Calls function experimentImport.

    :param list projects: list of project identifiers to be imported.
    :param int n_jobs: number of jobs to run in parallel. 1 by default when updating one project.
    :param str import_type: type of import ('full' or 'partial').
    """
    experiments_import_directory = ckg_config['imports_experiments_directory']
    builder_utils.checkDirectory(experiments_import_directory)
    experiments_directory = ckg_config['experiments_directory']
    if projects is None:
        projects = builder_utils.listDirectoryFolders(experiments_directory)
    if len(projects) > 0:
        Parallel(n_jobs=n_jobs)(delayed(experimentImport)(
            experiments_import_directory, experiments_directory, project)
                                for project in projects)
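
A usage sketch, assuming the project folders exist under ckg_config['experiments_directory'] (the identifiers are placeholders):

# Re-import two projects, two jobs in parallel; projects=None would import every
# folder found in the experiments directory.
experimentsImport(projects=['P0000001', 'P0000002'], n_jobs=2, import_type='partial')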
Example #22
def getSTRINGMapping(source="BLAST_UniProt_AC", download=True, db="STRING"):
    """
    Parses database (db) and extracts relationships between identifiers to order databases (source).

    :param str url: link to download database raw file.
    :param str source: name of the source database for selecting aliases.
    :param bool download: wether to download the file or not.
    :param str db: name of the database to be parsed.
    :return: Dictionary of database identifers (keys) and set of unique aliases to other databases (values).
    """
    url = get_STRING_mapping_url(db=db)
    mapping = defaultdict(set)
    directory = os.path.join(dbconfig["databasesDir"], db)
    file_name = os.path.join(directory, url.split('/')[-1])
    builder_utils.checkDirectory(directory)
    if download:
        print("Downloading", url, directory)
        builder_utils.downloadDB(url, directory)

    f = file_name
    first = True
    with gzip.open(f, 'rb') as mf:
        for line in mf:
            if first:
                first = False
                continue
            data = line.decode('utf-8').rstrip("\r\n").split("\t")
            if db == "STRING":
                stringID = data[0]
                alias = data[1]
                sources = data[2].split(' ')
            else:
                stringID = data[0]
                alias = data[2]
                sources = data[3].split(' ')
                if not alias.startswith('DB'):
                    continue

            if source in sources:
                mapping[stringID].add(alias)

    return mapping
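
A sketch of how the returned mapping might be inspected, assuming the STRING aliases file has already been downloaded (download=False):

# Map STRING protein identifiers to UniProt accessions and print a few entries.
mapping = getSTRINGMapping(source='BLAST_UniProt_AC', download=False, db='STRING')
for string_id, aliases in list(mapping.items())[:5]:
    print(string_id, sorted(aliases))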
Example #23
def databasesImport(databases=None,
                    n_jobs=1,
                    download=True,
                    import_type="partial"):
    """
    Generates all the entities and relationships from the provided databases. If the databases list is\
    not provided, then all the databases listed in the configuration will be imported (full_import).\
    This function also updates the stats object with numbers from the imported databases.

    :param list databases: a list of database names to be imported.
    :param int n_jobs: number of jobs to run in parallel. 1 by default when updating one database.
    :param bool download: whether the database source files should be downloaded.
    :param str import_type: type of import ('full' or 'partial').
    """
    databasesImportDirectory = ckg_config['imports_databases_directory']
    builder_utils.checkDirectory(databasesImportDirectory)
    stats = dh.generateGraphFiles(databasesImportDirectory, databases,
                                  download, n_jobs)
    statsDf = generateStatsDataFrame(stats)
    setupStats(import_type=import_type)
    writeStats(statsDf, import_type)
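
A usage sketch, assuming 'UniProt' and 'SIDER' appear in the databases configuration (the names are illustrative):

# Partial update of two databases, running two parsers at a time; databases=None
# triggers a full import of everything listed in the configuration.
databasesImport(databases=['UniProt', 'SIDER'], n_jobs=2, download=True, import_type='partial')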
Example #24
def parser(databases_dir, download=True):
    config = builder_utils.get_config(config_name="goaConfig.yml",
                                      data_type='databases')
    url = config['url']
    rel_header = config['header']

    protein_mapping = mp.getMappingForEntity(entity="Protein")
    valid_proteins = list(set(protein_mapping.values()))

    directory = os.path.join(databases_dir, "GOA")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    annotations = parse_annotations_with_pandas(file_name, valid_proteins)

    builder_utils.remove_directory(directory)

    return annotations, rel_header
Example #25
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="reactomeConfig.yml",
                                      data_type='databases')
    urls = config['reactome_urls']
    entities = set()
    relationships = defaultdict(set)
    entities_header = config['pathway_header']
    relationships_headers = config['relationships_header']
    directory = os.path.join(databases_directory, "Reactome")
    builder_utils.checkDirectory(directory)
    metabolite_mapping = mp.getMappingForEntity("Metabolite")
    #drug_mapping = mp.getMappingForEntity("Drug")

    for dataset in urls:
        url = urls[dataset]
        file_name = url.split('/')[-1]
        if download:
            builder_utils.downloadDB(url, directory)
        f = os.path.join(directory, file_name)
        with open(f, 'r') as rf:
            if dataset == "pathway":
                entities = parsePathways(config, databases_directory, rf)
            elif dataset == "hierarchy":
                relationships[("pathway",
                               "has_parent")] = parsePathwayHierarchy(rf)
            elif dataset == "protein":
                relationships[(
                    dataset,
                    "annotated_to_pathway")] = parsePathwayRelationships(
                        config, rf)
            elif dataset == "metabolite":
                relationships[(
                    dataset,
                    "annotated_to_pathway")] = parsePathwayRelationships(
                        config, rf, metabolite_mapping)
            #elif dataset == "drug":
            #relationships[(dataset, "annotated_to_pathway")] = set()

    builder_utils.remove_directory(directory)

    return entities, relationships, entities_header, relationships_headers
Example #26
def parser(databases_directory, download=True):
    relationships = set()
    config = builder_utils.get_config(config_name="mutationDsConfig.yml",
                                      data_type='databases')
    header = config['header']
    output_file_name = "mutation_curated_affects_interaction_with.tsv"
    regex = r":(\w+)\("
    url = config['mutations_url']
    directory = os.path.join(databases_directory, "MutationDs")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    with open(file_name, 'r') as mf:
        first = True
        for line in mf:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            if len(data) > 12:
                internal_id = data[0]
                pvariant = '_'.join(data[1].split(':'))
                effect = data[5]
                organism = data[10]
                interaction = data[11]
                evidence = data[12]

                if organism.startswith("9606"):
                    matches = re.finditer(regex, interaction)
                    for matchNum, match in enumerate(matches, start=1):
                        interactor = match.group(1)
                        relationships.add((pvariant, interactor,
                                           "CURATED_AFFECTS_INTERACTION_WITH",
                                           effect, interaction, evidence,
                                           internal_id, "Intact-MutationDs"))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file_name)
Example #27
def parser(databases_directory, download=True):
    config = builder_utils.get_config(
        config_name="drugGeneInteractionDBConfig.yml", data_type='databases')
    url = config['DGIdb_url']
    header = config['header']
    output_file = "dgidb_targets.tsv"
    drugmapping = mp.getMappingForEntity("Drug")

    relationships = set()
    directory = os.path.join(databases_directory, "DGIdb")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding='utf-8') as associations:
        first = True
        for line in associations:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            gene = data[0]
            source = data[3]
            interactionType = data[4] if data[4] != '' else 'unknown'
            drug = data[8].lower()
            if drug == "":
                drug = data[7]
                if drug == "" and data[6] != "":
                    drug = data[6]
                else:
                    continue
            if gene != "":
                if drug in drugmapping:
                    drug = drugmapping[drug]
                    relationships.add((drug, gene, "TARGETS", "NA", "NA", "NA",
                                       interactionType, "DGIdb: " + source))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
Example #28
def parser(databases_directory):
    directory = os.path.join(databases_directory, "PhosphoSitePlus")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="pspConfig.yml",
                                      data_type='databases')
    modifications = config['modifications']
    annotation_files = config['annotation_files']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']
    entities = set()
    relationships = defaultdict(set)
    for site_file in config['site_files']:
        file_name = os.path.join(directory, site_file)
        with gzip.open(file_name, 'r') as f:
            sites, site_relationships = parseSites(f, modifications)
            entities.update(sites)
            for r in site_relationships:
                relationships[r].update(site_relationships[r])
    for er in annotation_files:
        entity, relationship_type = er.split('-')
        file_name = os.path.join(directory, annotation_files[er])
        with gzip.open(file_name, 'r') as f:
            if entity == "disease":
                mapping = mp.getMappingFromOntology(ontology="Disease",
                                                    source=None)
                relationships[(entity, relationship_type)].update(
                    parseDiseaseAnnotations(f, modifications, mapping))
            elif entity == "biological_process":
                mapping = mp.getMappingFromOntology(ontology="Gene_ontology",
                                                    source=None)
                relationships[(entity, relationship_type)].update(
                    parseRegulationAnnotations(f, modifications, mapping))
            elif entity == "substrate":
                relationships[(entity,
                               relationship_type)] = parseKinaseSubstrates(
                                   f, modifications)

    return entities, relationships, entities_header, relationships_headers
Example #29
def parser(databases_directory, download=True):
    directory = os.path.join(databases_directory, "ExposomeExplorer")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="exposomeConfig.yml", data_type='databases')
    database_urls = config['database_urls']
    relationships_header = config['relationships_header']
    mapping = mp.getMappingForEntity("Food")
    correlations = {}
    for url in database_urls:
        zipped_fileName = os.path.join(directory, url.split('/')[-1])
        file_name = '.'.join(url.split('/')[-1].split('.')[0:2])
        if download:
            builder_utils.downloadDB(url, directory)

        with zipfile.ZipFile(zipped_fileName) as z:
            if file_name == "biomarkers.csv":
                biomarkers = parseBiomarkersFile(z, file_name)
            elif file_name == "correlations.csv":
                correlations = parseCorrelationsFile(z, file_name, biomarkers, mapping)

    builder_utils.remove_directory(directory)

    return correlations, relationships_header
Example #30
def generate_dataset_imports(projectId, dataType, dataset_import_dir):
    stats = set()
    builder_utils.checkDirectory(dataset_import_dir)
    try:
        if dataType in ['project', 'experimental_design', 'clinical']:
            data = clinicalParser.parser(projectId, dataType)
            for dtype, ot in data:
                generate_graph_files(data[(dtype, ot)], dtype, projectId, stats, ot, dataset_import_dir)
        elif dataType in ["proteomics", "interactomics", "phosphoproteomics"]:
            data = proteomicsParser.parser(projectId, dataType)
            for dtype, ot in data:
                generate_graph_files(data[(dtype, ot)], dtype, projectId, stats, ot, dataset_import_dir)
        elif dataType == "wes":
            data = wesParser.parser(projectId)
            for dtype, ot in data:
                generate_graph_files(data[(dtype, ot)], dtype, projectId, stats, ot, dataset_import_dir)
        else:
            raise Exception("Error when importing experiment for project {}. Non-existing parser for data type {}".format(projectId, dataType))
    except Exception as err:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Error: {}. Experiment {}: {} file: {}, line: {}".format(err, projectId, sys.exc_info(), fname, exc_tb.tb_lineno))
        raise Exception("Error {}. Importing experiment {}. Data type {}.".format(err, projectId, dataType))