Example #1
def create_user_from_file(filepath, expiration):
    """
    Creates a new user in the graph database and the corresponding node, from an Excel file. \
    Rows in the file must be users, and columns must follow the set_arguments() fields.

    :param str filepath: path to the Excel file containing the users' information.
    :param int expiration: number of days the user is given access.

    .. note:: This function can be used directly with *python create_user_from_file.py -f path_to_file*.
    """
    usersImportDirectory = os.path.join(cwd, directories['usersImportDirectory'])
    usersFile = os.path.join(usersImportDirectory, uconfig['usersFile'])

    builder_utils.checkDirectory(usersImportDirectory)
    import_file = os.path.join(usersImportDirectory, uconfig['import_file'])

    # Read the users directly from the provided file; cast every cell to str.
    data = pd.read_excel(filepath).applymap(str)
    create_user(data, import_file, expiration)

    if os.path.exists(usersFile):
        excel = pd.read_excel(usersFile)
        excel = pd.concat([excel, data], ignore_index=True)
        excel.to_excel(usersFile, index=False)
    else:
        data.to_excel(usersFile, index=False)
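The note above documents a command-line entry point. A minimal sketch of how that wrapper might look, assuming argparse and the -f flag named in the docstring; the expiration flag and its default are assumptions:

# Hypothetical CLI wrapper around create_user_from_file; the -f flag follows the docstring's note.
import argparse

if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Create users from an Excel file")
    cli.add_argument('-f', '--file', required=True, help='Excel file with one user per row')
    cli.add_argument('-e', '--expiration', type=int, default=365,
                     help='days of access granted (flag and default are assumptions)')
    args = cli.parse_args()
    create_user_from_file(args.file, args.expiration)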
Example #2
def parser(databases_directory, importDirectory, download=True):
    config = builder_utils.get_config(config_name="jensenlabConfig.yml", data_type='databases')
    outputfileName = "Publications.tsv"
    url = config['db_url']
    ifile = config['organisms_file']
    organisms = str(config['organisms'])
    directory = os.path.join(databases_directory, "Jensenlab")
    builder_utils.checkDirectory(os.path.join(directory, "textmining"))

    if download:
        builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "textmining"))

    ifile = os.path.join(directory, "textmining", ifile)
    valid_pubs = read_valid_pubs(organisms, ifile)
    entities, header = parse_PMC_list(config, os.path.join(directory, "textmining"), download=download, valid_pubs=valid_pubs)
    num_entities = len(entities)
    outputfile = os.path.join(importDirectory, outputfileName)
    builder_utils.write_entities(entities, header, outputfile)
    entities = None

    for qtype in config['db_mentions_types']:
        parse_mentions(config, directory, qtype, importDirectory, download)

    builder_utils.remove_directory(os.path.join(directory, "textmining"))

    return (num_entities, outputfile)
Example #3
def create_user_from_command_line(args, expiration):
    """
    Creates a new user in the graph database and the corresponding node, from a terminal window (command line), \
    and adds the new user information to the users Excel and import files. Arguments as in set_arguments().

    :param args: object containing all the parameters necessary to create a user ('username', 'name', 'email', \
                'secondary_email', 'phone_number' and 'affiliation').
    :type args: any object with a __dict__ attribute
    :param int expiration: number of days the user is given access.

    .. note:: This function can be used directly with *python create_user_from_command_line.py -u username \
                -n user_name -e email -s secondary_email -p phone_number -a affiliation*.
    """
    usersImportDirectory = os.path.join(cwd, directories['usersImportDirectory'])
    usersFile = os.path.join(usersImportDirectory, uconfig['usersFile'])

    builder_utils.checkDirectory(usersImportDirectory)
    import_file = os.path.join(usersImportDirectory, uconfig['import_file'])

    data = vars(args)
    df = pd.DataFrame.from_dict(data, orient='index').T.drop('file', axis=1)
    create_user(df, import_file, expiration)

    if os.path.exists(usersFile):
        excel = pd.read_excel(usersFile)
        excel = pd.concat([excel, df], ignore_index=True)
        excel.to_excel(usersFile, index=False)
    else:
        df.to_excel(usersFile, index=False)
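Since the function only requires an object with a __dict__, a plain argparse.Namespace is enough to call it programmatically. A hedged sketch; the field values are illustrative, and the 'file' attribute is included because the function drops that column before creating the user:

# Hypothetical programmatic call; any object with __dict__ works.
from argparse import Namespace

new_user = Namespace(username='jdoe', name='Jane Doe', email='jdoe@example.org',
                     secondary_email='', phone_number='555-0100',
                     affiliation='Example University',
                     file=None)  # dropped by the function, but must exist
create_user_from_command_line(new_user, expiration=365)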
Example #4
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="smpdbConfig.yml", data_type='databases')
    urls = config['smpdb_urls']
    entities = set()
    relationships = defaultdict(set)
    entities_header = config['pathway_header']
    relationships_headers = config['relationships_header']
    directory = os.path.join(databases_directory, "SMPDB")
    builder_utils.checkDirectory(directory)

    for dataset in urls:
        url = urls[dataset]
        file_name = url.split('/')[-1]
        if download:
            builder_utils.downloadDB(url, directory)
        zipped_file = os.path.join(directory, file_name)
        with zipfile.ZipFile(zipped_file) as rf:
            if dataset == "pathway":
                entities = parsePathways(config, rf)
            elif dataset == "protein":
                relationships.update(parsePathwayProteinRelationships(rf))
            elif dataset == "metabolite":
                relationships.update(parsePathwayMetaboliteDrugRelationships(rf))

    builder_utils.remove_directory(directory)

    return entities, relationships, entities_header, relationships_headers
Example #5
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    config = builder_utils.get_config(config_name="disgenetConfig.yml",
                                      data_type='databases')

    files = config['disgenet_files']
    mapping_files = config['disgenet_mapping_files']
    url = config['disgenet_url']
    directory = os.path.join(databases_directory, "disgenet")
    builder_utils.checkDirectory(directory)
    header = config['disgenet_header']
    output_file = 'disgenet_associated_with.tsv'

    if download:
        for f in files:
            builder_utils.downloadDB(url + files[f], directory)
        for f in mapping_files:
            builder_utils.downloadDB(url + mapping_files[f], directory)

    proteinMapping = readDisGeNetProteinMapping(config, directory)
    diseaseMapping = readDisGeNetDiseaseMapping(config, directory)
    for f in files:
        first = True
        associations = gzip.open(os.path.join(directory, files[f]), 'r')
        dtype, atype = f.split('_')
        if dtype == 'gene':
            idType = "Protein"
            scorePos = 9
        elif dtype == 'variant':
            idType = "Transcript"
            scorePos = 5
        for line in associations:
            if first:
                first = False
                continue
            try:
                data = line.decode('utf-8').rstrip("\r\n").split("\t")
                geneId = str(int(data[0]))
                #disease_specificity_index =  data[2]
                #disease_pleiotropy_index = data[3]
                diseaseId = data[4]
                score = float(data[scorePos])
                pmids = data[13]
                source = data[-1]
                if geneId in proteinMapping:
                    for identifier in proteinMapping[geneId]:
                        if diseaseId in diseaseMapping:
                            for code in diseaseMapping[diseaseId]:
                                code = "DOID:" + code
                                relationships[idType].add(
                                    (identifier, code, "ASSOCIATED_WITH",
                                     score, atype, "DisGeNet: " + source,
                                     pmids))
            except UnicodeDecodeError:
                continue
        associations.close()

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
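Most of the database parsers in these examples follow the same contract: they return the parsed tuples together with the header and output file name needed to write a tab-delimited import file. A hedged sketch of a consumer for this parser, assuming the configured header is a flat list of column names and that the DisGeNet files are already on disk (download=False); the paths are assumptions:

# Illustrative consumer; paths and the flat-header shape are assumptions.
import csv
import os

relationships, header, output_file = parser('/data/databases', download=False)
for id_type, rels in relationships.items():  # keys are "Protein" / "Transcript"
    out_path = os.path.join('/data/imports', '{}_{}'.format(id_type.lower(), output_file))
    with open(out_path, 'w', newline='') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(header)
        writer.writerows(sorted(rels))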
Example #6
def parsePairs(config, databases_directory, qtype, mapping, download=True):
    url = config['db_url']
    ifile = config['db_files'][qtype]
    source = config['db_sources'][qtype]
    relationships = set()

    directory = os.path.join(databases_directory, "Jensenlab")
    builder_utils.checkDirectory(os.path.join(directory, "integration"))

    if download:
        builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "integration"))
    ifile = os.path.join(directory, "integration", ifile)

    with open(ifile, 'r') as idbf:
        for line in idbf:
            data = line.rstrip("\r\n").split('\t')
            id1 = "9606."+data[0]
            id2 = data[2]
            score = float(data[4])

            if id1 in mapping:
                for ident in mapping[id1]:
                    relationships.add((ident, id2, "ASSOCIATED_WITH_INTEGRATED", source, score, "compiled"))
            else:
                continue

    return relationships
Example #7
def parser(databases_directory):
    directory = os.path.join(databases_directory, "PhosphoSitePlus")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="pspConfig.yml", data_type='databases')
    modifications = config['modifications']
    annotation_files = config['annotation_files']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']
    entities = set()
    relationships = defaultdict(set)
    for site_file in config['site_files']:
        file_name = os.path.join(directory, site_file)
        with gzip.open(file_name, 'r') as f:
            sites, site_relationships = parseSites(f, modifications)
            entities.update(sites)
            for r in site_relationships:
                relationships[r].update(site_relationships[r])
    for er in annotation_files:
        entity, relationship_type = er.split('-')
        file_name = os.path.join(directory, annotation_files[er])
        with gzip.open(file_name, 'r') as f:
            if entity == "disease":
                mapping = mp.getMappingFromOntology(ontology="Disease", source=None)
                relationships[(entity, relationship_type)].update(parseDiseaseAnnotations(f, modifications, mapping))
            elif entity == "biological_process":
                mapping = mp.getMappingFromOntology(ontology="Gene_ontology", source=None)
                relationships[(entity, relationship_type)].update(parseRegulationAnnotations(f, modifications, mapping))
            elif entity == "substrate":
                relationships[(entity, relationship_type)] = parseKinaseSubstrates(f, modifications)

    return entities, relationships, entities_header, relationships_headers
Example #8
def generate_dataset_imports(projectId, dataType, dataset_import_dir):
    stats = set()
    builder_utils.checkDirectory(dataset_import_dir)
    try:
        if dataType in ['project', 'experimental_design', 'clinical']:
            data = clinicalParser.parser(projectId, dataType)
            for dtype, ot in data:
                generate_graph_files(data[(dtype, ot)], dtype, projectId,
                                     stats, ot, dataset_import_dir)
        elif dataType in ["proteomics", "interactomics", "phosphoproteomics"]:
            data = proteomicsParser.parser(projectId, dataType)
            for dtype, ot in data:
                generate_graph_files(data[(dtype, ot)], dtype, projectId,
                                     stats, ot, dataset_import_dir)
        elif dataType == "wes":
            data = wesParser.parser(projectId)
            for dtype, ot in data:
                generate_graph_files(data[(dtype, ot)], dtype, projectId,
                                     stats, ot, dataset_import_dir)
        else:
            raise Exception(
                "Error when importing experiment for project {}. Non-existing parser for data type {}"
                .format(projectId, dataType))
    except Exception as err:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Error: {}. Experiment {}: {} file: {}, line: {}".format(
            err, projectId, sys.exc_info(), fname, exc_tb.tb_lineno))
        raise Exception(
            "Error {}. Importing experiment {}. Data type {}.".format(
                err, projectId, dataType))
Example #9
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="signorConfig.yml",
                                      data_type='databases')

    directory = os.path.join(databases_directory, "SIGNOR")
    builder_utils.checkDirectory(directory)

    url = config['url']
    modifications = config['modifications']
    amino_acids = config['amino_acids']
    accronyms = config['accronyms']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']

    entities = set()
    relationships = defaultdict(set)

    filename = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    entities, relationships = parse_substrates(filename, modifications,
                                               accronyms, amino_acids)

    return entities, relationships, entities_header, relationships_headers
Example #10
def parser(databases_directory):
    config = builder_utils.get_config(config_name="drugBankConfig.yml",
                                      data_type='databases')
    directory = os.path.join(databases_directory, "DrugBank")
    builder_utils.checkDirectory(directory)
    drugs = extract_drugs(config, directory)
    build_DrugBank_dictionary(config, directory, drugs)
    relationships = build_relationships_from_DrugBank(config, drugs)
    entities, attributes = build_drug_entity(config, drugs)
    entities_header = ['ID'] + attributes
    relationships_headers = config['relationships_headers']

    return (entities, relationships, entities_header, relationships_headers)
Example #11
def usersImport(importDirectory, import_type='partial'):
    """
    Generates User entities from the users Excel file and grants the new users access to the database.
    This function also writes the relevant information to a tab-delimited file in the import \
    directory.

    :param str importDirectory: path to the directory where all the import files are generated.
    :param str import_type: type of import ('full' or 'partial').
    """
    usersImportDirectory = os.path.join(importDirectory,
                                        uconfig['usersImportDirectory'])
    builder_utils.checkDirectory(usersImportDirectory)
    uh.parseUsersFile(usersImportDirectory, expiration=365)
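A hedged invocation; note that uh.parseUsersFile is called with a fixed 365-day expiration regardless of the import type:

# Illustrative call during a partial update; the path is an assumption.
usersImport('/data/imports', import_type='partial')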
Example #12
def save_files_in_tmp(content, dataset, prot_tool, prot_file, projectid,
                      uploaded_file):
    if dataset is not None:
        session_cookie = flask.request.cookies.get('custom-auth-session')
        temporaryDirectory = os.path.join(tmpDirectory,
                                          session_cookie + "upload")
        if not os.path.exists(tmpDirectory):
            os.makedirs(tmpDirectory)
        elif not os.path.exists(temporaryDirectory):
            os.makedirs(temporaryDirectory)

        directory = os.path.join(temporaryDirectory, dataset)
        if os.path.exists(directory) and uploaded_file is not None:
            if os.path.exists(os.path.join(directory, uploaded_file)):
                shutil.rmtree(directory)

        builder_utils.checkDirectory(directory)
        if dataset in ['proteomics', 'interactomics', 'phosphoproteomics'] and prot_tool != '' and prot_file != '':
            selected_file = prot_tool.lower() + "-" + prot_file.lower()
            if selected_file in config['file_proteomics']:
                filename = config['file_proteomics'][selected_file]
            else:
                filename = dataset + '_' + prot_tool.lower() + '_' + prot_file.replace(' ', '').lower() + '.' + uploaded_file.split('.')[-1]
            directory = os.path.join(directory, prot_tool.lower())
            if os.path.exists(directory):
                if os.path.exists(os.path.join(directory, filename)):
                    os.remove(os.path.join(directory, filename))
            builder_utils.checkDirectory(directory)
        elif dataset == 'experimental_design':
            filename = config['file_design'].split('_')[0] + '_' + projectid + '.' + uploaded_file.split('.')[-1]
        elif dataset == 'clinical':
            filename = config['file_clinical'].split('_')[0] + '_' + projectid + '.' + uploaded_file.split('.')[-1]

        if uploaded_file is None:
            content = None
        if content is not None:
            data = builder_utils.parse_contents(content, filename)
            builder_utils.export_contents(data, directory, filename)

            uploaded = uploaded_file
            uploaded_file = None
            return uploaded, uploaded_file, '', ''
        else:
            raise PreventUpdate

    return '', None, '', ''
Example #13
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="gwasCatalogConfig.yml",
                                      data_type='databases')
    url = config['GWASCat_url']
    entities_header = config['entities_header']
    relationships_header = config['relationships_header']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "GWAScatalog")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding="utf-8") as catalog:
        for line in catalog:
            data = line.rstrip("\r\n").split("\t")
            if len(data) > 36:
                pubmedid = data[1]
                date = data[3]
                title = data[6]
                sample_size = data[8]
                replication_size = data[9]
                #chromosome = data[11]
                #position = data[12]
                #genes_mapped = data[14].split(" - ")
                snp_id = data[20].split('-')[0]
                freq = data[26]
                pval = data[27]
                odds_ratio = data[30]
                trait = data[34]
                exp_factor = data[35]
                study = data[36]

                entities.add((study, "GWAS_study", title, date, sample_size,
                              replication_size, trait))
                if pubmedid != "":
                    relationships["published_in_publication"].add(
                        (study, pubmedid, "PUBLISHED_IN", "GWAS Catalog"))
                if snp_id != "":
                    relationships["variant_found_in_gwas"].add(
                        (re.sub(r"^\W+|\W+$", "",
                                snp_id), study, "VARIANT_FOUND_IN_GWAS", freq,
                         pval, odds_ratio, trait, "GWAS Catalog"))
                if exp_factor != "":
                    exp_factor = exp_factor.split('/')[-1].replace('_', ':')
                    relationships["studies_trait"].add(
                        (study, exp_factor, "STUDIES_TRAIT", "GWAS Catalog"))

    builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_header)
Example #14
def experimentImport(importDirectory, experimentsDirectory, project):
    """
    Generates all the entities and relationships from the specified Project. Called from function experimentsImport.

    :param str importDirectory: path to the directory where all the import files are generated.
    :param str experimentsDirectory: path to the directory where all the experiments are located.
    :param str project: identifier of the project to be imported.
    """
    projectPath = os.path.join(importDirectory, project)
    builder_utils.checkDirectory(projectPath)
    projectDirectory = os.path.join(experimentsDirectory, project)
    datasets = builder_utils.listDirectoryFolders(projectDirectory)
    if 'project' in datasets:
        dataset = 'project'
        datasetPath = os.path.join(projectPath, dataset)
        builder_utils.checkDirectory(datasetPath)
        eh.generate_dataset_imports(project, dataset, datasetPath)
        datasets.remove(dataset)
        if 'experimental_design' in datasets:
            dataset = 'experimental_design'
            datasetPath = os.path.join(projectPath, dataset)
            builder_utils.checkDirectory(datasetPath)
            eh.generate_dataset_imports(project, dataset, datasetPath)
            datasets.remove(dataset)
            for dataset in datasets:
                datasetPath = os.path.join(projectPath, dataset)
                builder_utils.checkDirectory(datasetPath)
                eh.generate_dataset_imports(project, dataset, datasetPath)
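The docstring notes this is normally called from experimentsImport, which supplies the directories; datasets are imported in a fixed order ('project' first, then 'experimental_design', then the remaining datasets). A hedged standalone call, with both paths and the project identifier being assumptions:

# Hypothetical single-project import; paths and identifier are illustrative.
experimentImport('/data/imports/experiments', '/data/experiments', 'P0000001')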
Example #15
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="foodbConfig.yml", data_type='databases')

    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)

    contents = {}
    food = set()
    compounds = {}
    try:
        with tarfile.open(tar_fileName, 'r') as tf:
            file_content = tf.getnames()
            tar_dir = file_content[1]
            tf.extractall(path=directory)
        for file_name in config['files']:
            path = os.path.join(directory, tar_dir, file_name)
            with open(path, 'r', encoding="utf-8", errors='replace') as f:
                if file_name == "Content.csv":
                    contents = parseContents(f)
                elif file_name == "Food.csv":
                    food, mapping = parseFood(f)
                elif file_name == "Compound.csv":
                    compounds = parseCompounds(f)
        for food_id, compound_id in contents:
            if compound_id in compounds:
                compound_code = compounds[compound_id].replace("HMDB", "HMDB00")
                relationships[("food", "has_content")].add((food_id, compound_code, "HAS_CONTENT") + contents[(food_id, compound_id)])
        mp.reset_mapping(entity="Food")
        with open(os.path.join(directory, "mapping.tsv"), 'w', encoding='utf-8') as out:
            for food_id in mapping:
                for alias in mapping[food_id]:
                    out.write(str(food_id)+"\t"+str(alias)+"\n")

        mp.mark_complete_mapping(entity="Food")
    except tarfile.ReadError as err:
        raise Exception("Error importing database FooDB.\n {}".format(err))

    builder_utils.remove_directory(directory)

    return food, relationships, entities_header, relationships_headers
Example #16
def fullImport(download=True, n_jobs=4):
    """
    Calls the different importer functions: ontologies, databases and \
    experiments. The first step is to check whether the stats object exists \
    and to create it otherwise. Calls setupStats.
    """
    try:
        importDirectory = directories["importDirectory"]
        builder_utils.checkDirectory(importDirectory)
        setupStats(import_type='full')
        logger.info("Full import: importing all Ontologies")
        ontologiesImport(importDirectory,
                         download=download,
                         import_type='full')
        logger.info(
            "Full import: Ontologies import took {}".format(datetime.now() -
                                                            START_TIME))
        logger.info("Full import: importing all Databases")
        databasesImport(importDirectory,
                        n_jobs=n_jobs,
                        download=download,
                        import_type='full')
        logger.info(
            "Full import: Databases import took {}".format(datetime.now() -
                                                           START_TIME))
        logger.info("Full import: importing all Experiments")
        experimentsImport(n_jobs=n_jobs, import_type='full')
        logger.info(
            "Full import: Experiments import took {}".format(datetime.now() -
                                                             START_TIME))
        logger.info("Full import: importing all Users")
        usersImport(importDirectory, import_type='full')
        logger.info("Full import: Users import took {}".format(datetime.now() -
                                                               START_TIME))
    except Exception as err:
        # FileNotFoundError, EOFError, IOError, IndexError, KeyError and
        # MemoryError were all handled identically, so one handler suffices.
        logger.error("Full import > {}.".format(err))
Example #17
def archiveImportDirectory(archive_type="full"):
    """
    This function creates the compressed backup imports folder with either the whole folder \
    (full update) or with only the files uploaded (partial update). The folder or files are \
    compressed into a gzipped tarball file and stored in the archive/ folder defined in the \
    configuration.

    :param str archive_type: whether it is a full update or a partial update.
    """
    dest_folder = directories["archiveDirectory"]
    builder_utils.checkDirectory(dest_folder)
    folder_to_backup = directories["importDirectory"]
    date, time = builder_utils.getCurrentTime()
    file_name = "{}_{}_{}".format(archive_type, date.replace('-', ''),
                                  time.replace(':', ''))
    logger.info("Archiving {} to file: {}".format(folder_to_backup, file_name))
    builder_utils.compress_directory(folder_to_backup, dest_folder, file_name)
    logger.info("New backup created: {}".format(file_name))
Example #18
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="hmdbConfig.yml",
                                      data_type='databases')
    directory = os.path.join(databases_directory, "HMDB")
    builder_utils.checkDirectory(directory)
    metabolites = extract_metabolites(config, directory, download)
    mapping = mp.getMappingFromOntology(ontology="Disease",
                                        source=config['HMDB_DO_source'])
    mapping.update(mp.getMappingFromOntology(ontology="Tissue", source=None))
    entities, attributes = build_metabolite_entity(config, directory,
                                                   metabolites)
    relationships = build_relationships_from_HMDB(config, metabolites, mapping)
    entities_header = ['ID'] + attributes
    relationships_header = config['relationships_header']

    #builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_header)
Example #19
def parser(databases_directory, download=True):
    relationships = set()
    config = builder_utils.get_config(config_name="mutationDsConfig.yml",
                                      data_type='databases')
    header = config['header']
    output_file_name = "mutation_curated_affects_interaction_with.tsv"
    regex = r":(\w+)\("
    url = config['mutations_url']
    directory = os.path.join(databases_directory, "MutationDs")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    with open(file_name, 'r') as mf:
        first = True
        for line in mf:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            if len(data) > 12:
                internal_id = data[0]
                pvariant = data[1]
                effect = data[5]
                protein = data[7].split(':')
                organism = data[10]
                interaction = data[11]
                evidence = data[12]

                if organism.startswith("9606") and len(protein) > 1:
                    protein = protein[1]
                    pvariant = protein + "_" + pvariant
                    matches = re.finditer(regex, interaction)
                    for match in matches:
                        interactor = match.group(1)
                        relationships.add((pvariant, interactor,
                                           "CURATED_AFFECTS_INTERACTION_WITH",
                                           effect, interaction, evidence,
                                           internal_id, "Intact-MutationDs"))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file_name)
Example #20
def getSTRINGMapping(source="BLAST_UniProt_AC", download=True, db="STRING"):
    """
    Parses the given database (db) and extracts relationships between its identifiers and those of other databases (source).

    :param str source: name of the source database for selecting aliases.
    :param bool download: whether to download the file or not.
    :param str db: name of the database to be parsed.
    :return: Dictionary of database identifiers (keys) and sets of unique aliases to other databases (values).
    """
    url = get_STRING_mapping_url(db=db)
    mapping = defaultdict(set)
    directory = os.path.join(dbconfig["databasesDir"], db)
    file_name = os.path.join(directory, url.split('/')[-1])
    builder_utils.checkDirectory(directory)
    if download:
        print("Downloading", url, directory)
        builder_utils.downloadDB(url, directory)

    # file_name was already joined with the directory above
    f = file_name
    first = True
    with gzip.open(f, 'rb') as mf:
        for line in mf:
            if first:
                first = False
                continue
            data = line.decode('utf-8').rstrip("\r\n").split("\t")
            if db == "STRING":
                stringID = data[0]
                alias = data[1]
                sources = data[2].split(' ')
            else:
                stringID = data[0]
                alias = data[2]
                sources = data[3].split(' ')
                if not alias.startswith('DB'):
                    continue

            if source in sources:
                mapping[stringID].add(alias)

    return mapping
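A hedged usage sketch mapping STRING protein identifiers to UniProt accessions; the lookup key is illustrative of the "9606.ENSP..." format built in parsePairs above, not a value confirmed by the source:

# Requires the alias file to be present already if download=False.
string2uniprot = getSTRINGMapping(source="BLAST_UniProt_AC", download=False, db="STRING")
aliases = string2uniprot.get("9606.ENSP00000269305", set())  # hypothetical key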
Example #21
def parser(databases_dir, download=True):
    config = builder_utils.get_config(config_name="goaConfig.yml",
                                      data_type='databases')
    url = config['url']
    rel_header = config['header']

    protein_mapping = mp.getMappingForEntity(entity="Protein")
    valid_proteins = list(set(protein_mapping.values))

    directory = os.path.join(databases_dir, "GOA")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    annotations = parse_annotations_with_pandas(file_name, valid_proteins)

    builder_utils.remove_directory(directory)

    return annotations, rel_header
Example #22
def experimentsImport(projects=None, n_jobs=1, import_type="partial"):
    """
    Generates all the entities and relationships from the specified projects. If the projects list is \
    not provided, then all the projects in the experiments directory will be imported (full import). \
    Calls function experimentImport.

    :param list projects: list of project identifiers to be imported.
    :param int n_jobs: number of jobs to run in parallel. 1 by default when updating one project.
    :param str import_type: type of import ('full' or 'partial').
    """
    experiments_import_directory = os.path.join(directories['importDirectory'],
                                                econfig["import_directory"])
    builder_utils.checkDirectory(experiments_import_directory)
    experiments_directory = os.path.join(directories['dataDirectory'],
                                         econfig["experiments_directory"])
    if projects is None:
        projects = builder_utils.listDirectoryFolders(experiments_directory)
    if len(projects) > 0:
        Parallel(n_jobs=n_jobs)(delayed(experimentImport)(
            experiments_import_directory, experiments_directory, project)
                                for project in projects)
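Two hedged invocations showing the intended modes; the project identifier is an assumption:

# Update a single project serially, or re-import everything in parallel.
experimentsImport(projects=['P0000001'], n_jobs=1, import_type='partial')
experimentsImport(projects=None, n_jobs=4, import_type='full')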
Example #23
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="pathwayCommonsConfig.yml",
                                      data_type='databases')
    url = config['pathwayCommons_pathways_url']
    entities = set()
    relationships = set()
    directory = os.path.join(databases_directory, "PathwayCommons")
    builder_utils.checkDirectory(directory)
    fileName = url.split('/')[-1]
    entities_header = config['pathways_header']
    relationships_header = config['relationships_header']

    if download:
        builder_utils.downloadDB(url, directory)
    f = os.path.join(directory, fileName)
    associations = gzip.open(f, 'r')
    for line in associations:
        data = line.decode('utf-8').rstrip("\r\n").split("\t")
        linkout = data[0]
        code = data[0].split("/")[-1]
        ptw_dict = dict([item.split(": ")[0], ":".join(item.split(": ")[1:])]
                        for item in data[1].split("; "))
        proteins = data[2:]
        if "organism" in ptw_dict and ptw_dict["organism"] == "9606":
            name = ptw_dict["name"]
            source = ptw_dict["datasource"]
        else:
            continue

        entities.add((code, "Pathway", name, name, ptw_dict["organism"],
                      source, linkout))
        for protein in proteins:
            relationships.add((protein, code, "ANNOTATED_IN_PATHWAY", linkout,
                               "PathwayCommons: " + source))

    associations.close()

    builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_header)
Example #24
def parser(databases_directory, download=True):
    config = builder_utils.get_config(
        config_name="drugGeneInteractionDBConfig.yml", data_type='databases')
    url = config['DGIdb_url']
    header = config['header']
    output_file = "dgidb_targets.tsv"
    drugmapping = mp.getMappingForEntity("Drug")

    relationships = set()
    directory = os.path.join(databases_directory, "DGIdb")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding='utf-8') as associations:
        first = True
        for line in associations:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            gene = data[0]
            source = data[3]
            interactionType = data[4] if data[4] != '' else 'unknown'
            # Fall back through the possible drug name columns and skip the
            # row if none of them is filled in.
            drug = data[8].lower()
            if drug == "":
                drug = data[7]
            if drug == "":
                drug = data[6]
            if drug == "":
                continue
            if gene != "":
                if drug in drugmapping:
                    drug = drugmapping[drug]
                    relationships.add((drug, gene, "TARGETS", "NA", "NA", "NA",
                                       interactionType, "DGIdb: " + source))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
Example #25
def ontologiesImport(importDirectory,
                     ontologies=None,
                     download=True,
                     import_type="partial"):
    """
    Generates all the entities and relationships from the provided ontologies. If the ontologies list is \
    not provided, then all the ontologies listed in the configuration will be imported (full import). \
    This function also updates the stats object with numbers from the imported ontologies.

    :param str importDirectory: path of the import directory where files will be created.
    :param list ontologies: a list of ontology names to be imported.
    :param bool download: whether the ontology files are to be downloaded.
    :param str import_type: type of import ('full' or 'partial').
    """
    ontologiesImportDirectory = os.path.join(importDirectory,
                                             oconfig["ontologies_importDir"])
    builder_utils.checkDirectory(ontologiesImportDirectory)
    stats = oh.generate_graphFiles(ontologiesImportDirectory, ontologies,
                                   download)
    statsDf = generateStatsDataFrame(stats)
    setupStats(import_type=import_type)
    writeStats(statsDf, import_type)
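A hedged example rebuilding only two ontologies from already-downloaded files; the names follow the ontology labels used elsewhere in these examples ("Disease", "Gene_ontology"), though whether they match the configuration entries is an assumption:

# Illustrative partial ontology update; the import path is an assumption.
ontologiesImport('/data/imports', ontologies=['Disease', 'Gene_ontology'],
                 download=False, import_type='partial')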
Example #26
def publish_analysis(self, directory):
    builder_utils.checkDirectory(directory)
    plots_directory = os.path.join(directory, 'figures')
    results_directory = os.path.join(directory, 'results')
    builder_utils.checkDirectory(plots_directory)
    builder_utils.checkDirectory(results_directory)
    self.save_analysis_plots(plots_directory)
    self.save_analysis_result(results_directory)
Example #27
def parser(databases_directory, download=True):
    directory = os.path.join(databases_directory, "ExposomeExplorer")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="exposomeConfig.yml", data_type='databases')
    database_urls = config['database_urls']
    relationships_header = config['relationships_header']
    mapping = mp.getMappingForEntity("Food")
    correlations = {}
    for url in database_urls:
        zipped_fileName = os.path.join(directory, url.split('/')[-1])
        file_name = '.'.join(url.split('/')[-1].split('.')[0:2])
        if download:
            builder_utils.downloadDB(url, directory)

        with zipfile.ZipFile(zipped_fileName) as z:
            if file_name == "biomarkers.csv":
                biomarkers = parseBiomarkersFile(z, file_name)
            elif file_name == "correlations.csv":
                correlations = parseCorrelationsFile(z, file_name, biomarkers, mapping)

    builder_utils.remove_directory(directory)

    return correlations, relationships_header
Example #28
def databasesImport(importDirectory,
                    databases=None,
                    n_jobs=1,
                    download=True,
                    import_type="partial"):
    """
    Generates all the entities and relationships from the provided databases. If the databases list is \
    not provided, then all the databases listed in the configuration will be imported (full import). \
    This function also updates the stats object with numbers from the imported databases.

    :param str importDirectory: path of the import directory where files will be created.
    :param list databases: a list of database names to be imported.
    :param int n_jobs: number of jobs to run in parallel. 1 by default when updating one database.
    :param str import_type: type of import ('full' or 'partial').
    """
    databasesImportDirectory = os.path.join(importDirectory,
                                            dbconfig["databasesImportDir"])
    builder_utils.checkDirectory(databasesImportDirectory)
    stats = dh.generateGraphFiles(databasesImportDirectory, databases,
                                  download, n_jobs)
    statsDf = generateStatsDataFrame(stats)
    setupStats(import_type=import_type)
    writeStats(statsDf, import_type)
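A hedged example re-importing a few databases in parallel; the database names are assumptions inferred from the parser configs shown above (hmdbConfig.yml, hgncConfig.yml, disgenetConfig.yml):

# Illustrative partial database update; names and path are assumptions.
databasesImport('/data/imports', databases=['HMDB', 'HGNC', 'DisGeNet'],
                n_jobs=3, download=True, import_type='partial')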
Example #29
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="hgncConfig.yml",
                                      data_type='databases')
    url = config['hgnc_url']
    entities = set()
    directory = os.path.join(databases_directory, "HGNC")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    taxid = 9606
    entities_header = config['header']

    if download:
        builder_utils.downloadDB(url, directory)

    with open(fileName, 'r', encoding="utf-8") as df:
        first = True
        for line in df:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            geneSymbol = data[1]
            geneName = data[2]
            status = data[5]
            geneFamily = data[12]
            synonyms = data[18:23]
            transcript = data[23]
            if status != "Approved":
                continue

            entities.add((geneSymbol, "Gene", geneName, geneFamily,
                          ",".join(synonyms), taxid))
            #relationships.add((geneSymbol, transcript, "TRANSCRIBED_INTO"))

    builder_utils.remove_directory(directory)

    return entities, entities_header
Example #30
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="hpaConfig.yml",
                                      data_type='databases')
    url = config['hpa_pathology_url']
    disease_mapping = mp.getMappingFromOntology(ontology="Disease",
                                                source=None)
    protein_mapping = mp.getMultipleMappingForEntity("Protein")
    directory = os.path.join(databases_directory, "HPA")
    builder_utils.checkDirectory(directory)
    compressed_fileName = os.path.join(directory, url.split('/')[-1])
    file_name = '.'.join(url.split('/')[-1].split('.')[0:2])
    relationships_headers = config['relationships_headers']

    if download:
        builder_utils.downloadDB(url, directory)

    with zipfile.ZipFile(compressed_fileName) as z:
        if file_name == "pathology.tsv":
            pathology = parsePathologyFile(config, z, file_name,
                                           protein_mapping, disease_mapping)

    builder_utils.remove_directory(directory)

    return (pathology, relationships_headers)