Example #1
def update_textmining(user='******', download=True, n_jobs=3):
    logger.info("The user {} chose to perform an update of the text mining".format(user))
    logger.info("Updating text mining > step 1: Importing data from mentions")
    importer.databasesImport(databases=['mentions'], n_jobs=n_jobs, download=download)
    logger.info("Updating text mining > step 2: Loading updated mentions into the database")
    loader.partialUpdate(imports=['mentions'], specific=[])

    return True
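
By way of illustration, a minimal sketch of how this update might be triggered; the user name and argument values below are placeholders, and the function is assumed to be available in the surrounding builder module:

# Hypothetical invocation: refresh the text-mining data without
# re-downloading the mention files (argument values are placeholders).
if __name__ == "__main__":
    update_textmining(user='admin', download=False, n_jobs=4)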
Example #2
def create_new_project(driver, projectId, data, separator='|'):
    """
    Creates a new project in the graph database, following the steps:

    1. Retrieves new project external identifier and creates project node and relationships in the graph database.
    2. Creates subjects, timepoints and intervention nodes.
    3. Saves all the entities and relationships to tab-delimited files.
    4. Returns the number of projects created and the project external identifier.

    :param driver: neo4j driver, which provides the connection to the neo4j graph database.
    :type driver: neo4j driver
    :param str projectId: internal project identifier (CPxxxxxxxxxxxx).
    :param data: pandas DataFrame with the project as row and other attributes as columns.
    :param str separator: character used to separate multiple entries in a project attribute.
    :return: Tuple with a status flag (1 if the project was created, 0 if it already existed, None on error) and the project external identifier.
    """
    query_name = 'create_project'
    external_identifier = None
    done = None
    try:
        db_project = check_if_node_exists(driver, 'Project', 'name',
                                          data['name'][0])
        if db_project.empty:
            external_identifier = get_new_project_identifier(driver, projectId)
            if external_identifier is None:
                external_identifier = 'P0000001'
            data['external_id'] = external_identifier

            projectDir = os.path.join(
                ckg_config['experiments_directory'],
                os.path.join(external_identifier, 'project'))
            ckg_utils.checkDirectory(projectDir)
            data.to_excel(os.path.join(
                projectDir, 'ProjectData_{}.xlsx'.format(external_identifier)),
                          index=False,
                          encoding='utf-8')

            datasetPath = os.path.join(
                os.path.join(ckg_config['imports_experiments_directory'],
                             external_identifier), 'project')
            ckg_utils.checkDirectory(datasetPath)
            eh.generate_dataset_imports(external_identifier, 'project',
                                        datasetPath)
            loader.partialUpdate(imports=['project'],
                                 specific=[external_identifier])
            done = 1
        else:
            done = 0
            external_identifier = ''
    except Exception as err:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Reading query {}: {}, file: {},line: {}, err: {}".format(
            query_name, sys.exc_info(), fname, exc_tb.tb_lineno, err))
    return done, external_identifier
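
A hedged usage sketch for the function above: it builds a one-row pandas DataFrame and obtains a driver through connector.getGraphDatabaseConnectionConfiguration(), the same call used in Example #4. Apart from 'name', the column names and the project identifier are illustrative placeholders rather than the project's real template:

import pandas as pd

# Hypothetical one-row project description; only 'name' is known to be
# required by the snippet above, the remaining columns are placeholders.
project_data = pd.DataFrame([{'name': 'Pilot study',
                              'acronym': 'PILOT',
                              'responsible': 'jdoe'}])

driver = connector.getGraphDatabaseConnectionConfiguration()
if driver is not None:
    done, external_id = create_new_project(driver, 'CP000000000001', project_data)
    print(done, external_id)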
Example #3
def run_minimal_update(user, n_jobs=3):
    licensed_dbs = ['phosphositeplus', 'drugbank']
    licensed_ont = ['Clinical_variable']
    mapping_ont = ['Disease', 'Gene_ontology', 'Experimental_factor']
    minimal_load = ['ontologies', 'modified_proteins', 'drugs', 'mentions', 'side effects', 'clinical_variants', 'project', 'experiment']
    logger.info("The user {} chose to perform a minimal build, after creating the database from a dump".format(user))
    logger.info("Building database > step 1: Importing licensed ontologies and databases")
    importer.ontologiesImport(ontologies=licensed_ont, download=False)
    importer.ontologiesImport(ontologies=mapping_ont, download=True)
    importer.databasesImport(databases=licensed_dbs, n_jobs=n_jobs, download=False)
    logger.info("Building database > step 2: Loading all missing nodes and entities")
    loader.partialUpdate(imports=minimal_load, specific=[])

    return True
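
As with Example #1, a hypothetical invocation for completeness; the user name and job count are placeholders:

# Hypothetical: rebuild licensed databases and ontologies after restoring a dump.
run_minimal_update(user='admin', n_jobs=4)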
Example #4
def run_processing(n_clicks, project_id):
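    """
    Process the files staged in the session's temporary upload directory for the
    given project: validate the experimental design and clinical files, copy them
    into the project's experiments directory, generate the corresponding import
    files and load the 'experiment' entities into the graph database. Returns a
    status message, two style dictionaries and a project summary table.
    """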
    message = None
    style = {'display': 'none'}
    table = None

    if n_clicks > 0:
        session_cookie = flask.request.cookies.get('custom-auth-session')
        destDir = os.path.join(ckg_config['experiments_directory'], project_id)
        builder_utils.checkDirectory(destDir)
        temporaryDirectory = os.path.join(ckg_config['tmp_directory'],
                                          session_cookie + "upload")
        datasets = builder_utils.listDirectoryFoldersNotEmpty(
            temporaryDirectory)
        driver = connector.getGraphDatabaseConnectionConfiguration()
        if driver is not None:
            res_n = dataUpload.check_samples_in_project(driver, project_id)
            if 'experimental_design' in datasets:
                dataset = 'experimental_design'
                directory = os.path.join(temporaryDirectory, dataset)
                destination = os.path.join(destDir, dataset)
                experimental_files = os.listdir(directory)
                regex = r"{}.+".format(config['file_design'].replace(
                    'PROJECTID', project_id))
                r = re.compile(regex)
                experimental_filename = list(
                    filter(r.match, experimental_files))
                if len(experimental_filename) > 0:
                    experimental_filename = experimental_filename.pop()
                    designData = builder_utils.readDataset(
                        os.path.join(directory, experimental_filename))
                    designData = designData.astype(str)
                    designData.columns = [
                        c.lower() for c in designData.columns
                    ]
                    required_cols = ['subject external_id',
                                     'biological_sample external_id',
                                     'analytical_sample external_id']
                    if all(c in designData.columns for c in required_cols):
                        if (res_n > 0).any().values.sum() > 0:
                            res = dataUpload.remove_samples_nodes_db(
                                driver, project_id)
                            res_n = dataUpload.check_samples_in_project(
                                driver, project_id)
                            if (res_n > 0).any().values.sum() > 0:
                                message = 'ERROR: There is already an experimental design loaded into the database and there was an error when trying to delete it. Contact your administrator.'
                                return message, style, style, table

                        res_n = None
                        result = create_new_identifiers.apply_async(
                            args=[
                                project_id,
                                designData.to_json(), directory,
                                experimental_filename
                            ],
                            task_id='data_upload_' + session_cookie +
                            datetime.now().strftime('%Y%m-%d%H-%M%S-'),
                            queue='creation')
                        result_output = result.wait(timeout=None,
                                                    propagate=True,
                                                    interval=0.2)
                        res_n = pd.DataFrame.from_dict(result_output['res_n'])
                        builder_utils.copytree(directory, destination)
                    else:
                        message = 'ERROR: The Experimental design file provided ({}) is missing some of the required fields: {}'.format(
                            experimental_filename, ','.join([
                                'subject external_id',
                                'biological_sample external_id',
                                'analytical_sample external_id'
                            ]))
                        builder_utils.remove_directory(directory)

                        return message, style, style, table

            if 'clinical' in datasets:
                dataset = 'clinical'
                directory = os.path.join(temporaryDirectory, dataset)
                clinical_files = os.listdir(directory)
                regex = r"{}.+".format(config['file_clinical'].replace(
                    'PROJECTID', project_id))
                r = re.compile(regex)
                clinical_filename = list(filter(r.match, clinical_files))
                if len(clinical_filename) > 0:
                    clinical_filename = clinical_filename.pop()
                    data = builder_utils.readDataset(
                        os.path.join(directory, clinical_filename))
                    data.columns = [c.lower() for c in data.columns]
                    external_ids = {}
                    if 'subject external_id' in data and 'biological_sample external_id' in data:
                        external_ids['subjects'] = data[
                            'subject external_id'].astype(
                                str).unique().tolist()
                        external_ids['biological_samples'] = data[
                            'biological_sample external_id'].astype(
                                str).unique().tolist()
                        dataUpload.create_mapping_cols_clinical(
                            driver,
                            data,
                            directory,
                            clinical_filename,
                            separator=separator)  # 'separator' is not defined in this snippet; it comes from the surrounding module
                        if 0 in res_n.values:
                            samples = ', '.join(
                                [k for k, v in res_n.items() if (v == 0).any()])
                            message = 'ERROR: No {} for project {} in the database. Please upload first the experimental design (ExperimentalDesign_{}.xlsx)'.format(
                                samples, project_id, project_id)
                            builder_utils.remove_directory(directory)

                            return message, style, style, table
                        else:
                            db_ids = dataUpload.check_external_ids_in_db(
                                driver, project_id).to_dict()
                            message = ''
                            intersections = {}
                            differences_in = {}
                            differences_out = {}
                            for col in external_ids:
                                intersect = list(
                                    set(db_ids[col].values()).intersection(
                                        external_ids[col]))
                                difference_in = list(
                                    set(db_ids[col].values()).difference(
                                        external_ids[col]))
                                difference_out = list(
                                    set(external_ids[col]).difference(
                                        set(db_ids[col].values())))
                                if len(difference_in) > 0 or len(
                                        difference_out) > 0:
                                    intersections[col] = intersect
                                    differences_in[col] = difference_in
                                    differences_out[col] = difference_out
                            for col in intersections:
                                message += 'WARNING: Some {} identifiers were not matched:\n Matching: {}\n No information provided: {} \n Non-existing in the database: {}\n'.format(
                                    col, len(intersections[col]),
                                    ','.join(differences_in[col]),
                                    ','.join(differences_out[col]))
                    else:
                        message = 'ERROR: Format of the Clinical Data file is not correct. Check template in the documentation. Check columns: subject external_id, biological_sample external_id and analytical_sample external_id'
                        builder_utils.remove_directory(directory)

                        return message, style, style, table
            try:
                for dataset in datasets:
                    if dataset != "experimental_design":
                        source = os.path.join(temporaryDirectory, dataset)
                        destination = os.path.join(destDir, dataset)
                        builder_utils.copytree(source, destination)
                        datasetPath = os.path.join(
                            os.path.join(
                                ckg_config['imports_experiments_directory'],
                                project_id), dataset)
                        eh.generate_dataset_imports(project_id, dataset,
                                                    datasetPath)

                loader.partialUpdate(imports=['experiment'],
                                     specific=[project_id])
                filename = os.path.join(ckg_config['tmp_directory'],
                                        'Uploaded_files_' + project_id)
                utils.compress_directory(filename,
                                         temporaryDirectory,
                                         compression_format='zip')
                style.update({'display': 'inline-block'})
                message = 'Files successfully uploaded.'
                table = dataUpload.get_project_information(driver, project_id)
                if table is None:
                    message = 'Error: No data was uploaded for project: {}. Review your experimental design and data files.'.format(
                        project_id)
            except Exception as err:
                style.update({'display': 'none'})
                message = str(err)
        else:
            style.update({'display': 'none'})
            message = "ERROR: Database is offline. Contact your administrator or start the database."

    return message, style, style, table
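
run_processing reads a Flask session cookie and returns a message, two style dictionaries and a table, which suggests it is registered as a Dash callback. A hedged sketch of such a registration follows; the app object and every component id are hypothetical and not taken from the code above:

import dash
from dash import Input, Output

app = dash.Dash(__name__)

# Hypothetical wiring: four outputs matching the four returned values,
# two inputs for the button clicks and the project identifier.
app.callback(
    Output('upload-message', 'children'),
    Output('upload-result-area', 'style'),
    Output('download-area', 'style'),
    Output('project-table', 'children'),
    Input('submit-button', 'n_clicks'),
    Input('project-id', 'value'),
)(run_processing)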
Example #5
def main():
    parser = set_arguments()
    args = parser.parse_args()
    download = str(args.download).lower() == "true"
    if args.build_type == 'full':
        run_full_update(args.user, args.n_jobs, download)
    elif args.build_type == 'minimal':
        run_minimal_update(args.user, args.n_jobs)
    elif args.build_type == 'import':
        logger.info("The user chose to perform a partial build")
        if args.import_types is not None:
            if args.data is None or len(args.data) > 0:
                logger.info("The build will import data from {}".format("".join(args.import_types)))
                for import_type in args.import_types:
                    logger.info("Importing {}: {}".format(import_type, args.data))
                    if import_type.lower() == 'experiments' or import_type.lower() == 'experiment':
                        importer.experimentsImport(projects=args.data, n_jobs=1)
                    elif import_type.lower() == 'users' or import_type.lower() == 'user':
                        importer.usersImport(importDirectory=directories['importDirectory'])
                    elif import_type.lower() == 'databases' or import_type.lower() == 'database':
                        databases = [d.lower() for d in dbconfig['databases']]
                        if args.data is not None:
                            valid_entities = [x.lower() for x in args.data if x.lower() in databases]
                        else:
                            valid_entities = databases
                        if len(valid_entities) > 0:
                            logger.info("These entities will be imported: {}".format(", ".join(valid_entities)))
                            print("These entities will be imported: {}".format(", ".join(valid_entities)))
                            importer.databasesImport(databases=valid_entities, n_jobs=args.n_jobs, download=download)
                        else:
                            logger.error("The indicated entities (--data) cannot be imported: {}".format(args.data))
                            print("The indicated entities (--data) cannot be imported: {}".format(args.data))
                    elif import_type.lower() == 'ontologies' or import_type.lower() == 'ontology':
                        ontologies = [d.lower() for d in oconfig['ontologies']]
                        if args.data is not None:
                            valid_entities = [x.capitalize() for x in args.data if x.lower() in ontologies]
                        else:
                            valid_entities = ontologies
                        if len(valid_entities) > 0:
                            logger.info("These entities will be imported: {}".format(", ".join(valid_entities)))
                            print("These entities will be loaded into the database: {}".format(", ".join(valid_entities)))
                            importer.ontologiesImport(ontologies=valid_entities, download=download)
                        else:
                            logger.error("The indicated entities (--data) cannot be imported: {}".format(args.data))
                            print("The indicated entities (--data) cannot be imported: {}".format(args.data))
            else:
                print("Indicate the data to be imported by passing the argument --data and the list to be imported. \
                                Example: python builder.py --build_type import --import_types databases --data UniProt")
    elif args.build_type == 'load':
        logger.info("The build will load data into the database: {}".format("".join(args.load_entities)))
        valid_entities = []
        specific = args.specific
        if len(args.load_entities) > 0:
            valid_entities = [x.lower() for x in args.load_entities if x.lower() in config['graph']]
        else:
            valid_entities = config['graph']
        if len(valid_entities) > 0:
            logger.info("These entities will be loaded into the database: {}".format(", ".join(valid_entities)))
            print("These entities will be loaded into the database: {}".format(", ".join(valid_entities)))
            loader.partialUpdate(imports=valid_entities, specific=specific)
        else:
            logger.error("The indicated entities (--load_entities) cannot be loaded: {}".format(args.load_entities))
            print("The indicated entities (--load_entities) cannot be loaded into the database: {}".format(args.load_entities))
    else:
        print("Indicate the type of build you want to perform, either import (generate csv files to be loaded into the database), \
                                    load (load csv files into the database) or full (import and then load all the data into the database) \
                                    Example: Import > python builder.py --build_type import --import_types databases --data UniProt\n \
                                    Load > python builder.py --build_type load --load_types Mentions\n \
                                    Full > python builder.py --build_type full or simpy python builder.py")
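
main() relies on a set_arguments() helper that is not shown in these examples. A sketch of an argparse parser consistent with the attributes main() reads is given below; the flag spellings, defaults and choices are assumptions:

import argparse

def set_arguments():
    # Hypothetical parser covering the attributes accessed in main():
    # build_type, user, n_jobs, download, import_types, data,
    # load_entities and specific.
    parser = argparse.ArgumentParser(description='Graph database builder')
    parser.add_argument('--build_type', default='full',
                        choices=['full', 'minimal', 'import', 'load'])
    parser.add_argument('--user', default='ckg_user')
    parser.add_argument('--n_jobs', type=int, default=3)
    parser.add_argument('--download', default='True')
    parser.add_argument('--import_types', nargs='+', default=None)
    parser.add_argument('--data', nargs='+', default=None)
    parser.add_argument('--load_entities', nargs='+', default=[])
    parser.add_argument('--specific', nargs='+', default=[])
    return parser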