def test_link_genes(
    genes37_handle,
    hgnc_handle,
    exac_handle,
    mim2gene_handle,
    genemap_handle,
    hpo_genes_handle,
):
    """Check that genes linked from all sources carry their core fields."""
    ## GIVEN ensembl, hgnc, exac, omim and hpo gene information
    ## WHEN linking the information from the different sources
    genes = link_genes(
        ensembl_lines=genes37_handle,
        hgnc_lines=hgnc_handle,
        exac_lines=exac_handle,
        mim2gene_lines=mim2gene_handle,
        genemap_lines=genemap_handle,
        hpo_lines=hpo_genes_handle,
    )
    ## THEN assert that some genes were linked, otherwise the loop below
    ## would succeed without checking a single gene
    assert genes, "link_genes did not return any genes"

    ## THEN assert that every gene has identifiers and coordinates
    for gene_obj in genes.values():
        assert gene_obj["hgnc_symbol"]
        assert gene_obj["hgnc_id"]
        assert gene_obj["chromosome"]
        assert gene_obj["start"]
        assert gene_obj["end"]

        # The current symbol is expected among the previous symbols as well
        assert gene_obj["hgnc_symbol"] in gene_obj["previous_symbols"]
Beispiel #2
0
def genes(context, build, api_key):
    """
    Drop the stored genes and load freshly linked genes into the database.

    The OMIM files are fetched with the api key, the hpo genes are fetched,
    and the hgnc, ensembl transcript and exac files are read from the
    module level paths.

    Args:
        context: Context object. ``context.obj`` holds the 'adapter' and
                 optionally an 'omim_api_key'; ``context.abort()`` is called
                 when the OMIM files can not be fetched
        build(str): Genome build to load. If falsy both '37' and '38' are loaded
        api_key(str): OMIM api key. Falls back to ``context.obj['omim_api_key']``
    """
    adapter = context.obj['adapter']

    # An api key is needed to fetch the omim files
    api_key = api_key or context.obj.get('omim_api_key')
    if not api_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    try:
        mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
    except Exception as err:
        LOG.warning(err)
        context.abort()

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")

    hpo_genes = fetch_hpo_genes()

    builds_to_load = [build] if build else ['37', '38']

    for genome_build in builds_to_load:
        LOG.info("Loading hgnc file from {0}".format(hgnc_path))
        hgnc_handle = get_file_handle(hgnc_path)

        # Only '37' and '38' have a transcripts file; any other build is
        # passed on to link_genes without ensembl lines
        if genome_build == '37':
            ensembl_handle = get_file_handle(transcripts37_path)
        elif genome_build == '38':
            ensembl_handle = get_file_handle(transcripts38_path)
        else:
            ensembl_handle = None

        LOG.info("Loading exac gene file from {0}".format(exac_path))
        exac_handle = get_file_handle(exac_path)

        linked_genes = link_genes(
            ensembl_lines=ensembl_handle,
            hgnc_lines=hgnc_handle,
            exac_lines=exac_handle,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes
        )

        load_hgnc_genes(adapter=adapter, genes=linked_genes, build=genome_build)
Beispiel #3
0
def genes(request, genes37_handle, hgnc_handle, exac_handle, mim2gene_handle,
          genemap_handle, hpo_genes_handle):
    """Return the genes linked from the ensembl, hgnc, exac, omim and hpo handles"""
    print('')

    resources = dict(
        ensembl_lines=genes37_handle,
        hgnc_lines=hgnc_handle,
        exac_lines=exac_handle,
        mim2gene_lines=mim2gene_handle,
        genemap_lines=genemap_handle,
        hpo_lines=hpo_genes_handle,
    )
    return link_genes(**resources)
Beispiel #4
0
def genes(ctx, update, build):
    """
    Load linked genes for one genome build into the database.

    Args:
        ctx: Context object. ``ctx.obj['adapter']`` is the database adapter
             and ``ctx.abort()`` is called when genes exist and no update
             was requested
        update(bool): If truthy, existing genes are dropped before loading
        build(str): '37' selects the build 37 transcripts file, any other
                    value selects the build 38 file
    """
    adapter = ctx.obj['adapter']

    # Genes that are already present are only replaced on request
    if adapter.nr_genes(build=build) > 0:
        if not update:
            logger.info("Genes are already loaded")
            logger.info("If you wish to update genes use '--update'")
            ctx.abort()
        else:
            # NOTE(review): the count above is per build, but drop_genes() is
            # called without a build -- confirm dropping everything is intended
            logger.warning("Dropping all gene information")
            adapter.drop_genes()
            logger.info("Genes dropped")

    logger.info("Loading hgnc file from {0}".format(hgnc_path))
    hgnc_lines = get_file_handle(hgnc_path)

    if build != '37':
        ensembl_lines = get_file_handle(transcripts38_path)
    else:
        logger.info("Loading ensembl transcript file from {0}".format(
                    transcripts37_path))
        ensembl_lines = get_file_handle(transcripts37_path)

    logger.info("Loading exac gene file from {0}".format(
                exac_path))
    exac_lines = get_file_handle(exac_path)

    logger.info("Loading mim information from files {0}, {1}".format(
                mim2gene_path, genemap2_path))

    mim2gene_lines = get_file_handle(mim2gene_path)
    genemap_lines = get_file_handle(genemap2_path)
    hpo_lines = get_file_handle(hpogenes_path)

    linked_genes = link_genes(
        ensembl_lines=ensembl_lines,
        hgnc_lines=hgnc_lines,
        exac_lines=exac_lines,
        mim2gene_lines=mim2gene_lines,
        genemap_lines=genemap_lines,
        hpo_lines=hpo_lines
    )

    load_hgnc_genes(adapter=adapter, genes=linked_genes, build=build)
Beispiel #5
0
def genes(request, transcripts_file, hgnc_file, exac_file, mim2gene_file,
          genemap_file, hpo_genes_file):
    """Open the gene resource files and return the genes linked from them"""
    print('')

    # Keyword arguments are evaluated in order, so the files are opened in
    # the sequence transcripts, hgnc, exac, mim2gene, genemap, hpo
    return link_genes(
        ensembl_lines=get_file_handle(transcripts_file),
        hgnc_lines=get_file_handle(hgnc_file),
        exac_lines=get_file_handle(exac_file),
        mim2gene_lines=get_file_handle(mim2gene_file),
        genemap_lines=get_file_handle(genemap_file),
        hpo_lines=get_file_handle(hpo_genes_file),
    )
def test_link_genes_no_omim(genes37_handle, hgnc_handle, exac_handle,
                            hpo_genes_handle):
    """Check that genes can be linked when no OMIM resources are given."""
    ## GIVEN gene information without OMIM
    ## WHEN linking the information from the different sources
    genes = link_genes(
        ensembl_lines=genes37_handle,
        hgnc_lines=hgnc_handle,
        exac_lines=exac_handle,
        hpo_lines=hpo_genes_handle,
    )
    ## THEN assert that some genes were linked, otherwise the loop below
    ## would succeed without checking a single gene
    assert genes, "link_genes did not return any genes"

    ## THEN assert that it works even without omim
    for gene_obj in genes.values():
        assert gene_obj["hgnc_symbol"]
        assert gene_obj["hgnc_id"]
        assert gene_obj["chromosome"]
        assert gene_obj["start"]
        assert gene_obj["end"]

        # The current symbol is expected among the previous symbols as well
        assert gene_obj["hgnc_symbol"] in gene_obj["previous_symbols"]
def test_link_genes(genes37_handle, hgnc_handle, exac_handle,
                    mim2gene_handle, genemap_handle, hpo_genes_handle):
    """Check that genes linked from all sources carry their core fields."""
    ## GIVEN ensembl, hgnc, exac, omim and hpo gene information
    ## WHEN linking the information from the different sources
    genes = link_genes(
        ensembl_lines=genes37_handle,
        hgnc_lines=hgnc_handle,
        exac_lines=exac_handle,
        mim2gene_lines=mim2gene_handle,
        genemap_lines=genemap_handle,
        hpo_lines=hpo_genes_handle,
    )
    ## THEN assert that some genes were linked, otherwise the loop below
    ## would succeed without checking a single gene
    assert genes, 'link_genes did not return any genes'

    ## THEN assert that every gene has identifiers and coordinates
    for gene_obj in genes.values():
        assert gene_obj['hgnc_symbol']
        assert gene_obj['hgnc_id']
        assert gene_obj['chromosome']
        assert gene_obj['start']
        assert gene_obj['end']

        # The current symbol is expected among the previous symbols as well
        assert gene_obj['hgnc_symbol'] in gene_obj['previous_symbols']
Beispiel #8
0
def load_hgnc_genes(
    adapter,
    genes=None,
    ensembl_lines=None,
    hgnc_lines=None,
    exac_lines=None,
    mim2gene_lines=None,
    genemap_lines=None,
    hpo_lines=None,
    build="37",
    omim_api_key="",
):
    """Build gene objects and bulk load them into the database

    When no linked genes are passed in, every resource that was not provided
    is fetched and link_genes merges them into a dictionary whose values hold
    the information of one gene each. Genes without a chromosome are skipped.

    Args:
        adapter(scout.adapter.MongoAdapter)
        genes(dict): Already linked genes. If empty or None they are linked here
        ensembl_lines(iterable(str)): Lines formated with ensembl gene information
        hgnc_lines(iterable(str)): Lines with gene information from genenames.org
        exac_lines(iterable(str)): Lines with information pLi-scores from ExAC
        mim2gene_lines(iterable(str)): Lines with map from omim id to gene symbol
        genemap_lines(iterable(str)): Lines with information of omim entries
        hpo_lines(iterable(str)): Lines information about map from hpo terms to genes
        build(str): What build to use. Defaults to '37'
        omim_api_key(str): Used to fetch the omim files when mim2gene_lines
                           and genemap_lines are not both provided

    Returns:
        gene_objects(list): A list with all gene_objects that was loaded into database
    """
    if not genes:
        # Fetch whatever resource the caller did not hand over
        if ensembl_lines is None:
            ensembl_lines = fetch_ensembl_genes(build=build)
        if not hgnc_lines:
            hgnc_lines = fetch_hgnc()
        if not exac_lines:
            exac_lines = fetch_exac_constraint()

        # Both omim files are needed; they are fetched as a pair or not at all
        if not (mim2gene_lines and genemap_lines):
            if omim_api_key:
                mim_files = fetch_mim_files(omim_api_key,
                                            mim2genes=True,
                                            genemap2=True)
                mim2gene_lines = mim_files["mim2genes"]
                genemap_lines = mim_files["genemap2"]
            else:
                LOG.warning("No omim api key provided!")

        if not hpo_lines:
            hpo_lines = fetch_hpo_files(hpogenes=True)["hpogenes"]

        # Merge all resources into one dictionary of genes
        genes = link_genes(
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            hpo_lines=hpo_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
        )

    gene_objects = []
    without_coordinates = 0

    with progressbar(genes.values(), label="Building genes",
                     length=len(genes)) as bar:
        for gene_data in bar:
            if gene_data.get("chromosome"):
                gene_objects.append(build_hgnc_gene(gene_data, build=build))
                continue

            # No chromosome means the gene can not be placed in this build
            LOG.debug(
                "skipping gene: %s. No coordinates found",
                gene_data.get("hgnc_symbol", "?"),
            )
            without_coordinates += 1

    LOG.info("Loading genes build %s", build)
    adapter.load_hgnc_bulk(gene_objects)

    LOG.info("Loading done. %s genes loaded", len(gene_objects))
    LOG.info("Nr of genes without coordinates in build %s: %s", build,
             without_coordinates)

    return gene_objects
Beispiel #9
0
def database(context, institute_name, user_name, user_mail):
    """Setup a scout database

    Drops the existing collections, adds an institute and an admin user,
    loads the linked genes for build 37 and 38, loads the hpo terms and
    diseases and creates the hgnc gene index.

    Args:
        context: Context object. ``context.obj`` holds the adapter, the
                 default names and the resource handles read below
        institute_name(str): Falls back to ``context.obj['institute_name']``
        user_name(str): Falls back to ``context.obj['user_name']``
        user_mail(str): Falls back to ``context.obj['user_mail']``
    """
    log.info("Running scout setup database")

    institute_name = institute_name or context.obj['institute_name']
    user_name = user_name or context.obj['user_name']
    user_mail = user_mail or context.obj['user_mail']

    adapter = context.obj['adapter']

    log.info("Setting up database %s", context.obj['mongodb'])
    log.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        # System collections belong to the database server and are not
        # dropped during a setup
        if collection_name.startswith('system'):
            continue
        log.info("Deleting collection %s", collection_name)
        adapter.db.drop_collection(collection_name)
    log.info("Database deleted")

    # Build a institute with id institute_name
    institute_obj = build_institute(
        internal_id=institute_name,
        display_name=institute_name,
        sanger_recipients=[user_mail]
    )

    # Add the institute to database
    adapter.add_institute(institute_obj)

    # Build a user obj
    user_obj = dict(
                _id=user_mail,
                email=user_mail,
                name=user_name,
                roles=['admin'],
                institutes=[institute_name]
            )

    adapter.add_user(user_obj)

    # Load the genes and transcripts
    hgnc_handle = context.obj['hgnc']
    transcripts37_handle = context.obj['transcripts37']
    transcripts38_handle = context.obj['transcripts38']
    exac_handle = context.obj['exac']
    hpo_genes_handle = context.obj['hpogenes']

    mim2gene_handle = context.obj['mim2gene']
    genemap_handle = context.obj['genemap2']

    genes37 = link_genes(
        ensembl_lines=transcripts37_handle,
        hgnc_lines=hgnc_handle,
        exac_lines=exac_handle,
        mim2gene_lines=mim2gene_handle,
        genemap_lines=genemap_handle,
        hpo_lines=hpo_genes_handle,
    )

    load_hgnc_genes(adapter, genes37, build='37')

    # Build 38 is linked from its own set of handles in the context
    genes38 = link_genes(
        ensembl_lines=transcripts38_handle,
        hgnc_lines=context.obj['hgnc38'],
        exac_lines=context.obj['exac38'],
        mim2gene_lines=context.obj['mim2gene38'],
        genemap_lines=context.obj['genemap2_38'],
        hpo_lines=context.obj['hpogenes_38'],
    )

    load_hgnc_genes(adapter, genes38, build='38')

    hpo_terms_handle = context.obj['hpo_terms']
    disease_handle = context.obj['disease_terms']
    hpo_disease_handle = context.obj['hpodiseases']

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        disease_lines=disease_handle,
        hpo_disease_lines=hpo_disease_handle
    )

    log.info("Creating indexes")

    adapter.hgnc_collection.create_index([('build', pymongo.ASCENDING),
                                          ('chromosome', pymongo.ASCENDING)])
    log.info("hgnc gene index created")

    log.info("Scout instance setup successful")
Beispiel #10
0
def demo(context):
    """Setup a scout demo instance. This instance will be populated with a
       case a gene panel and some variants.

    Drops the existing collections, adds an institute and an admin user,
    loads the linked genes for build 37, the hpo terms and diseases, a gene
    panel and a case, and creates the hgnc gene index.

    Args:
        context: Context object. ``context.obj`` holds the adapter, the
                 institute and user names and the resource handles read below
    """
    log.info("Running scout setup demo")
    institute_name = context.obj['institute_name']
    user_name = context.obj['user_name']
    user_mail = context.obj['user_mail']

    adapter = context.obj['adapter']

    log.info("Setting up database %s", context.obj['mongodb'])
    log.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        # System collections belong to the database server and are not
        # dropped during a setup
        if collection_name.startswith('system'):
            continue
        log.info("Deleting collection %s", collection_name)
        adapter.db.drop_collection(collection_name)
    log.info("Database deleted")

    # Build a institute with id institute_name
    institute_obj = build_institute(
        internal_id=institute_name,
        display_name=institute_name,
        sanger_recipients=[user_mail]
    )

    # Add the institute to database
    adapter.add_institute(institute_obj)

    # Build a user obj
    user_obj = dict(
                _id=user_mail,
                email=user_mail,
                name=user_name,
                roles=['admin'],
                institutes=[institute_name]
            )

    adapter.add_user(user_obj)

    # Load the genes and transcripts
    hgnc_handle = context.obj['hgnc']
    transcripts37_handle = context.obj['transcripts37']
    # transcripts38_handle = context.obj['transcripts38']
    exac_handle = context.obj['exac']
    hpo_genes_handle = context.obj['hpogenes']
    mim2gene_handle = context.obj['mim2gene']
    genemap_handle = context.obj['genemap2']

    genes37 = link_genes(
        ensembl_lines=transcripts37_handle,
        hgnc_lines=hgnc_handle,
        exac_lines=exac_handle,
        mim2gene_lines=mim2gene_handle,
        genemap_lines=genemap_handle,
        hpo_lines=hpo_genes_handle,
    )

    load_hgnc_genes(adapter, genes37, build='37')

    hpo_terms_handle = context.obj['hpo_terms']
    disease_handle = context.obj['disease_terms']
    hpo_disease_handle = context.obj['hpodiseases']

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        disease_lines=disease_handle,
        hpo_disease_lines=hpo_disease_handle
    )

    panel_info = {
            'date': datetime.datetime.now(),
            'file': panel_path,
            'type': 'clinical',
            'institute': 'cust000',
            'version': '1.0',
            'panel_name': 'panel1',
            'full_name': 'Test panel'
        }

    # The built panel itself is not used; the parse and build calls are kept
    # because they presumably validate the panel before loading -- TODO confirm
    parsed_panel = parse_gene_panel(panel_info)
    build_panel(parsed_panel, adapter)
    load_panel(
        adapter=adapter,
        panel_info=panel_info
    )

    case_handle = get_file_handle(load_path)
    # safe_load only builds plain Python objects and, unlike yaml.load()
    # without a Loader argument, is accepted by every PyYAML version
    case_data = yaml.safe_load(case_handle)

    case_data['vcf_snv'] = clinical_snv_path
    case_data['vcf_sv'] = clinical_sv_path
    case_data['vcf_snv_research'] = research_snv_path
    case_data['vcf_sv_research'] = research_sv_path
    case_data['madeline'] = madeline_path

    load_scout(adapter, case_data)

    log.info("Creating indexes")

    adapter.hgnc_collection.create_index([('build', pymongo.ASCENDING),
                                          ('chromosome', pymongo.ASCENDING)])
    log.info("hgnc gene index created")

    log.info("Scout demo instance setup successful")
Beispiel #11
0
def database(context, institute_name, user_name, user_mail, api_key):
    """Setup a scout database

    The omim files are fetched first, then the non system collections are
    dropped, an institute and an admin user are added, the linked genes for
    build 37 and 38 and the diseases are loaded and the indexes are created.

    Args:
        context: Context object. ``context.obj`` holds the adapter and the
                 defaults; ``context.abort()`` is called when the omim files
                 can not be fetched
        institute_name(str): Falls back to ``context.obj['institute_name']``
        user_name(str): Falls back to ``context.obj['user_name']``
        user_mail(str): Falls back to ``context.obj['user_mail']``
        api_key(str): OMIM api key. Falls back to ``context.obj['omim_api_key']``
    """
    LOG.info("Running scout setup database")

    # The omim files are fetched before anything in the database is touched
    api_key = api_key or context.obj.get('omim_api_key')
    if not api_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    try:
        mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
    except Exception as err:
        LOG.warning(err)
        context.abort()

    institute_name = institute_name or context.obj['institute_name']
    user_name = user_name or context.obj['user_name']
    user_mail = user_mail or context.obj['user_mail']

    adapter = context.obj['adapter']

    LOG.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        # Collections with a 'system' prefix are left in place
        if collection_name.startswith('system'):
            continue
        LOG.info("Deleting collection %s", collection_name)
        adapter.db.drop_collection(collection_name)
    LOG.info("Database deleted")

    LOG.info("Setting up database %s", context.obj['mongodb'])

    # The institute uses institute_name both as id and as display name
    adapter.add_institute(
        build_institute(
            internal_id=institute_name,
            display_name=institute_name,
            sanger_recipients=[user_mail]
        )
    )

    # The user is keyed on the mail address and gets the admin role
    adapter.add_user({
        '_id': user_mail,
        'email': user_mail,
        'name': user_name,
        'roles': ['admin'],
        'institutes': [institute_name],
    })

    # Fetch the genes to hpo information
    hpo_genes = fetch_hpo_genes()

    # Link and load the genes, first for build 37 and then for build 38.
    # The hgnc and exac files are opened again for each build.
    for genome_build, transcripts_path in (('37', transcripts37_path),
                                           ('38', transcripts38_path)):
        linked_genes = link_genes(
            ensembl_lines=get_file_handle(transcripts_path),
            hgnc_lines=get_file_handle(hgnc_path),
            exac_lines=get_file_handle(exac_path),
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
        )

        load_hgnc_genes(adapter, linked_genes, build=genome_build)

    load_hpo(
        adapter=adapter,
        disease_lines=mim_files['genemap2'],
    )

    LOG.info("Creating indexes")

    adapter.load_indexes()

    LOG.info("Scout instance setup successful")
Beispiel #12
0
def demo(context):
    """Setup a scout demo instance. This instance will be populated with a
       case a gene panel and some variants.

    Drops the existing collections, adds an institute and an admin user,
    loads the linked genes for build 37 from the reduced resource files,
    loads the hpo terms and diseases, a gene panel and a case, and creates
    the indexes.

    Args:
        context: Context object. ``context.obj`` holds the adapter and the
                 institute and user names
    """
    LOG.info("Running scout setup demo")
    institute_name = context.obj['institute_name']
    user_name = context.obj['user_name']
    user_mail = context.obj['user_mail']

    adapter = context.obj['adapter']

    LOG.info("Setting up database %s", context.obj['mongodb'])
    LOG.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        # System collections belong to the database server and are not
        # dropped during a setup
        if collection_name.startswith('system'):
            continue
        LOG.info("Deleting collection %s", collection_name)
        adapter.db.drop_collection(collection_name)
    LOG.info("Database deleted")

    # Build a institute with id institute_name
    institute_obj = build_institute(
        internal_id=institute_name,
        display_name=institute_name,
        sanger_recipients=[user_mail]
    )

    # Add the institute to database
    adapter.add_institute(institute_obj)

    # Build a user obj
    user_obj = dict(
                _id=user_mail,
                email=user_mail,
                name=user_name,
                roles=['admin'],
                institutes=[institute_name]
            )

    adapter.add_user(user_obj)

    # Open the resources that are loaded below. Only build 37 is loaded in
    # the demo, so no handles are opened for build 38.
    LOG.info("Loading hgnc genes from %s", hgnc_reduced_path)
    hgnc_handle = get_file_handle(hgnc_reduced_path)

    LOG.info("Loading exac genes from %s", exac_reduced_path)
    exac_handle = get_file_handle(exac_reduced_path)

    LOG.info("Loading mim2gene info from %s", mim2gene_reduced_path)
    mim2gene_handle = get_file_handle(mim2gene_reduced_path)

    LOG.info("Loading genemap info from %s", genemap2_reduced_path)
    genemap_handle = get_file_handle(genemap2_reduced_path)

    LOG.info("Loading hpo gene info from %s", hpogenes_reduced_path)
    hpo_genes_handle = get_file_handle(hpogenes_reduced_path)
    hpo_to_genes_handle = get_file_handle(hpo_to_genes_reduced_path)
    LOG.info("Loading hpo disease info from %s", hpo_phenotype_to_terms_reduced_path)
    hpo_disease_handle = get_file_handle(hpo_phenotype_to_terms_reduced_path)
    LOG.info("Loading hpo terms from %s", hpoterms_reduced_path)
    hpo_terms_handle = get_file_handle(hpoterms_reduced_path)

    # The genemap file is opened a second time since it is read both when
    # linking the genes and when loading the diseases
    LOG.info("Loading omim disease info from %s", genemap2_reduced_path)
    disease_handle = get_file_handle(genemap2_reduced_path)

    LOG.info("Loading transcripts build 37 info from %s", transcripts37_reduced_path)
    transcripts37_handle = get_file_handle(transcripts37_reduced_path)

    genes37 = link_genes(
        ensembl_lines=transcripts37_handle,
        hgnc_lines=hgnc_handle,
        exac_lines=exac_handle,
        mim2gene_lines=mim2gene_handle,
        genemap_lines=genemap_handle,
        hpo_lines=hpo_genes_handle,
    )

    load_hgnc_genes(adapter, genes37, build='37')

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
        disease_lines=disease_handle,
        hpo_disease_lines=hpo_disease_handle
    )

    adapter.load_panel(
        path=panel_path,
        institute='cust000',
        panel_id='panel1',
        date=datetime.datetime.now(),
        panel_type='clinical',
        version=1.0,
        display_name='Test panel'
    )

    case_handle = get_file_handle(load_path)
    # safe_load only builds plain Python objects and, unlike yaml.load()
    # without a Loader argument, is accepted by every PyYAML version
    case_data = yaml.safe_load(case_handle)

    adapter.load_case(case_data)

    LOG.info("Creating indexes")

    adapter.load_indexes()

    LOG.info("Scout demo instance setup successful")