def generate_exac_genes(genes):
    """Yield the ExAC constraint lines that belong to a set of genes.

    The first line returned by fetch_exac_constraint() is always yielded
    (presumably the header line), followed by the raw line of every parsed
    entry whose hgnc symbol is present in ``genes``.

    Args:
        genes(dict): A dictionary with hgnc_symbol as key and hgnc_id as value

    Yields:
        print_line(str): Lines from the reduced file
    """
    all_lines = fetch_exac_constraint()

    # The first line is emitted unconditionally, before any filtering
    yield all_lines[0]

    for entry in parse_exac_genes(all_lines):
        symbol = entry.get('hgnc_symbol')
        # Drop entries that lack a symbol or whose symbol is not requested
        if symbol and symbol in genes:
            yield entry['raw']
def generate_exac_genes(genes):
    """Generate a reduced file with ExAC constraint information.

    Only the first line of the fetched ExAC lines and the lines of entries
    whose hgnc symbol is found in ``genes`` are kept.

    Args:
        genes(dict): A dictionary with hgnc_symbol as key and hgnc_id as value

    Yields:
        print_line(str): Lines from the reduced file
    """
    constraint_lines = fetch_exac_constraint()

    # First line goes out as-is, regardless of the requested genes
    yield constraint_lines[0]

    parsed_entries = parse_exac_genes(constraint_lines)
    for record in parsed_entries:
        gene_symbol = record.get('hgnc_symbol')
        # Skip records without a symbol as well as symbols we were not asked for
        if not gene_symbol or gene_symbol not in genes:
            continue
        yield record['raw']
Exemple #3
0
def genes(build, api_key):
    """Drop and reload gene and transcript information in the database.

    The omim files are downloaded first, so that nothing is dropped if they
    can not be fetched. After that the existing genes and transcripts are
    dropped and new ones are loaded for every requested build.

    Args:
        build(str): Genome build to update. If empty, both '37' and '38'
                    are updated
        api_key(str): OMIM api key. Falls back on the 'OMIM_API_KEY' entry of
                      the app config

    Raises:
        click.Abort: If no api key is available or the omim files can not be
                     fetched
    """
    LOG.info("Running scout update genes")
    adapter = store

    # Fetch the omim information
    api_key = api_key or current_app.config.get('OMIM_API_KEY')
    if not api_key:
        LOG.warning(
            "Please provide a omim api key to load the omim gene panel")
        raise click.Abort()

    try:
        mim_files = fetch_mim_files(
            api_key, mim2genes=True, morbidmap=True, genemap2=True)
    except Exception as err:
        LOG.warning(err)
        raise click.Abort()

    # Remove what is currently stored before loading the new information
    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")
    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    builds = [build] if build else ['37', '38']

    # These resources are shared between the builds
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        ensembl_gene_lines = fetch_ensembl_genes(build=genome_build)

        # load the genes
        loaded_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map ensembl gene ids to the gene objects that were just loaded
        genes_by_ensembl_id = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in loaded_genes
        }

        # Fetch the transcripts from ensembl
        transcript_lines = fetch_ensembl_transcripts(build=genome_build)

        load_transcripts(adapter, transcript_lines, genome_build,
                         genes_by_ensembl_id)

    adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")
Exemple #4
0
def setup_scout(adapter,
                institute_id='cust000',
                user_name='Clark Kent',
                user_mail='*****@*****.**',
                api_key=None,
                demo=False):
    """Wipe the database and populate it with a fresh scout instance.

    All collections whose name does not start with 'system' are dropped.
    After that an institute, an admin user, genes and transcripts for builds
    37 and 38 and the hpo information are loaded. In demo mode the reduced
    files shipped with the package are used instead of downloaded resources,
    and a gene panel and a case are loaded as well.

    Args:
        adapter(scout.adapter.MongoAdapter)
        institute_id(str): Used both as internal id and display name of the
                           institute
        user_name(str): Name of the admin user
        user_mail(str): Used as id and email of the user and as sanger
                        recipient of the institute
        api_key(str): OMIM api key, only used when demo is False
        demo(bool): If True, set up a demo instance from the reduced files

    Raises:
        Exception: Whatever fetch_mim_files raised, re-raised after logging
    """
    ########################## Delete previous information ##########################
    LOG.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        if not collection_name.startswith('system'):
            LOG.info("Deleting collection %s", collection_name)
            adapter.db.drop_collection(collection_name)
    LOG.info("Database deleted")

    ########################## Add a institute ##########################
    #####################################################################
    # Build a institute with id institute_name
    institute_obj = build_institute(internal_id=institute_id,
                                    display_name=institute_id,
                                    sanger_recipients=[user_mail])

    # Add the institute to database
    adapter.add_institute(institute_obj)

    ########################## Add a User ###############################
    #####################################################################
    # Build a user obj
    user_obj = dict(_id=user_mail,
                    email=user_mail,
                    name=user_name,
                    roles=['admin'],
                    institutes=[institute_id])

    adapter.add_user(user_obj)

    ### Get the mim information ###

    if not demo:
        # Fetch the mim files
        try:
            mim_files = fetch_mim_files(api_key,
                                        mim2genes=True,
                                        morbidmap=True,
                                        genemap2=True)
        except Exception as err:
            LOG.warning(err)
            # There is no click context in this function to abort with, so
            # let the original error propagate to the caller instead
            raise
        mim2gene_lines = mim_files['mim2genes']
        genemap_lines = mim_files['genemap2']

        # Fetch the genes to hpo information
        hpo_gene_lines = fetch_hpo_genes()
        # Fetch the latest version of the hgnc information
        hgnc_lines = fetch_hgnc()
        # Fetch the latest exac pli score information
        exac_lines = fetch_exac_constraint()

    else:
        # The lines are read into lists since they are used once per build
        mim2gene_lines = list(get_file_handle(mim2gene_reduced_path))
        genemap_lines = list(get_file_handle(genemap2_reduced_path))

        # Fetch the genes to hpo information
        hpo_gene_lines = list(get_file_handle(hpogenes_reduced_path))
        # Fetch the reduced hgnc information
        hgnc_lines = list(get_file_handle(hgnc_reduced_path))
        # Fetch the reduced exac pli score information
        exac_lines = list(get_file_handle(exac_reduced_path))

    builds = ['37', '38']
    ################## Load Genes and transcripts #######################
    #####################################################################
    for build in builds:
        # Fetch the ensembl information
        # NOTE: in demo mode the same reduced file is used for every build
        if not demo:
            ensembl_genes = fetch_ensembl_genes(build=build)
        else:
            ensembl_genes = get_file_handle(genes37_reduced_path)
        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_genes,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Create a map from ensembl ids to gene objects
        ensembl_genes = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes
        }

        # Fetch the transcripts from ensembl
        if not demo:
            ensembl_transcripts = fetch_ensembl_transcripts(build=build)
        else:
            ensembl_transcripts = get_file_handle(transcripts37_reduced_path)
        # Load the transcripts for a certain build
        load_transcripts(adapter, ensembl_transcripts, build, ensembl_genes)

    # Outside demo mode the handles stay None and are passed to load_hpo as such
    hpo_terms_handle = None
    hpo_to_genes_handle = None
    hpo_disease_handle = None
    if demo:
        hpo_terms_handle = get_file_handle(hpoterms_reduced_path)
        hpo_to_genes_handle = get_file_handle(hpo_to_genes_reduced_path)
        hpo_disease_handle = get_file_handle(
            hpo_phenotype_to_terms_reduced_path)

    load_hpo(adapter=adapter,
             hpo_lines=hpo_terms_handle,
             hpo_gene_lines=hpo_to_genes_handle,
             disease_lines=genemap_lines,
             hpo_disease_lines=hpo_disease_handle)

    # If demo we load a gene panel and some case information
    if demo:
        parsed_panel = parse_gene_panel(path=panel_path,
                                        institute='cust000',
                                        panel_id='panel1',
                                        version=1.0,
                                        display_name='Test panel')
        adapter.load_panel(parsed_panel)

        case_handle = get_file_handle(load_path)
        # An explicit Loader is mandatory in PyYAML >= 6 and avoids the
        # deprecation warning of PyYAML 5.x
        case_data = yaml.load(case_handle, Loader=yaml.FullLoader)

        adapter.load_case(case_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
Exemple #5
0
def load_hgnc_genes(adapter, genes = None, ensembl_lines=None, hgnc_lines=None, exac_lines=None, mim2gene_lines=None,
                    genemap_lines=None, hpo_lines=None, build='37', omim_api_key=''):
    """Build gene objects and load them into the database.

    If no parsed genes are given, every resource that was not provided is
    fetched and link_genes merges them into one dictionary of genes. Genes
    without a chromosome are skipped, all others are built and loaded in bulk.

    Args:
        adapter(scout.adapter.MongoAdapter)
        genes(dict): If genes are already parsed
        ensembl_lines(iterable(str)): Lines formated with ensembl gene information
        hgnc_lines(iterable(str)): Lines with gene information from genenames.org
        exac_lines(iterable(str)): Lines with information pLi-scores from ExAC
        mim2gene_lines(iterable(str)): Lines with map from omim id to gene symbol
        genemap_lines(iterable(str)): Lines with information of omim entries
        hpo_lines(iterable(str)): Lines information about map from hpo terms to genes
        build(str): What build to use. Defaults to '37'
        omim_api_key(str): Only needed when the omim lines have to be fetched

    Returns:
        gene_objects(list): A list with all gene_objects that was loaded into database

    Raises:
        SyntaxError: If the omim lines are missing and no omim api key is given
    """
    if not genes:
        # Fetch the resources if not provided
        if ensembl_lines is None:
            ensembl_lines = fetch_ensembl_genes(build=build)
        if not hgnc_lines:
            hgnc_lines = fetch_hgnc()
        if not exac_lines:
            exac_lines = fetch_exac_constraint()
        # Both omim files are fetched again as soon as one of them is missing
        if not mim2gene_lines or not genemap_lines:
            if not omim_api_key:
                raise SyntaxError("Need to provide omim api key")
            mim_files = fetch_mim_files(omim_api_key, mim2genes=True, genemap2=True)
            mim2gene_lines = mim_files['mim2genes']
            genemap_lines = mim_files['genemap2']
        if not hpo_lines:
            hpo_lines = fetch_hpo_files(hpogenes=True)['hpogenes']

        # Link the resources
        genes = link_genes(
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_lines
        )

    gene_objects = []
    missing_coordinates = 0

    with progressbar(genes.values(), label="Building genes", length=len(genes)) as bar:
        for gene_data in bar:
            if gene_data.get('chromosome'):
                gene_objects.append(build_hgnc_gene(gene_data, build=build))
                continue

            # Without a chromosome the gene can not be placed in this build
            LOG.debug("skipping gene: %s. No coordinates found", gene_data.get('hgnc_symbol', '?'))
            missing_coordinates += 1

    LOG.info("Loading genes build %s", build)
    adapter.load_hgnc_bulk(gene_objects)

    LOG.info("Loading done. %s genes loaded", len(gene_objects))
    LOG.info("Nr of genes without coordinates in build %s: %s", build, missing_coordinates)

    return gene_objects
Exemple #6
0
def setup_scout(adapter, institute_id='cust000', user_name='Clark Kent',
                user_mail='*****@*****.**', api_key=None, demo=False):
    """Reset the database and set up a new scout instance.

    Every collection whose name does not start with 'system' is dropped.
    An institute and an admin user are added, then genes and transcripts for
    builds 37 and 38 and the hpo information are loaded. With ``demo`` the
    reduced files are read instead of fetching the resources, and a gene
    panel and a case are loaded too.

    Args:
        adapter(scout.adapter.MongoAdapter)
        institute_id(str): Internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Id and email of the user, also sanger recipient of
                        the institute
        api_key(str): OMIM api key, only used when demo is False
        demo(bool): If True, set up a demo instance

    Raises:
        Exception: The error from fetch_mim_files, re-raised after logging
    """
    # --- Delete previous information ---
    LOG.info("Deleting previous database")
    for name in adapter.db.collection_names():
        if name.startswith('system'):
            continue
        LOG.info("Deleting collection %s", name)
        adapter.db.drop_collection(name)
    LOG.info("Database deleted")

    # --- Add an institute ---
    institute = build_institute(
        internal_id=institute_id,
        display_name=institute_id,
        sanger_recipients=[user_mail],
    )
    adapter.add_institute(institute)

    # --- Add a user ---
    user = {
        '_id': user_mail,
        'email': user_mail,
        'name': user_name,
        'roles': ['admin'],
        'institutes': [institute_id],
    }
    adapter.add_user(user)

    # --- Collect the gene resources ---
    if demo:
        # Read the reduced files into lists, they are used once per build
        mim2gene_lines = list(get_file_handle(mim2gene_reduced_path))
        genemap_lines = list(get_file_handle(genemap2_reduced_path))
        hpo_gene_lines = list(get_file_handle(hpogenes_reduced_path))
        hgnc_lines = list(get_file_handle(hgnc_reduced_path))
        exac_lines = list(get_file_handle(exac_reduced_path))
    else:
        # Fetch the mim files
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise err
        mim2gene_lines = mim_files['mim2genes']
        genemap_lines = mim_files['genemap2']

        # Genes to hpo information, hgnc information and exac pli scores
        hpo_gene_lines = fetch_hpo_genes()
        hgnc_lines = fetch_hgnc()
        exac_lines = fetch_exac_constraint()

    # --- Load genes and transcripts ---
    for build in ('37', '38'):
        # In demo mode the same reduced file is read for every build
        ensembl_gene_lines = (
            get_file_handle(genes37_reduced_path) if demo
            else fetch_ensembl_genes(build=build)
        )
        loaded_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Map from ensembl ids to gene objects
        genes_by_ensembl_id = {gene['ensembl_id']: gene for gene in loaded_genes}

        transcript_lines = (
            get_file_handle(transcripts37_reduced_path) if demo
            else fetch_ensembl_transcripts(build=build)
        )
        # Load the transcripts for a certain build
        load_transcripts(adapter, transcript_lines, build, genes_by_ensembl_id)

    # --- Load the hpo information ---
    # Outside demo mode load_hpo receives None for all three handles
    hpo_terms_handle = hpo_to_genes_handle = hpo_disease_handle = None
    if demo:
        hpo_terms_handle = get_file_handle(hpoterms_reduced_path)
        hpo_to_genes_handle = get_file_handle(hpo_to_genes_reduced_path)
        hpo_disease_handle = get_file_handle(hpo_phenotype_to_terms_reduced_path)

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
        disease_lines=genemap_lines,
        hpo_disease_lines=hpo_disease_handle,
    )

    # --- Demo only: a gene panel and a case ---
    if demo:
        panel = parse_gene_panel(
            path=panel_path,
            institute='cust000',
            panel_id='panel1',
            version=1.0,
            display_name='Test panel',
        )
        adapter.load_panel(panel)

        case_data = yaml.load(get_file_handle(load_path), Loader=yaml.FullLoader)
        adapter.load_case(case_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
Exemple #7
0
def genes(context, build, api_key):
    """Drop and reload gene and transcript information in the database.

    The omim files are fetched before anything is dropped. After that the
    stored genes and transcripts are dropped and loaded again for each
    requested build.

    Args:
        context: Object with the adapter under obj['adapter'], optionally an
                 api key under obj['omim_api_key'], and an abort() method
                 (presumably a click context)
        build(str): Genome build to update. If empty, both '37' and '38'
                    are updated
        api_key(str): OMIM api key, takes precedence over the one in context
    """
    LOG.info("Running scout update genes")
    adapter = context.obj['adapter']

    # Fetch the omim information
    api_key = api_key or context.obj.get('omim_api_key')
    if not api_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    try:
        mim_files = fetch_mim_files(
            api_key,
            mim2genes=True,
            morbidmap=True,
            genemap2=True,
        )
    except Exception as err:
        LOG.warning(err)
        context.abort()

    # Clear out what is stored before loading the new information
    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")
    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    builds = [build] if build else ['37', '38']

    # Resources that are the same for every build
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for current_build in builds:
        ensembl_gene_lines = fetch_ensembl_genes(build=current_build)

        # load the genes
        loaded_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
            build=current_build,
        )

        # Map from ensembl ids to the gene objects that were loaded
        genes_by_ensembl_id = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in loaded_genes
        }

        # Fetch the transcripts from ensembl
        transcript_lines = fetch_ensembl_transcripts(build=current_build)

        load_transcripts(adapter, transcript_lines, current_build, genes_by_ensembl_id)

    adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")