def generate_ensembl_transcripts(ensembl_genes, build):
    """Yield tab separated lines with reduced ensembl transcript information

    The first line yielded is a header. After that one line is yielded for
    every parsed transcript whose ensembl gene id is found in
    ``ensembl_genes``; all other transcripts are skipped.

    Args:
        ensembl_genes(container): Ensembl gene ids to keep. Only membership
            (``in``) is tested, so a dict keyed on ensembl id works
        build(str): What build to use. Falls back to '37' when falsy

    Yields:
        print_line(str): The header line followed by one line per transcript

    """
    build = build or '37'

    transcript_lines = fetch_ensembl_transcripts(build=build)

    column_names = (
        'Chromosome/scaffold name',
        'Gene stable ID',
        'Transcript stable ID',
        'Transcript start (bp)',
        'Transcript end (bp)',
        'RefSeq mRNA ID',
        'RefSeq mRNA predicted ID',
        'RefSeq ncRNA ID',
    )
    yield '\t'.join(column_names)

    for transcript in parse_ensembl_transcript_request(transcript_lines):
        gene_id = transcript['ensembl_gene_id']
        # Only transcripts that belong to a requested gene are written
        if gene_id not in ensembl_genes:
            continue

        fields = (
            transcript['chrom'],
            gene_id,
            transcript['ensembl_transcript_id'],
            str(transcript['transcript_start']),
            str(transcript['transcript_end']),
            # Missing refseq identifiers are written as empty columns
            transcript['refseq_mrna'] or '',
            transcript['refseq_mrna_predicted'] or '',
            transcript['refseq_ncrna'] or '',
        )
        yield '\t'.join(fields)
def generate_ensembl_transcripts(ensembl_genes, build):
    """Generate the lines of a reduced ensembl transcript file

    A header line is produced first, followed by one tab separated line for
    each transcript that belongs to a gene in ``ensembl_genes``.

    Args:
        ensembl_genes(container): The ensembl gene ids to include; only
            ``in`` lookups are done on it
        build(str): What build to use. '37' is used if the value is falsy

    Yields:
        print_line(str):  Lines from the reduced file

    """
    build = build or '37'
    fetched_lines = fetch_ensembl_transcripts(build=build)

    header = '\t'.join([
        'Chromosome/scaffold name', 'Gene stable ID', 'Transcript stable ID',
        'Transcript start (bp)', 'Transcript end (bp)', 'RefSeq mRNA ID',
        'RefSeq mRNA predicted ID', 'RefSeq ncRNA ID',
    ])
    yield header

    coordinate_keys = ('transcript_start', 'transcript_end')
    refseq_keys = ('refseq_mrna', 'refseq_mrna_predicted', 'refseq_ncrna')

    for tx in parse_ensembl_transcript_request(fetched_lines):
        if tx['ensembl_gene_id'] in ensembl_genes:
            # Identity columns first, then coordinates, then refseq ids
            columns = [
                tx['chrom'],
                tx['ensembl_gene_id'],
                tx['ensembl_transcript_id'],
            ]
            columns.extend(str(tx[key]) for key in coordinate_keys)
            # Refseq values that are missing become empty strings
            columns.extend(tx[key] or '' for key in refseq_keys)
            yield '\t'.join(columns)
Exemple #3
0
def genes(build, api_key):
    """
    Drop and reload gene and transcript information for one or both builds.

    The OMIM files are fetched first; without an api key (argument or the
    'OMIM_API_KEY' app config entry), or if the fetch fails, the command
    aborts before anything is dropped. Genes and transcripts are then dropped
    and reloaded for ``build``, or for '37' and '38' when ``build`` is falsy.
    """
    LOG.info("Running scout update genes")
    adapter = store

    # The OMIM key can come from the command line or the app configuration
    api_key = api_key or current_app.config.get('OMIM_API_KEY')
    if not api_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        raise click.Abort()

    try:
        mim_files = fetch_mim_files(
            api_key, mim2genes=True, morbidmap=True, genemap2=True)
    except Exception as err:
        LOG.warning(err)
        raise click.Abort()

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")
    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    builds = [build] if build else ['37', '38']

    # NOTE(review): these line collections are handed to load_hgnc_genes once
    # per build; assumes they can be iterated more than once - confirm
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        ensembl_gene_lines = fetch_ensembl_genes(build=genome_build)

        # Load the genes for this build
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map from ensembl id to the gene object that was just loaded
        genes_by_ensembl_id = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes
        }

        # Fetch the transcripts from ensembl and load them
        transcript_lines = fetch_ensembl_transcripts(build=genome_build)
        load_transcripts(adapter, transcript_lines, genome_build,
                         genes_by_ensembl_id)

    adapter.update_indexes()

    # NOTE(review): the message mentions exons but no exon loading is done in
    # this function
    LOG.info("Genes, transcripts and Exons loaded")
Exemple #4
0
def load_transcripts(adapter,
                     transcripts_lines=None,
                     build='37',
                     ensembl_genes=None):
    """Parse, annotate, build and store all transcripts

    Transcripts are parsed from ensembl lines. A transcript whose ensembl
    gene id has no entry in ``ensembl_genes`` is discarded. For every
    remaining transcript at most one refseq identifier is selected before
    the transcript object is built.

    Args:
        adapter(MongoAdapter): Used to look up the genes when
            ``ensembl_genes`` is empty and to bulk load the transcripts
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            Fetched with ``fetch_ensembl_transcripts`` when None
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Use the genes from the database when no (or an empty) map was given
    if not ensembl_genes:
        ensembl_genes = adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # Map with all transcripts enstid -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)

    # Iterate over a copy of the keys since entries are removed on the way
    for transcript_id in list(transcripts_dict):
        transcript = transcripts_dict[transcript_id]
        gene_id = transcript['ensembl_gene_id']
        gene = ensembl_genes.get(gene_id)

        if gene:
            # The gene object holds the hgnc id and the primary transcripts
            transcript['hgnc_id'] = gene['hgnc_id']
            transcript['primary_transcripts'] = set(
                gene.get('primary_transcripts', []))
            continue

        # Transcripts of genes unknown to scout are dropped
        transcripts_dict.pop(transcript_id)
        LOG.debug("Gene %s does not exist in build %s", gene_id, build)

    ref_seq_transcripts = 0
    nr_primary_transcripts = 0
    nr_transcripts = len(transcripts_dict)

    transcript_objs = []

    with progressbar(transcripts_dict.values(),
                     label="Building transcripts",
                     length=nr_transcripts) as bar:
        for tx_data in bar:
            # Pick ONE refseq identifier per transcript, if any exist.
            # The categories are tried in the order of TRANSCRIPT_CATEGORIES
            # and only the first category that has identifiers is used.
            tx_data['is_primary'] = False
            primary_ids = tx_data['primary_transcripts']

            candidates = None
            for category in TRANSCRIPT_CATEGORIES:
                if tx_data[category]:
                    candidates = tx_data[category]
                    break

            refseq_identifier = None
            if candidates:
                primary_hits = candidates.intersection(primary_ids)
                ref_seq_transcripts += 1
                if primary_hits:
                    # An identifier that is a primary transcript wins
                    refseq_identifier = primary_hits.pop()
                    tx_data['is_primary'] = True
                    nr_primary_transcripts += 1
                else:
                    # Otherwise an arbitrary identifier is taken. Note that
                    # pop() also removes it from the collection in tx_data
                    refseq_identifier = candidates.pop()

            if refseq_identifier:
                tx_data['refseq_id'] = refseq_identifier

            # Build the transcript object
            transcript_objs.append(build_transcript(tx_data, build))

    # Load all transcripts
    LOG.info("Loading transcripts...")
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info('Number of transcripts in build %s: %s', build, nr_transcripts)
    LOG.info('Number of transcripts with refseq identifier: %s',
             ref_seq_transcripts)
    LOG.info('Number of primary transcripts: %s', nr_primary_transcripts)

    return transcript_objs
def load_transcripts(adapter,
                     transcripts_lines=None,
                     build='37',
                     ensembl_genes=None):
    """Parse, annotate, build and store all transcripts

    Transcripts are parsed from ensembl lines. A transcript whose ensembl
    gene id has no entry in ``ensembl_genes`` is discarded. For the others
    every refseq identifier is collected in 'refseq_identifiers' and one of
    them is stored as 'refseq_id' before the transcript object is built.

    Args:
        adapter(MongoAdapter): Used to look up the genes when
            ``ensembl_genes`` is empty and to bulk load the transcripts
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            Fetched with ``fetch_ensembl_transcripts`` when None
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Use the genes from the database when no (or an empty) map was given
    if not ensembl_genes:
        ensembl_genes = adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # Map with all transcripts enstid -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)

    # Iterate over a copy of the keys since entries are removed on the way
    for transcript_id in list(transcripts_dict):
        transcript = transcripts_dict[transcript_id]
        gene_id = transcript['ensembl_gene_id']
        gene = ensembl_genes.get(gene_id)

        if gene:
            # The gene object holds the hgnc id and the primary transcripts
            transcript['hgnc_id'] = gene['hgnc_id']
            transcript['primary_transcripts'] = set(
                gene.get('primary_transcripts', []))
            continue

        # Transcripts of genes unknown to scout are dropped
        transcripts_dict.pop(transcript_id)
        LOG.debug("Gene %s does not exist in build %s", gene_id, build)

    ref_seq_transcripts = 0
    nr_primary_transcripts = 0
    nr_transcripts = len(transcripts_dict)

    transcript_objs = []

    with progressbar(transcripts_dict.values(),
                     label="Building transcripts",
                     length=nr_transcripts) as bar:
        for tx_data in bar:
            # Walk through every refseq identifier of every category, in the
            # order of TRANSCRIPT_CATEGORIES. All of them are collected. The
            # one stored as 'refseq_id' is the first identifier seen, unless
            # an identifier is among the primary transcripts; the last
            # primary identifier seen takes over in that case.
            tx_data['is_primary'] = False
            primary_ids = tx_data['primary_transcripts']
            chosen_refseq = None
            all_refseq_ids = []

            for category in TRANSCRIPT_CATEGORIES:
                # Categories without identifiers contribute nothing
                for refseq_id in tx_data[category] or ():
                    all_refseq_ids.append(refseq_id)
                    # NOTE(review): both counters are increased once per
                    # identifier, not once per transcript, although the log
                    # messages below speak about transcripts - confirm
                    ref_seq_transcripts += 1

                    if refseq_id in primary_ids:
                        chosen_refseq = refseq_id
                        tx_data['is_primary'] = True
                        nr_primary_transcripts += 1
                    elif not chosen_refseq:
                        chosen_refseq = refseq_id

            if chosen_refseq:
                tx_data['refseq_id'] = chosen_refseq
            if all_refseq_ids:
                tx_data['refseq_identifiers'] = all_refseq_ids

            # Build the transcript object
            transcript_objs.append(build_transcript(tx_data, build))

    # Load all transcripts
    LOG.info("Loading transcripts...")
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info('Number of transcripts in build %s: %s', build, nr_transcripts)
    LOG.info('Number of transcripts with refseq identifier: %s',
             ref_seq_transcripts)
    LOG.info('Number of primary transcripts: %s', nr_primary_transcripts)

    return transcript_objs
Exemple #6
0
def setup_scout(adapter,
                institute_id='cust000',
                user_name='Clark Kent',
                user_mail='*****@*****.**',
                api_key=None,
                demo=False):
    """Wipe the database and set up a fresh scout instance

    Every collection whose name does not start with 'system' is dropped.
    After that an institute and an admin user are added, genes and
    transcripts are loaded for builds '37' and '38', the hpo information is
    loaded and the indexes are created. With ``demo`` set, all resources are
    read from the bundled reduced files instead of being fetched, and a gene
    panel and a case are loaded as well.

    Args:
        adapter(MongoAdapter): Adapter to the database that is set up
        institute_id(str): Used as internal id and display name of the
            institute
        user_name(str): Name of the admin user
        user_mail(str): Mail of the admin user, also used as the user id and
            as sanger recipient of the institute
        api_key(str): Passed to ``fetch_mim_files``; only used when ``demo``
            is False
        demo(bool): If the reduced demo files should be used

    Raises:
        Exception: Whatever ``fetch_mim_files`` raises is logged and
            re-raised
    """
    ########################## Delete previous information ##########################
    LOG.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        if not collection_name.startswith('system'):
            LOG.info("Deleting collection %s", collection_name)
            adapter.db.drop_collection(collection_name)
    LOG.info("Database deleted")

    ########################## Add a institute ##########################
    #####################################################################
    # Build a institute with id institute_name
    institute_obj = build_institute(internal_id=institute_id,
                                    display_name=institute_id,
                                    sanger_recipients=[user_mail])

    # Add the institute to database
    adapter.add_institute(institute_obj)

    ########################## Add a User ###############################
    #####################################################################
    # Build a user obj
    user_obj = dict(_id=user_mail,
                    email=user_mail,
                    name=user_name,
                    roles=['admin'],
                    institutes=[institute_id])

    adapter.add_user(user_obj)

    ### Get the mim information ###

    if not demo:
        # Fetch the mim files
        try:
            mim_files = fetch_mim_files(api_key,
                                        mim2genes=True,
                                        morbidmap=True,
                                        genemap2=True)
        except Exception as err:
            LOG.warning(err)
            # Re-raise so the caller sees the real failure. There is no click
            # context in this function that could be used to abort.
            raise
        mim2gene_lines = mim_files['mim2genes']
        genemap_lines = mim_files['genemap2']

        # Fetch the genes to hpo information
        hpo_gene_lines = fetch_hpo_genes()
        # Fetch the latest version of the hgnc information
        hgnc_lines = fetch_hgnc()
        # Fetch the latest exac pli score information
        exac_lines = fetch_exac_constraint()

    else:
        # The reduced files are read into lists since they are used once for
        # every build below
        mim2gene_lines = list(get_file_handle(mim2gene_reduced_path))
        genemap_lines = list(get_file_handle(genemap2_reduced_path))

        # Fetch the genes to hpo information
        hpo_gene_lines = list(get_file_handle(hpogenes_reduced_path))
        # Fetch the reduced hgnc information
        hgnc_lines = list(get_file_handle(hgnc_reduced_path))
        # Fetch the latest exac pli score information
        exac_lines = list(get_file_handle(exac_reduced_path))

    builds = ['37', '38']
    ################## Load Genes and transcripts #######################
    #####################################################################
    for build in builds:
        # Fetch the ensembl information
        if not demo:
            ensembl_genes = fetch_ensembl_genes(build=build)
        else:
            # NOTE(review): the same reduced build 37 file is used for both
            # builds in demo mode - confirm that this is intended
            ensembl_genes = get_file_handle(genes37_reduced_path)
        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_genes,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Create a map from ensembl ids to gene objects
        ensembl_genes = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes
        }

        # Fetch the transcripts from ensembl
        if not demo:
            ensembl_transcripts = fetch_ensembl_transcripts(build=build)
        else:
            ensembl_transcripts = get_file_handle(transcripts37_reduced_path)
        # Load the transcripts for a certain build
        load_transcripts(adapter, ensembl_transcripts, build, ensembl_genes)

    # Without demo the handles stay None and are passed on as such to load_hpo
    hpo_terms_handle = None
    hpo_to_genes_handle = None
    hpo_disease_handle = None
    if demo:
        hpo_terms_handle = get_file_handle(hpoterms_reduced_path)
        hpo_to_genes_handle = get_file_handle(hpo_to_genes_reduced_path)
        hpo_disease_handle = get_file_handle(
            hpo_phenotype_to_terms_reduced_path)

    load_hpo(adapter=adapter,
             hpo_lines=hpo_terms_handle,
             hpo_gene_lines=hpo_to_genes_handle,
             disease_lines=genemap_lines,
             hpo_disease_lines=hpo_disease_handle)

    # If demo we load a gene panel and some case information
    if demo:
        parsed_panel = parse_gene_panel(path=panel_path,
                                        institute='cust000',
                                        panel_id='panel1',
                                        version=1.0,
                                        display_name='Test panel')
        adapter.load_panel(parsed_panel)

        case_handle = get_file_handle(load_path)
        # safe_load only builds plain yaml types and, unlike a bare
        # yaml.load(), works without an explicit Loader on all PyYAML versions
        case_data = yaml.safe_load(case_handle)

        adapter.load_case(case_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
Exemple #7
0
def setup_scout(adapter,
                institute_id='cust000',
                user_name='Clark Kent',
                user_mail='*****@*****.**',
                api_key=None,
                demo=False):
    """Wipe the database and set up a fresh scout instance

    Every collection whose name does not start with 'system' is dropped.
    After that an institute and an admin user are added, genes and
    transcripts are loaded for builds '37' and '38', the hpo information is
    loaded and the indexes are created. With ``demo`` set, all resources are
    read from the bundled reduced files instead of being fetched, and a gene
    panel and a case are loaded as well.

    Args:
        adapter(MongoAdapter): Adapter to the database that is set up
        institute_id(str): Used as internal id and display name of the
            institute
        user_name(str): Name of the admin user
        user_mail(str): Mail of the admin user, also used as the user id and
            as sanger recipient of the institute
        api_key(str): Passed to ``fetch_mim_files``; only used when ``demo``
            is False
        demo(bool): If the reduced demo files should be used

    Raises:
        Exception: Whatever ``fetch_mim_files`` raises is logged and
            re-raised
    """
    # ---------------- Delete previous information ----------------
    LOG.info("Deleting previous database")
    for name in adapter.db.collection_names():
        # Collections with a 'system' prefix are left in place
        if name.startswith('system'):
            continue
        LOG.info("Deleting collection %s", name)
        adapter.db.drop_collection(name)
    LOG.info("Database deleted")

    # ---------------- Add an institute ----------------
    # The institute id doubles as display name
    adapter.add_institute(
        build_institute(
            internal_id=institute_id,
            display_name=institute_id,
            sanger_recipients=[user_mail],
        )
    )

    # ---------------- Add a user ----------------
    adapter.add_user({
        '_id': user_mail,
        'email': user_mail,
        'name': user_name,
        'roles': ['admin'],
        'institutes': [institute_id],
    })

    # ---------------- Collect the gene resources ----------------
    if demo:
        # Reduced files are read into lists since they are used for each build
        mim2gene_lines = list(get_file_handle(mim2gene_reduced_path))
        genemap_lines = list(get_file_handle(genemap2_reduced_path))
        hpo_gene_lines = list(get_file_handle(hpogenes_reduced_path))
        hgnc_lines = list(get_file_handle(hgnc_reduced_path))
        exac_lines = list(get_file_handle(exac_reduced_path))
    else:
        # Fetch the mim files
        try:
            mim_files = fetch_mim_files(
                api_key, mim2genes=True, morbidmap=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise err
        mim2gene_lines = mim_files['mim2genes']
        genemap_lines = mim_files['genemap2']

        # Genes to hpo, latest hgnc and latest exac pli score information
        hpo_gene_lines = fetch_hpo_genes()
        hgnc_lines = fetch_hgnc()
        exac_lines = fetch_exac_constraint()

    # ---------------- Load genes and transcripts ----------------
    for build in ('37', '38'):
        if demo:
            # NOTE(review): the same reduced build 37 file is used for both
            # builds in demo mode - confirm that this is intended
            gene_lines = get_file_handle(genes37_reduced_path)
        else:
            gene_lines = fetch_ensembl_genes(build=build)

        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Map from ensembl id to the gene object that was just loaded
        genes_by_ensembl_id = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes
        }

        if demo:
            transcript_lines = get_file_handle(transcripts37_reduced_path)
        else:
            transcript_lines = fetch_ensembl_transcripts(build=build)

        # Load the transcripts for this build
        load_transcripts(adapter, transcript_lines, build, genes_by_ensembl_id)

    # ---------------- Load the hpo information ----------------
    if demo:
        hpo_terms_handle = get_file_handle(hpoterms_reduced_path)
        hpo_to_genes_handle = get_file_handle(hpo_to_genes_reduced_path)
        hpo_disease_handle = get_file_handle(hpo_phenotype_to_terms_reduced_path)
    else:
        # Without demo no handles are given to load_hpo
        hpo_terms_handle = hpo_to_genes_handle = hpo_disease_handle = None

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
        disease_lines=genemap_lines,
        hpo_disease_lines=hpo_disease_handle,
    )

    # ---------------- Demo panel and case ----------------
    if demo:
        adapter.load_panel(
            parse_gene_panel(
                path=panel_path,
                institute='cust000',
                panel_id='panel1',
                version=1.0,
                display_name='Test panel',
            )
        )

        case_handle = get_file_handle(load_path)
        adapter.load_case(yaml.load(case_handle, Loader=yaml.FullLoader))

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
Exemple #8
0
def genes(context, build, api_key):
    """
    Drop and reload gene and transcript information for one or both builds.

    The adapter is taken from ``context.obj``. The OMIM files are fetched
    first; ``context.abort()`` is called when no api key is available
    (argument or the 'omim_api_key' entry of ``context.obj``) or when the
    fetch fails. Genes and transcripts are then dropped and reloaded for
    ``build``, or for '37' and '38' when ``build`` is falsy.
    """
    LOG.info("Running scout update genes")
    adapter = context.obj['adapter']

    # The OMIM key can come from the command line or the context object
    api_key = api_key or context.obj.get('omim_api_key')
    if not api_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    try:
        mim_files = fetch_mim_files(
            api_key,
            mim2genes=True,
            morbidmap=True,
            genemap2=True,
        )
    except Exception as err:
        LOG.warning(err)
        context.abort()

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")
    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    builds = [build] if build else ['37', '38']

    # NOTE(review): these line collections are handed to load_hgnc_genes once
    # per build; assumes they can be iterated more than once - confirm
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        ensembl_gene_lines = fetch_ensembl_genes(build=genome_build)

        # Load the genes for this build
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map from ensembl id to the gene object that was just loaded
        genes_by_ensembl_id = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes
        }

        # Fetch the transcripts from ensembl and load them
        transcript_lines = fetch_ensembl_transcripts(build=genome_build)
        load_transcripts(adapter, transcript_lines, genome_build,
                         genes_by_ensembl_id)

    adapter.update_indexes()

    # NOTE(review): the message mentions exons but no exon loading is done in
    # this function
    LOG.info("Genes, transcripts and Exons loaded")