def link_genes(
    ensembl_lines,
    hgnc_lines,
    exac_lines,
    hpo_lines,
    mim2gene_lines=None,
    genemap_lines=None,
):
    """Combine gene information from several sources into one gene dict.

    HGNC (hgnc_complete_set.txt) is the primary source: every gene parsed
    from ``hgnc_lines`` becomes an entry, keyed by its ``hgnc_id``, and as
    much information as possible is taken from there.

    Coordinates are gathered from ensembl; entries are linked from hgnc to
    ensembl via ENSGID. Gene intolerance scores come from exac and are
    linked via hgnc symbol, which is an unstable key since symbols often
    change.

    Args:
        ensembl_lines(iterable(str)): Strings with ensembl gene information
        hgnc_lines(iterable(str)): Strings with hgnc gene information
        exac_lines(iterable(str)): Strings with exac PLi score info
        hpo_lines(iterable(str)): Strings with hpo gene information
        mim2gene_lines(iterable(str)): Forwarded to ``add_omim_info``
        genemap_lines(iterable(str)): Forwarded to ``add_omim_info``

    OMIM information is only added when both ``mim2gene_lines`` and
    ``genemap_lines`` are truthy.

    Returns:
        genes(dict): Gene information dictionaries with hgnc_id as keys
    """
    LOG.info("Linking genes")

    # HGNC defines which genes exist in the dataset; start from those
    genes = {
        parsed_gene["hgnc_id"]: parsed_gene
        for parsed_gene in parse_hgnc_genes(hgnc_lines)
    }

    add_ensembl_info(genes, ensembl_lines)

    # The remaining sources are linked through symbols/aliases
    alias_to_id = genes_by_alias(genes)

    add_exac_info(genes, alias_to_id, exac_lines)
    add_incomplete_penetrance(genes, alias_to_id, hpo_lines)

    # Without both OMIM inputs there is nothing more to add
    if not mim2gene_lines or not genemap_lines:
        return genes

    add_omim_info(genes, alias_to_id, genemap_lines, mim2gene_lines)
    return genes
def link_genes(ensembl_lines, hgnc_lines, exac_lines, mim2gene_lines, genemap_lines, hpo_lines):
    """Combine gene information from several sources into one gene dict.

    HGNC (hgnc_complete_set.txt) is the primary source: every gene parsed
    from ``hgnc_lines`` becomes an entry, keyed by its ``hgnc_id``, and as
    much information as possible is taken from there.

    Coordinates are gathered from ensembl; entries are linked from hgnc to
    ensembl via ENSGID. Gene intolerance scores come from exac and are
    linked via hgnc symbol, which is an unstable key since symbols often
    change.

    Args:
        ensembl_lines(iterable(str)): Strings with ensembl gene information
        hgnc_lines(iterable(str)): Strings with hgnc gene information
        exac_lines(iterable(str)): Strings with exac PLi score info
        mim2gene_lines(iterable(str)): Forwarded to ``add_omim_info``
        genemap_lines(iterable(str)): Forwarded to ``add_omim_info``
        hpo_lines(iterable(str)): Strings with hpo gene information

    Returns:
        genes(dict): Gene information dictionaries with hgnc_id as keys
    """
    LOG.info("Linking genes")

    # HGNC defines which genes exist in the dataset; start from those
    genes = {
        parsed_gene['hgnc_id']: parsed_gene
        for parsed_gene in parse_hgnc_genes(hgnc_lines)
    }

    add_ensembl_info(genes, ensembl_lines)

    # The remaining sources are linked through symbols/aliases
    alias_to_id = genes_by_alias(genes)

    add_exac_info(genes, alias_to_id, exac_lines)
    add_omim_info(genes, alias_to_id, genemap_lines, mim2gene_lines)
    add_incomplete_penetrance(genes, alias_to_id, hpo_lines)

    return genes
def _genes_for_symbol(genes, symbol_to_id, hgnc_symbol):
    """Yield ``(gene_info, is_true_id)`` for each gene a symbol resolves to.

    If the symbol is unknown nothing is yielded. If the symbol has a
    'true_id' only that gene is yielded, flagged ``True``. Otherwise every
    gene listed under 'ids' is yielded, flagged ``False``.
    """
    if hgnc_symbol not in symbol_to_id:
        return

    hgnc_id_info = symbol_to_id[hgnc_symbol]
    true_id = hgnc_id_info['true_id']
    if true_id:
        yield genes[true_id], True
        return

    for hgnc_id in hgnc_id_info['ids']:
        yield genes[hgnc_id], False


def link_genes(ensembl_lines, hgnc_lines, exac_lines, mim2gene_lines, genemap_lines, hpo_lines):
    """Gather information from different sources and return a gene dict

    Extract information collected from a number of sources and combine them
    into a gene dict with hgnc_id as keys.

    hgnc_id works as the primary identifier and it is from this source we
    gather as much information as possible (hgnc_complete_set.txt)

    Coordinates and transcripts are gathered from ensembl. Entries are
    linked from hgnc to ensembl via ENSGID first, falling back to the hgnc
    symbol.

    From exac the gene intolerance scores are collected, genes are linked
    to hgnc via hgnc symbol. This is an unstable symbol since they often
    change. OMIM and incomplete penetrance info are linked via symbol too.

    When a symbol maps to a true id the information is written to that gene
    unconditionally. Otherwise it is written to every candidate gene that
    does not already hold a value.

    Args:
        ensembl_lines(iterable(str)): Input for parse_ensembl_transcripts
        hgnc_lines(iterable(str)): Input for parse_hgnc_genes
        exac_lines(iterable(str)): Input for parse_exac_genes
        mim2gene_lines(iterable(str)): Input for get_mim_genes
        genemap_lines(iterable(str)): Input for get_mim_genes
        hpo_lines(iterable(str)): Input for get_incomplete_penetrance_genes

    Returns:
        genes(dict): Gene information dictionaries with hgnc_id as keys
    """
    genes = {}
    log.info("Linking genes and transcripts")
    # HGNC genes are the main source, these define the gene dataset to use
    # Try to use as much information as possible from hgnc
    for hgnc_gene in parse_hgnc_genes(hgnc_lines):
        hgnc_id = hgnc_gene['hgnc_id']
        hgnc_gene['transcripts'] = []
        genes[hgnc_id] = hgnc_gene

    symbol_to_id = genes_by_alias(genes)

    # Group the ensembl transcripts both by symbol and by ensembl gene id
    all_genes = {'ensembl': {}, 'symbol': {}}
    for transcript in parse_ensembl_transcripts(ensembl_lines):
        ensg_symbol = transcript['hgnc_symbol']
        ensgid = transcript['ensembl_gene_id']
        all_genes['symbol'].setdefault(ensg_symbol, []).append(transcript)
        all_genes['ensembl'].setdefault(ensgid, []).append(transcript)

    log.info("Add ensembl info")
    # Add gene coordinates and transcript info for hgnc genes:
    for gene_info in genes.values():
        ensgid = gene_info['ensembl_gene_id']
        ensg_symbol = gene_info['hgnc_symbol']
        # Prefer the ensembl gene id, only fall back to the symbol if the
        # id gave no hit
        for id_type, gene_id in [('ensembl', ensgid), ('symbol', ensg_symbol)]:
            if gene_id and gene_id in all_genes[id_type]:
                add_ensembl_info(gene_info, all_genes[id_type][gene_id])
                break

    log.info("Add exac pli scores")
    for exac_gene in parse_exac_genes(exac_lines):
        hgnc_symbol = exac_gene['hgnc_symbol'].upper()
        pli_score = exac_gene['pli_score']
        for gene_info, is_true_id in _genes_for_symbol(genes, symbol_to_id, hgnc_symbol):
            # A true id is trusted; other candidates only get the score if
            # they do not have one already
            if is_true_id or not gene_info.get('pli_score'):
                gene_info['pli_score'] = pli_score

    log.info("Add omim info")
    omim_genes = get_mim_genes(genemap_lines, mim2gene_lines)
    for hgnc_symbol in omim_genes:
        omim_info = omim_genes[hgnc_symbol]
        inheritance = omim_info.get('inheritance', set())
        for gene_info, is_true_id in _genes_for_symbol(genes, symbol_to_id, hgnc_symbol):
            # For a true id the omim values replace whatever was there,
            # otherwise only missing/empty fields are filled in
            if is_true_id or not gene_info.get('omim_id'):
                gene_info['omim_id'] = omim_info['mim_number']
            if is_true_id or not gene_info.get('inheritance_models'):
                gene_info['inheritance_models'] = list(inheritance)
            if is_true_id or not gene_info.get('phenotypes'):
                gene_info['phenotypes'] = omim_info.get('phenotypes', [])

    log.info("Add incomplete penetrance info")
    for hgnc_symbol in get_incomplete_penetrance_genes(hpo_lines):
        for gene_info, is_true_id in _genes_for_symbol(genes, symbol_to_id, hgnc_symbol):
            # Never touch an existing value unless the id is the true one
            if is_true_id or 'incomplete_penetrance' not in gene_info:
                gene_info['incomplete_penetrance'] = True

    return genes
def test_parse_hgnc_genes(hgnc_handle):
    """Check that every non-empty parsed hgnc gene has a truthy hgnc_id"""
    parsed_genes = parse_hgnc_genes(lines=hgnc_handle)

    for parsed_gene in parsed_genes:
        # Falsy entries are skipped rather than treated as failures
        if not parsed_gene:
            continue
        assert parsed_gene['hgnc_id']
def hgnc_genes(request, hgnc_handle):
    """Parse the hgnc genes found in hgnc_handle

    Args:
        request: Not used in the body (presumably the pytest fixture
            request object -- confirm against the decorator)
        hgnc_handle: Passed on to parse_hgnc_genes

    Returns:
        The result of parse_hgnc_genes(hgnc_handle)
    """
    # Emit an empty line before parsing
    print('')
    parsed_genes = parse_hgnc_genes(hgnc_handle)
    return parsed_genes