def generate_ensembl_transcripts(ensembl_genes, build):
    """Yield the lines of a reduced, tab separated, ensembl transcript file.

    The header line is yielded first. After that one line is yielded for
    every fetched transcript whose ensembl gene id is found in
    ``ensembl_genes``; all other transcripts are skipped.

    Args:
        ensembl_genes(container): Ensembl gene ids to keep. Only membership
            (``in``) is tested here, so a dict keyed on ensembl id works.
        build(str): What build to use. Falls back to '37' when falsy

    Yields:
        print_line(str): Lines from the reduced file
    """
    build = build or '37'
    ensembl_transcripts = fetch_ensembl_transcripts(build=build)

    header_columns = (
        'Chromosome/scaffold name',
        'Gene stable ID',
        'Transcript stable ID',
        'Transcript start (bp)',
        'Transcript end (bp)',
        'RefSeq mRNA ID',
        'RefSeq mRNA predicted ID',
        'RefSeq ncRNA ID',
    )
    yield '\t'.join(header_columns)

    for tx_info in parse_ensembl_transcript_request(ensembl_transcripts):
        gene_id = tx_info['ensembl_gene_id']
        # Transcripts of genes we do not know about are left out of the file
        if gene_id not in ensembl_genes:
            continue

        columns = (
            tx_info['chrom'],
            gene_id,
            tx_info['ensembl_transcript_id'],
            str(tx_info['transcript_start']),
            str(tx_info['transcript_end']),
            # Missing refseq identifiers are written as empty columns
            tx_info['refseq_mrna'] or '',
            tx_info['refseq_mrna_predicted'] or '',
            tx_info['refseq_ncrna'] or '',
        )
        yield '\t'.join(columns)
def generate_ensembl_transcripts(ensembl_genes, build):
    """Generate the lines of a reduced ensembl transcript file.

    Transcripts are fetched for ``build`` and filtered down to those that
    belong to a gene present in ``ensembl_genes``. Every yielded line is tab
    separated and the very first line is the header.

    Args:
        ensembl_genes(container): Ensembl gene ids to keep. Only membership
            (``in``) is tested here, so a dict keyed on ensembl id works.
        build(str): What build to use. Falls back to '37' when falsy

    Yields:
        print_line(str): Lines from the reduced file
    """
    build = build or '37'
    ensembl_transcripts = fetch_ensembl_transcripts(build=build)

    yield '\t'.join([
        'Chromosome/scaffold name',
        'Gene stable ID',
        'Transcript stable ID',
        'Transcript start (bp)',
        'Transcript end (bp)',
        'RefSeq mRNA ID',
        'RefSeq mRNA predicted ID',
        'RefSeq ncRNA ID',
    ])

    # Keys of the optional refseq columns, in output order
    refseq_keys = ('refseq_mrna', 'refseq_mrna_predicted', 'refseq_ncrna')

    for tx_info in parse_ensembl_transcript_request(ensembl_transcripts):
        gene_id = tx_info['ensembl_gene_id']
        if gene_id in ensembl_genes:
            fields = [
                tx_info['chrom'],
                gene_id,
                tx_info['ensembl_transcript_id'],
                str(tx_info['transcript_start']),
                str(tx_info['transcript_end']),
            ]
            # A missing refseq identifier becomes an empty column
            fields.extend(tx_info[key] or '' for key in refseq_keys)
            yield '\t'.join(fields)
def genes(build, api_key):
    """Drop all gene and transcript information and load it again.

    Genes and transcripts are reloaded for the given build, or for builds
    37 and 38 when no build is given. An omim api key is required.
    """
    # NOTE(review): if this function is registered as a click command the
    # docstring above is shown as help text, so it is kept short on purpose.
    #
    # build(str):   build to update; falsy means both '37' and '38'
    # api_key(str): omim api key; falls back to OMIM_API_KEY in the app config
    LOG.info("Running scout update genes")
    adapter = store

    # Without an omim api key the mim files can not be fetched
    api_key = api_key or current_app.config.get('OMIM_API_KEY')
    if not api_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        raise click.Abort()

    # Fetch the omim files before anything is dropped from the database
    try:
        mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
    except Exception as err:
        LOG.warning(err)
        raise click.Abort()

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")

    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    builds = [build] if build else ['37', '38']

    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        ensembl_lines = fetch_ensembl_genes(build=genome_build)

        # Load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map from ensembl id to the gene object that was just loaded
        genes_by_ensembl_id = {gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes}

        # Fetch the transcripts from ensembl and load them
        ensembl_transcripts = fetch_ensembl_transcripts(build=genome_build)
        load_transcripts(adapter, ensembl_transcripts, genome_build, genes_by_ensembl_id)

    adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")
def load_transcripts(adapter, transcripts_lines=None, build='37', ensembl_genes=None):
    """Parse ensembl transcripts, link them to genes and bulk load them.

    Transcripts whose gene is not found in ``ensembl_genes`` are skipped.
    For every remaining transcript at most one refseq identifier is chosen
    and stored under 'refseq_id' before the transcript object is built.

    Args:
        adapter(MongoAdapter)
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            Fetched from ensembl when None
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene. Fetched from
            the adapter when falsy

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Genes with ensembl id as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # Map with all transcripts: ensembl transcript id -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)

    # Iterate over a copy since unknown transcripts are removed on the way
    for ens_tx_id, parsed_tx in list(transcripts_dict.items()):
        ens_gene_id = parsed_tx['ensembl_gene_id']

        # The internal gene object holds the correct hgnc id
        gene_obj = ensembl_genes.get(ens_gene_id)
        if gene_obj:
            parsed_tx['hgnc_id'] = gene_obj['hgnc_id']
            # Primary transcript information is collected from HGNC
            parsed_tx['primary_transcripts'] = set(gene_obj.get('primary_transcripts', []))
            continue

        # The gene does not exist in scout, so the transcript is skipped
        transcripts_dict.pop(ens_tx_id)
        LOG.debug("Gene %s does not exist in build %s", ens_gene_id, build)

    ref_seq_transcripts = 0
    nr_primary_transcripts = 0
    nr_transcripts = len(transcripts_dict)

    transcript_objs = []

    with progressbar(transcripts_dict.values(), label="Building transcripts",
                     length=nr_transcripts) as bar:
        for tx_data in bar:
            # Choose ONE refseq identifier for the transcript, if there is any.
            # Only the first category in TRANSCRIPT_CATEGORIES that has
            # identifiers is considered. Within that category an identifier
            # that is also a primary transcript is preferred, otherwise an
            # arbitrary one is popped from the category.
            tx_data['is_primary'] = False
            primary_transcripts = tx_data['primary_transcripts']
            chosen_id = None

            for category in TRANSCRIPT_CATEGORIES:
                candidates = tx_data[category]
                if candidates:
                    primary_hits = candidates.intersection(primary_transcripts)
                    ref_seq_transcripts += 1
                    if primary_hits:
                        chosen_id = primary_hits.pop()
                        tx_data['is_primary'] = True
                        nr_primary_transcripts += 1
                    else:
                        # Note that this removes the identifier from tx_data[category]
                        chosen_id = candidates.pop()
                    # Lower ranked categories are ignored once one had identifiers
                    break

            if chosen_id:
                tx_data['refseq_id'] = chosen_id

            # Build the transcript object
            transcript_objs.append(build_transcript(tx_data, build))

    # Load all transcripts
    LOG.info("Loading transcripts...")
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info('Number of transcripts in build %s: %s', build, nr_transcripts)
    LOG.info('Number of transcripts with refseq identifier: %s', ref_seq_transcripts)
    LOG.info('Number of primary transcripts: %s', nr_primary_transcripts)

    return transcript_objs
def load_transcripts(adapter, transcripts_lines=None, build='37', ensembl_genes=None):
    """Load all the transcripts

    Transcript information is from ensembl. Transcripts that belong to a gene
    that is not found in ``ensembl_genes`` are skipped. For the remaining
    transcripts one refseq identifier is stored under 'refseq_id' and all of
    them under 'refseq_identifiers' before the transcript object is built.

    Args:
        adapter(MongoAdapter)
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            Fetched from ensembl when None
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene. Fetched from
            the adapter when falsy

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # Map with all transcripts enstid -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)

    # Iterate over a copy of the keys since entries are removed in the loop
    for ens_tx_id in list(transcripts_dict):
        parsed_tx = transcripts_dict[ens_tx_id]
        # Get the ens gene id
        ens_gene_id = parsed_tx['ensembl_gene_id']

        # Fetch the internal gene object to find out the correct hgnc id
        gene_obj = ensembl_genes.get(ens_gene_id)
        # If the gene is non existing in scout we skip the transcript
        if not gene_obj:
            transcripts_dict.pop(ens_tx_id)
            LOG.debug("Gene %s does not exist in build %s", ens_gene_id, build)
            continue

        # Add the correct hgnc id
        parsed_tx['hgnc_id'] = gene_obj['hgnc_id']
        # Primary transcript information is collected from HGNC
        parsed_tx['primary_transcripts'] = set(gene_obj.get('primary_transcripts', []))

    ref_seq_transcripts = 0
    nr_primary_transcripts = 0
    nr_transcripts = len(transcripts_dict)

    transcript_objs = []

    with progressbar(transcripts_dict.values(), label="Building transcripts",
                     length=nr_transcripts) as bar:
        for tx_data in bar:
            # Decide the refseq identifier(s) of the transcript.
            # All identifiers, from every category in TRANSCRIPT_CATEGORIES,
            # are collected in "refseq_identifiers".
            # The single "refseq_id" is chosen as follows:
            #   - an identifier that is in 'primary_transcripts' is chosen
            #     (the last one visited if there are several)
            #   - else the first identifier that was visited is chosen
            tx_data['is_primary'] = False
            primary_transcripts = tx_data['primary_transcripts']
            refseq_identifier = None
            refseq_identifiers = []

            for category in TRANSCRIPT_CATEGORIES:
                identifiers = tx_data[category]
                if not identifiers:
                    continue

                for refseq_id in identifiers:
                    # Add all refseq identifiers to refseq_identifiers
                    refseq_identifiers.append(refseq_id)

                    if refseq_id in primary_transcripts:
                        refseq_identifier = refseq_id
                        tx_data['is_primary'] = True

                    if not refseq_identifier:
                        refseq_identifier = refseq_id

            if refseq_identifier:
                tx_data['refseq_id'] = refseq_identifier
            if refseq_identifiers:
                tx_data['refseq_identifiers'] = refseq_identifiers
                # Count the transcript once, no matter how many identifiers it
                # has, since the number is reported as a number of transcripts
                ref_seq_transcripts += 1
            if tx_data['is_primary']:
                # Same here, a transcript is primary at most once
                nr_primary_transcripts += 1

            # Build the transcript object
            tx_obj = build_transcript(tx_data, build)
            transcript_objs.append(tx_obj)

    # Load all transcripts
    LOG.info("Loading transcripts...")
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info('Number of transcripts in build %s: %s', build, nr_transcripts)
    LOG.info('Number of transcripts with refseq identifier: %s', ref_seq_transcripts)
    LOG.info('Number of primary transcripts: %s', nr_primary_transcripts)

    return transcript_objs
def setup_scout(adapter, institute_id='cust000', user_name='Clark Kent',
                user_mail='*****@*****.**', api_key=None, demo=False):
    """Wipe the database and populate it with the basic scout information.

    The steps are, in order:
        - drop every collection whose name does not start with 'system'
        - add one institute and one admin user
        - load genes and transcripts for builds 37 and 38
        - load the hpo information
        - if demo: load a gene panel and a case
        - load the indexes

    Args:
        adapter(MongoAdapter)
        institute_id(str): Used as internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Mail of the admin user, also used as user id and as
            sanger recipient of the institute
        api_key(str): Passed on when the mim files are fetched (not used if demo)
        demo(bool): If True the reduced demo files are used instead of
            fetching the resources

    Raises:
        Exception: Whatever fetching the mim files raised, after logging it
    """
    ########################## Delete previous information ##########################
    LOG.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        if not collection_name.startswith('system'):
            LOG.info("Deleting collection %s", collection_name)
            adapter.db.drop_collection(collection_name)
    LOG.info("Database deleted")

    ########################## Add a institute ##########################
    # Build a institute with id institute_name
    institute_obj = build_institute(
        internal_id=institute_id,
        display_name=institute_id,
        sanger_recipients=[user_mail],
    )

    # Add the institute to database
    adapter.add_institute(institute_obj)

    ########################## Add a User ###############################
    # Build a user obj
    user_obj = dict(
        _id=user_mail,
        email=user_mail,
        name=user_name,
        roles=['admin'],
        institutes=[institute_id],
    )

    adapter.add_user(user_obj)

    ### Get the mim information ###
    if not demo:
        # Fetch the mim files
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            # There is no click context in this function, so the error is
            # propagated to the caller instead of aborting a context
            raise
        mim2gene_lines = mim_files['mim2genes']
        genemap_lines = mim_files['genemap2']

        # Fetch the genes to hpo information
        hpo_gene_lines = fetch_hpo_genes()
        # Fetch the latest version of the hgnc information
        hgnc_lines = fetch_hgnc()
        # Fetch the latest exac pli score information
        exac_lines = fetch_exac_constraint()
    else:
        # The lines are stored in lists since they are used once per build
        mim2gene_lines = [line for line in get_file_handle(mim2gene_reduced_path)]
        genemap_lines = [line for line in get_file_handle(genemap2_reduced_path)]

        # Fetch the genes to hpo information
        hpo_gene_lines = [line for line in get_file_handle(hpogenes_reduced_path)]
        # Fetch the reduced hgnc information
        hgnc_lines = [line for line in get_file_handle(hgnc_reduced_path)]
        # Fetch the latest exac pli score information
        exac_lines = [line for line in get_file_handle(exac_reduced_path)]

    builds = ['37', '38']
    ################## Load Genes and transcripts #######################
    for build in builds:
        # Fetch the ensembl information
        if not demo:
            ensembl_genes = fetch_ensembl_genes(build=build)
        else:
            # The build 37 demo file is used for both builds
            ensembl_genes = get_file_handle(genes37_reduced_path)
        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_genes,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Create a map from ensembl ids to gene objects
        ensembl_genes = {}
        for gene_obj in hgnc_genes:
            ensembl_id = gene_obj['ensembl_id']
            ensembl_genes[ensembl_id] = gene_obj

        # Fetch the transcripts from ensembl
        if not demo:
            ensembl_transcripts = fetch_ensembl_transcripts(build=build)
        else:
            # The build 37 demo file is used for both builds
            ensembl_transcripts = get_file_handle(transcripts37_reduced_path)
        # Load the transcripts for a certain build
        load_transcripts(adapter, ensembl_transcripts, build, ensembl_genes)

    # The hpo handles are left as None unless the demo files are used
    hpo_terms_handle = None
    hpo_to_genes_handle = None
    hpo_disease_handle = None
    if demo:
        hpo_terms_handle = get_file_handle(hpoterms_reduced_path)
        hpo_to_genes_handle = get_file_handle(hpo_to_genes_reduced_path)
        hpo_disease_handle = get_file_handle(hpo_phenotype_to_terms_reduced_path)

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
        disease_lines=genemap_lines,
        hpo_disease_lines=hpo_disease_handle,
    )

    # If demo we load a gene panel and some case information
    if demo:
        parsed_panel = parse_gene_panel(
            path=panel_path,
            institute='cust000',
            panel_id='panel1',
            version=1.0,
            display_name='Test panel',
        )
        adapter.load_panel(parsed_panel)

        case_handle = get_file_handle(load_path)
        # yaml.load needs an explicit Loader; calling it without one is
        # deprecated and fails on newer PyYAML versions
        case_data = yaml.load(case_handle, Loader=yaml.FullLoader)
        adapter.load_case(case_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
def setup_scout(adapter, institute_id='cust000', user_name='Clark Kent',
                user_mail='*****@*****.**', api_key=None, demo=False):
    """Reset the database and fill it with what a scout instance needs.

    Every collection whose name does not start with 'system' is dropped.
    After that an institute and an admin user are added, genes and
    transcripts are loaded for builds 37 and 38 and the hpo information is
    loaded. In demo mode the reduced demo files are used instead of fetching
    the resources, and a gene panel and a case are loaded as well. Finally
    the indexes are loaded.

    Args:
        adapter(MongoAdapter)
        institute_id(str): Used as internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Mail of the admin user, also used as user id and as
            sanger recipient of the institute
        api_key(str): Passed on when the mim files are fetched (not used if demo)
        demo(bool): If the demo files should be used

    Raises:
        Exception: Whatever fetching the mim files raised, after logging it
    """
    # ---- Delete previous information ----
    LOG.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        if collection_name.startswith('system'):
            continue
        LOG.info("Deleting collection %s", collection_name)
        adapter.db.drop_collection(collection_name)
    LOG.info("Database deleted")

    # ---- Add a institute ----
    adapter.add_institute(
        build_institute(
            internal_id=institute_id,
            display_name=institute_id,
            sanger_recipients=[user_mail],
        )
    )

    # ---- Add a user ----
    adapter.add_user({
        '_id': user_mail,
        'email': user_mail,
        'name': user_name,
        'roles': ['admin'],
        'institutes': [institute_id],
    })

    # ---- Collect the gene resources ----
    if demo:
        # Read the reduced files into lists since they are used once per build
        mim2gene_lines = list(get_file_handle(mim2gene_reduced_path))
        genemap_lines = list(get_file_handle(genemap2_reduced_path))
        # Genes to hpo information
        hpo_gene_lines = list(get_file_handle(hpogenes_reduced_path))
        # Reduced hgnc information
        hgnc_lines = list(get_file_handle(hgnc_reduced_path))
        # Exac pli score information
        exac_lines = list(get_file_handle(exac_reduced_path))
    else:
        # Fetch the mim files
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise err
        mim2gene_lines = mim_files['mim2genes']
        genemap_lines = mim_files['genemap2']

        # Fetch the genes to hpo information
        hpo_gene_lines = fetch_hpo_genes()
        # Fetch the latest version of the hgnc information
        hgnc_lines = fetch_hgnc()
        # Fetch the latest exac pli score information
        exac_lines = fetch_exac_constraint()

    # ---- Load genes and transcripts ----
    for build in ('37', '38'):
        # Ensembl gene information; the demo file of build 37 is used for both builds
        if demo:
            ensembl_lines = get_file_handle(genes37_reduced_path)
        else:
            ensembl_lines = fetch_ensembl_genes(build=build)

        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Map from ensembl ids to gene objects
        genes_by_ensembl_id = {gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes}

        # Ensembl transcripts; the demo file of build 37 is used for both builds
        if demo:
            ensembl_transcripts = get_file_handle(transcripts37_reduced_path)
        else:
            ensembl_transcripts = fetch_ensembl_transcripts(build=build)

        # Load the transcripts for this build
        load_transcripts(adapter, ensembl_transcripts, build, genes_by_ensembl_id)

    # ---- Load the hpo information ----
    if demo:
        hpo_terms_handle = get_file_handle(hpoterms_reduced_path)
        hpo_to_genes_handle = get_file_handle(hpo_to_genes_reduced_path)
        hpo_disease_handle = get_file_handle(hpo_phenotype_to_terms_reduced_path)
    else:
        # No handles are given to load_hpo outside of demo mode
        hpo_terms_handle = hpo_to_genes_handle = hpo_disease_handle = None

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
        disease_lines=genemap_lines,
        hpo_disease_lines=hpo_disease_handle,
    )

    # ---- Demo only: load a gene panel and some case information ----
    if demo:
        adapter.load_panel(
            parse_gene_panel(
                path=panel_path,
                institute='cust000',
                panel_id='panel1',
                version=1.0,
                display_name='Test panel',
            )
        )

        case_data = yaml.load(get_file_handle(load_path), Loader=yaml.FullLoader)
        adapter.load_case(case_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
def genes(context, build, api_key):
    """Drop all gene and transcript information and load it again.

    Genes and transcripts are reloaded for the given build, or for builds
    37 and 38 when no build is given. An omim api key is required.
    """
    # NOTE(review): if this function is registered as a click command the
    # docstring above is shown as help text, so it is kept short on purpose.
    #
    # context:      object with an 'obj' mapping (holding 'adapter' and
    #               optionally 'omim_api_key') and an abort() method
    # build(str):   build to update; falsy means both '37' and '38'
    # api_key(str): omim api key; falls back to context.obj['omim_api_key']
    LOG.info("Running scout update genes")
    adapter = context.obj['adapter']

    # Without an omim api key the mim files can not be fetched
    api_key = api_key or context.obj.get('omim_api_key')
    if not api_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    # Fetch the omim files before anything is dropped from the database
    try:
        mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
    except Exception as err:
        LOG.warning(err)
        context.abort()

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")

    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    builds = [build] if build else ['37', '38']

    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        ensembl_lines = fetch_ensembl_genes(build=genome_build)

        # Load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map from ensembl id to the gene object that was just loaded
        genes_by_ensembl_id = {gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes}

        # Fetch the transcripts from ensembl and load them
        ensembl_transcripts = fetch_ensembl_transcripts(build=genome_build)
        load_transcripts(adapter, ensembl_transcripts, genome_build, genes_by_ensembl_id)

    adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")