def hpo(out_dir, terms, genes, disease):
    """Download the HPO resources, or print a single one to the terminal.

    When none of ``terms``, ``genes`` or ``disease`` is set, all three HPO
    resources are fetched, ``out_dir`` is created if missing, and the fetched
    data is handed to ``print_hpo`` together with that directory.

    When at least one flag is set, the flags are forwarded to
    ``fetch_hpo_files`` and the lines of one resource are echoed instead:
    ``terms`` wins over ``genes``, which wins over the phenotype-to-genes
    resource.
    """
    if not (terms or genes or disease):
        # No filter given: fetch everything and pass it on with the directory
        hpo_info = fetch_hpo_files(
            genes_to_phenotype=True,
            phenotype_to_genes=True,
            hpo_terms=True,
        )
        out_dir = pathlib.Path(out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        LOG.info("Download HPO resources to %s", out_dir)
        print_hpo(out_dir, hpo_info)
        return

    hpo_info = fetch_hpo_files(
        genes_to_phenotype=genes,
        phenotype_to_genes=disease,
        hpo_terms=terms,
    )

    # Only one resource is printed, chosen by flag priority
    if terms:
        resource = "hpo_terms"
    elif genes:
        resource = "genes_to_phenotype"
    else:
        resource = "phenotype_to_genes"

    for line in hpo_info[resource]:
        click.echo(line)
def test_fetch_hpo_files(phenotype_to_genes_file, hpo_genes_file):
    """Test that fetching HPO files returns a dict."""
    # GIVEN a registered response for the URL of each of two HPO files,
    # with the content of the matching fixture file as body
    downloads = (
        ("phenotype_to_genes.txt", phenotype_to_genes_file),
        ("genes_to_phenotype.txt", hpo_genes_file),
    )
    for remote_name, fixture_path in downloads:
        url = scout_requests.HPO_URL.format(remote_name)
        with open(fixture_path, "r") as handle:
            body = handle.read()
        responses.add(responses.GET, url, body=body, status=200)

    # WHEN fetching both HPO files
    fetched = scout_requests.fetch_hpo_files(
        genes_to_phenotype=True,
        phenotype_to_genes=True,
    )

    # THEN assert that a dict is returned
    assert isinstance(fetched, dict)
def generate_hpo_files(genes):
    """Generate files with reduced HPO information.

    The HPO resources are fetched with ``fetch_hpo_files``. For each of them,
    the first line (the header) and every following line whose symbol column
    is found in ``genes`` are written to the matching reduced-file path.
    Lines that are at most one character long after stripping trailing
    whitespace are skipped.

    Args:
        genes: Container supporting ``in`` lookups of the symbols to keep
            (presumably HGNC symbols; verify against the caller).
    """
    hpo_files = fetch_hpo_files(
        hpogenes=True, hpoterms=True, phenotype_to_terms=True, hpodisease=False
    )

    # Per resource: (output path, index of the tab-separated column that
    # holds the symbol to match against ``genes``)
    targets = {
        "hpogenes": (hpogenes_reduced_path, 1),
        "hpoterms": (hpoterms_reduced_path, 3),
        "phenotype_to_terms": (hpo_phenotype_to_terms_reduced_path, 1),
    }

    for name, (out_path, symbol_column) in targets.items():
        hpo_lines = hpo_files[name]
        # The context manager closes (and flushes) the file even if a line
        # fails to parse; the file handle was previously never closed.
        with open(out_path, "w") as outfile:
            LOG.info("Writing file %s", out_path)
            for i, line in enumerate(hpo_lines):
                line = line.rstrip()
                if len(line) <= 1:
                    continue
                if i == 0:  # Header line
                    outfile.write(line + "\n")
                    continue
                hgnc_symbol = line.split("\t")[symbol_column]
                if hgnc_symbol in genes:
                    outfile.write(line + "\n")
        LOG.info("File ready")
def load_hgnc_genes(
    adapter,
    genes=None,
    ensembl_lines=None,
    hgnc_lines=None,
    exac_lines=None,
    mim2gene_lines=None,
    genemap_lines=None,
    hpo_lines=None,
    build="37",
    omim_api_key="",
):
    """Build gene objects and load them into the database.

    Unless already parsed ``genes`` are given, every resource that was not
    passed in is fetched and all resources are merged with ``link_genes``.
    Genes without a "chromosome" value are skipped; the rest are built with
    ``build_hgnc_gene`` and bulk loaded through the adapter.

    Args:
        adapter(scout.adapter.MongoAdapter)
        genes(dict): If genes are already parsed
        ensembl_lines(iterable(str)): Lines formatted with ensembl gene information
        hgnc_lines(iterable(str)): Lines with gene information from genenames.org
        exac_lines(iterable(str)): Lines with pLi-score information from ExAC
        mim2gene_lines(iterable(str)): Lines with map from omim id to gene symbol
        genemap_lines(iterable(str)): Lines with information of omim entries
        hpo_lines(iterable(str)): Lines with information about map from hpo terms to genes
        build(str): What build to use. Defaults to '37'
        omim_api_key(str): Passed to ``fetch_mim_files`` when ``mim2gene_lines``
            or ``genemap_lines`` is missing. If empty, a warning is logged and
            the OMIM files are not fetched.

    Returns:
        gene_objects(list): A list with all gene objects that were loaded into the database
    """
    if not genes:
        # Fetch whatever resources the caller did not supply
        if ensembl_lines is None:
            ensembl_lines = fetch_ensembl_genes(build=build)
        if not hgnc_lines:
            hgnc_lines = fetch_hgnc()
        if not exac_lines:
            exac_lines = fetch_exac_constraint()

        if not (mim2gene_lines and genemap_lines):
            if omim_api_key:
                mim_files = fetch_mim_files(omim_api_key, mim2genes=True, genemap2=True)
                mim2gene_lines = mim_files["mim2genes"]
                genemap_lines = mim_files["genemap2"]
            else:
                LOG.warning("No omim api key provided!")

        if not hpo_lines:
            hpo_lines = fetch_hpo_files(hpogenes=True)["hpogenes"]

        # Link the resources
        genes = link_genes(
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            hpo_lines=hpo_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
        )

    gene_objects = []
    missing_coordinates = 0
    total = len(genes)

    with progressbar(genes.values(), label="Building genes", length=total) as gene_bar:
        for gene_info in gene_bar:
            if gene_info.get("chromosome"):
                gene_objects.append(build_hgnc_gene(gene_info, build=build))
                continue

            # Genes without a chromosome are counted but never loaded
            LOG.debug(
                "skipping gene: %s. No coordinates found",
                gene_info.get("hgnc_symbol", "?"),
            )
            missing_coordinates += 1

    LOG.info("Loading genes build %s", build)
    adapter.load_hgnc_bulk(gene_objects)

    LOG.info("Loading done. %s genes loaded", len(gene_objects))
    LOG.info("Nr of genes without coordinates in build %s: %s", build, missing_coordinates)

    return gene_objects