Esempio n. 1
0
def test_fetch_exac_constraint_failed_ftp(variant_clinical_file, mocker):
    """ExAC constraint lines are still returned when urlopen hands back an HTTPError."""

    # GIVEN the URL of the ExAC constraint file in the google bucket
    url = (
        "https://storage.googleapis.com/gnomad-public/legacy/exacv1_downloads/release0.3.1"
        "/manuscript_data/forweb_cleaned_exac_r03_march16_z_data_pLI.txt.gz")

    # GIVEN the raw bytes of a gzipped file
    with open(variant_clinical_file, "rb") as gz_handle:
        gz_bytes = gz_handle.read()

    # GIVEN a patched urlopen whose result is a HTTP 500 error object
    # NOTE(review): ``return_value`` makes the mock *return* the HTTPError
    # instance rather than raise it; if the intent is a raising call,
    # ``side_effect`` would be needed -- confirm against fetch_exac_constraint.
    urlopen_mock = mocker.patch.object(scout_requests.urllib.request, "urlopen")
    urlopen_mock.return_value = HTTPError(url, 500, "Internal Error", {}, None)

    # GIVEN a registered response serving the gzipped bytes from that URL
    responses.add(responses.GET, url, body=gz_bytes, status=200)

    # WHEN fetching the resource
    exac_lines = scout_requests.fetch_exac_constraint()

    # THEN some content is returned
    assert len(exac_lines) > 10
Esempio n. 2
0
def print_exac(out_dir):
    """Write the fetched ExAC constraint lines to a file in ``out_dir``.

    The file is named after the ExAC resource and every fetched line is
    written followed by a newline.

    Args:
        out_dir(Path): Directory where the file is created
    """
    target = out_dir / "fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt"
    LOG.info("Download ExAC gene info to %s", target)
    with target.open("w", encoding="utf-8") as handle:
        # The file is opened before the resource is fetched, as lines are
        # streamed straight into it
        handle.writelines(line + "\n" for line in fetch_exac_constraint())
Esempio n. 3
0
def test_fetch_exac_constraint(exac_file, mocker):
    """Fetching the ExAC constraint file returns lines that start with the ExAC header."""

    # GIVEN the raw bytes of an ExAC file
    with open(exac_file, "rb") as source:
        raw_bytes = source.read()

    # GIVEN a patched urlopen that serves those bytes from a temporary file
    urlopen_mock = mocker.patch.object(scout_requests.urllib.request, "urlopen")
    with tempfile.TemporaryFile() as fake_response:
        fake_response.write(raw_bytes)
        # Rewind so the consumer reads from the start
        fake_response.seek(0)
        urlopen_mock.return_value = fake_response

        # WHEN fetching the resource (while the temporary file is still open)
        data = scout_requests.fetch_exac_constraint()

    # THEN assert that the exac header is there
    header = data[0]
    assert "transcript\tgene" in header
def generate_exac_genes(genes):
    """Generate a reduced ExAC constraint file restricted to the given genes

    The first fetched line is always yielded. After that only the raw
    lines of genes whose hgnc symbol is found in ``genes`` are yielded.

    Args:
        genes(dict): A dictionary with hgnc_symbol as key and hgnc_id as value

    Yields:
        print_line(str): Lines from the reduced file
    """
    lines = fetch_exac_constraint()

    # First line is passed through unconditionally
    yield lines[0]

    for parsed_gene in parse_exac_genes(lines):
        symbol = parsed_gene.get("hgnc_symbol")
        # Entries without a symbol, or with a symbol we do not track, are dropped
        if symbol and symbol in genes:
            yield parsed_gene["raw"]
Esempio n. 5
0
def load_hgnc_genes(
    adapter,
    genes=None,
    ensembl_lines=None,
    hgnc_lines=None,
    exac_lines=None,
    mim2gene_lines=None,
    genemap_lines=None,
    hpo_lines=None,
    build="37",
    omim_api_key="",
):
    """Build gene objects and load them into the database

    When ``genes`` is not given, every resource that was not provided is
    fetched and all resources are merged with link_genes. Genes without a
    chromosome are skipped; the rest are built with build_hgnc_gene and
    loaded in bulk through the adapter.

    Args:
        adapter(scout.adapter.MongoAdapter)
        genes(dict): Already linked gene information. If given, nothing is fetched or linked
        ensembl_lines(iterable(str)): Lines formated with ensembl gene information
        hgnc_lines(iterable(str)): Lines with gene information from genenames.org
        exac_lines(iterable(str)): Lines with information pLi-scores from ExAC
        mim2gene_lines(iterable(str)): Lines with map from omim id to gene symbol
        genemap_lines(iterable(str)): Lines with information of omim entries
        hpo_lines(iterable(str)): Lines information about map from hpo terms to genes
        build(str): What build to use. Defaults to '37'
        omim_api_key(str): Used to fetch the omim files when mim2gene_lines and
            genemap_lines are not both provided. A warning is logged if it is missing

    Returns:
        gene_objects(list): A list with all gene_objects that was loaded into database
    """
    if not genes:
        # Fill in whatever the caller did not supply
        if ensembl_lines is None:
            ensembl_lines = fetch_ensembl_genes(build=build)
        if not hgnc_lines:
            hgnc_lines = fetch_hgnc()
        if not exac_lines:
            exac_lines = fetch_exac_constraint()

        # Both omim resources are needed, otherwise try to fetch both
        if not mim2gene_lines or not genemap_lines:
            if omim_api_key:
                omim_resources = fetch_mim_files(omim_api_key,
                                                 mim2genes=True,
                                                 genemap2=True)
                mim2gene_lines = omim_resources["mim2genes"]
                genemap_lines = omim_resources["genemap2"]
            else:
                LOG.warning("No omim api key provided!")

        if not hpo_lines:
            hpo_lines = fetch_hpo_files(hpogenes=True)["hpogenes"]

        # Merge all sources into one dictionary of gene information
        genes = link_genes(
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            hpo_lines=hpo_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
        )

    gene_objects = []
    missing_coordinates = 0
    total = len(genes)

    with progressbar(genes.values(), label="Building genes",
                     length=total) as gene_bar:
        for gene_info in gene_bar:
            if gene_info.get("chromosome"):
                gene_objects.append(build_hgnc_gene(gene_info, build=build))
                continue

            # No chromosome means no coordinates in this build
            LOG.debug(
                "skipping gene: %s. No coordinates found",
                gene_info.get("hgnc_symbol", "?"),
            )
            missing_coordinates += 1

    LOG.info("Loading genes build %s", build)
    adapter.load_hgnc_bulk(gene_objects)

    LOG.info("Loading done. %s genes loaded", len(gene_objects))
    LOG.info("Nr of genes without coordinates in build %s: %s", build,
             missing_coordinates)

    return gene_objects
Esempio n. 6
0
def setup_scout(
    adapter,
    institute_id="cust000",
    user_name="Clark Kent",
    user_mail="*****@*****.**",
    api_key=None,
    demo=False,
    resource_files=None,
):
    """Wipe the database and populate it with what a scout instance needs.

    WARNING: every collection whose name does not start with "system" is
    dropped before anything is loaded.

    Steps, in order:
        1. Drop existing collections.
        2. Add an institute and an admin user.
        3. Collect OMIM, HPO, HGNC and ExAC lines. Each is read from the
           path in ``resource_files`` when present and fetched otherwise.
           OMIM is only fetched when ``api_key`` is given.
        4. Load cytobands, then genes and transcripts for builds 37 and 38.
        5. Load HPO terms.
        6. If ``demo``, load a demo gene panel and a demo case.
        7. Create the indexes.

    Args:
        adapter(scout.adapter.MongoAdapter)
        institute_id(str): Used as internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Id and email of the admin user, also the sanger recipient
        api_key(str): OMIM api key, used if the omim files are not in resource_files
        demo(bool): If True the demo resource files replace ``resource_files``
        resource_files(dict): Maps resource names (e.g. "hgnc_path") to file paths
    """

    LOG.info("Check if there was a database, delete if existing")
    dropped_any = False
    for name in adapter.db.list_collection_names():
        if not name.startswith("system"):
            LOG.info("Deleting collection %s", name)
            adapter.db.drop_collection(name)
            dropped_any = True

    if dropped_any:
        LOG.info("Database deleted")

    adapter.add_institute(
        build_institute(
            internal_id=institute_id,
            display_name=institute_id,
            sanger_recipients=[user_mail],
        )
    )

    adapter.add_user(
        {
            "_id": user_mail,
            "email": user_mail,
            "name": user_name,
            "roles": ["admin"],
            "institutes": [institute_id],
        }
    )

    # Demo mode overrides any resource files given by the caller
    resource_files = demo_files if demo else (resource_files or {})

    # OMIM: both files must be available locally, otherwise fall back on the api
    mim2gene_lines = genemap_lines = None
    mim2gene_path = resource_files.get("mim2gene_path")
    genemap_path = resource_files.get("genemap_path")
    if genemap_path and mim2gene_path:
        mim2gene_lines = list(get_file_handle(mim2gene_path))
        genemap_lines = list(get_file_handle(genemap_path))

    if genemap_lines is None and api_key:
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise err
        mim2gene_lines = mim_files["mim2genes"]
        genemap_lines = mim_files["genemap2"]

    # HPO genes, HGNC and ExAC: read from file if a path exists, else fetch
    hpogenes_path = resource_files.get("hpogenes_path")
    hpo_gene_lines = (
        list(get_file_handle(hpogenes_path))
        if hpogenes_path
        else fetch_genes_to_hpo_to_disease()
    )

    hgnc_path = resource_files.get("hgnc_path")
    hgnc_lines = list(get_file_handle(hgnc_path)) if hgnc_path else fetch_hgnc()

    exac_path = resource_files.get("exac_path")
    exac_lines = (
        list(get_file_handle(exac_path)) if exac_path else fetch_exac_constraint()
    )

    # Load cytobands into cytoband collection
    for cytoband_build, cytoband_path in cytoband_files.items():
        load_cytobands(cytoband_path, cytoband_build, adapter)

    for build in ("37", "38"):
        genes_key = "genes{}_path".format(build)
        ensembl_gene_lines = (
            get_file_handle(resource_files[genes_key])
            if resource_files.get(genes_key)
            else fetch_ensembl_genes(build=build)
        )

        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Map ensembl ids to the gene objects that were just loaded
        genes_by_ensembl_id = {
            gene_obj["ensembl_id"]: gene_obj for gene_obj in hgnc_genes
        }

        tx_key = "transcripts{}_path".format(build)
        ensembl_transcripts = (
            get_file_handle(resource_files[tx_key])
            if resource_files.get(tx_key)
            else fetch_ensembl_transcripts(build=build)
        )
        # Load the transcripts for a certain build
        load_transcripts(adapter, ensembl_transcripts, build,
                         genes_by_ensembl_id)

    hpo_terms_handle = (
        get_file_handle(resource_files["hpoterms_path"])
        if resource_files.get("hpoterms_path")
        else None
    )

    hpo_to_genes_handle = (
        get_file_handle(resource_files["hpo_to_genes_path"])
        if resource_files.get("hpo_to_genes_path")
        else None
    )

    # NOTE(review): this handle is opened but not passed on to load_hpo below
    hpo_disease_handle = (
        get_file_handle(resource_files["hpo_disease_path"])
        if resource_files.get("hpo_disease_path")
        else None
    )

    load_hpo(
        adapter=adapter,
        disease_lines=genemap_lines,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
    )

    # If demo we load a gene panel and some case information
    if demo:
        adapter.load_panel(
            parse_gene_panel(
                path=panel_path,
                institute="cust000",
                panel_id="panel1",
                version=1.0,
                display_name="Test panel",
            )
        )

        case_config = yaml.load(get_file_handle(load_path),
                                Loader=yaml.FullLoader)
        adapter.load_case(parse_case_data(config=case_config))

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
Esempio n. 7
0
def genes(build, api_key):
    """
    Drop and reload genes and transcripts in the mongo database.

    The OMIM files are fetched when an api key is given as argument or found
    in the app config under "OMIM_API_KEY"; the command aborts if that fetch
    fails. Existing genes and transcripts for ``build`` are dropped, then
    genes and transcripts are loaded for ``build``, or for both "37" and
    "38" when no build is given. Finally the indexes are updated.
    """
    LOG.info("Running scout update genes")
    adapter = store

    # Fetch the omim information
    api_key = api_key or current_app.config.get("OMIM_API_KEY")
    mim_files = {}
    if api_key:
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise click.Abort()
    else:
        LOG.warning("No omim api key provided, Please not that some information will be missing")

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")
    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    # Resources shared by all builds are fetched once, up front
    hpo_genes = fetch_genes_to_hpo_to_disease()
    builds = [build] if build else ["37", "38"]
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        # load the genes
        loaded_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=fetch_ensembl_genes(build=genome_build),
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files.get("mim2genes"),
            genemap_lines=mim_files.get("genemap2"),
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Transcripts are linked to genes through the ensembl id
        genes_by_ensembl_id = {gene_obj["ensembl_id"]: gene_obj for gene_obj in loaded_genes}

        # Fetch the transcripts from ensembl
        ensembl_transcripts = fetch_ensembl_transcripts(build=genome_build)

        load_transcripts(adapter, ensembl_transcripts, genome_build, genes_by_ensembl_id)

    adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")