Esempio n. 1
0
def test_metafile_provided_not_supported_anymore():
    """``read_bgen`` must raise ``RuntimeError`` when ``metafile_filepath`` is passed."""
    with pytest.raises(RuntimeError):
        # Paths are resolved inside the context, in the same order as the call needs them.
        bgen_path = example_filepath("haplotypes.bgen")
        metafile_path = example_filepath("haplotypes.bgen.metadata.valid")
        read_bgen(bgen_path, metafile_filepath=metafile_path, verbose=False)
Esempio n. 2
0
def test_bgen_samples_outside_bgen_unreadable(tmp_path):
    """Reading with a samples file that cannot be read raises ``PermissionError``.

    The sample file is copied into ``tmp_path`` so the permission change made by
    ``noread_permission`` is applied to the copy.
    """
    bgen_path = example_filepath("complex.23bits.bgen")
    sample_copy = tmp_path / "complex.sample"
    copyfile(example_filepath("complex.sample"), sample_copy)
    with noread_permission(sample_copy), pytest.raises(PermissionError):
        read_bgen(bgen_path, samples_filepath=sample_copy, verbose=False)
Esempio n. 3
0
def test_bgen_reader_with_wrong_metadata_file():
    """``read_bgen`` raises ``RuntimeError`` when given the ``wrong.metadata`` file."""
    bgen_path = example_filepath("example.32bits.bgen")
    bgen_path.touch()

    bad_metafile = example_filepath("wrong.metadata")
    # Touched after the BGEN file so the metafile carries the later timestamp
    # (otherwise, it might be re-created).
    bad_metafile.touch()

    with pytest.raises(RuntimeError):
        read_bgen(bgen_path, metafile_filepath=bad_metafile, verbose=False)
Esempio n. 4
0
def test_metafile_not_provided_no_permission_to_create(tmp_path):
    """``read_bgen`` emits a ``UserWarning`` when the BGEN's folder is not writable."""
    original = example_filepath("haplotypes.bgen")
    bgen_copy = tmp_path / "haplotypes.bgen"
    copyfile(original, bgen_copy)

    parent_dir = os.path.dirname(bgen_copy)
    with nowrite_permission(parent_dir), pytest.warns(UserWarning):
        read_bgen(bgen_copy, verbose=False)
Esempio n. 5
0
def test_bgen_reader_with_nonexistent_metadata_file():
    """A metafile path that does not exist leads to ``FileNotFoundError``.

    The call runs inside both a ``pytest.raises(FileNotFoundError)`` and a
    ``pytest.warns(UserWarning)`` context, the former being the outer one.
    """
    bgen_path = example_filepath("example.32bits.bgen")
    missing_metafile = os.path.join(os.path.dirname(bgen_path), "nonexistent.metadata")

    with pytest.raises(FileNotFoundError), pytest.warns(UserWarning):
        read_bgen(bgen_path, verbose=False, metafile_filepath=missing_metafile)
Esempio n. 6
0
def test_allele_expectation_interface():
    """Exercise ``allele_expectation`` on a failing and on a succeeding input."""
    # Variant 1 of haplotypes.bgen is expected to be rejected with ValueError.
    data = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    with pytest.raises(ValueError):
        allele_expectation(data, 1)

    # Variant 3 of complex.23bits.bgen yields one row of three values per sample.
    data = read_bgen(example_filepath("complex.23bits.bgen"), verbose=False)
    expectation = allele_expectation(data, 3)
    expected_rows = [
        [1.0, 0.0, 0.0],
        [2.0, 0.0, 0.0],
        [1.0, 1.0, 0.0],
        [0.0, 2.0, 0.0],
    ]
    assert_allclose(expectation, expected_rows)
Esempio n. 7
0
def test_bgen_reader_phased_genotype():
    """Check variants, samples and genotype probabilities of ``haplotypes.bgen``."""
    bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    variants = bgen["variants"]
    samples = bgen["samples"]

    columns = ("chrom", "id", "nalleles", "allele_ids", "pos", "rsid")
    expected_rows = (
        (0, ("1", "SNP1", 2, "A,G", 1, "RS1")),
        (2, ("1", "SNP3", 2, "A,G", 3, "RS3")),
    )
    for index, values in expected_rows:
        # The selected row is computed once, then compared column by column.
        row = variants.loc[index].compute()
        for column, value in zip(columns, values):
            assert_equal(row[column].values[0], value)

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[2], "sample_2")
    last_label = samples.shape[0] - 1
    assert_equal(samples.loc[last_label], "sample_3")

    # First sample of the first variant.
    first = bgen["genotype"][0].compute()
    assert_allclose(first["probs"][0], [1.0, 0.0, 1.0, 0.0])

    # Last sample of the last variant.
    variant_count = len(variants)
    sample_count = len(samples)
    final = bgen["genotype"][variant_count - 1].compute()
    assert_allclose(final["probs"][sample_count - 1], [1.0, 0.0, 0.0, 1.0])
def test_large_file(large_bgen_filepath):
    """Spot-check variants, samples and one genotype of the large BGEN fixture."""
    data = read_bgen(large_bgen_filepath, verbose=True)
    variants = data["variants"]
    samples = data["samples"]
    genotype = data["genotype"]

    # Variants on chromosome "3" with 250000 < pos < 290000.
    on_chrom = variants["chrom"] == "3"
    above = variants["pos"] > 250000
    below = variants["pos"] < 290000
    df = variants[on_chrom & above & below].compute()
    assert_equal(len(df), 14)

    expected_first = (
        ("id", ""),
        ("rsid", "Human_STR_911968"),
        ("nalleles", 2),
        ("allele_ids", "<CNV>,CACAC"),
        ("vaddr", 73115140),
    )
    for column, value in expected_first:
        assert_equal(df.iloc[0][column], value)

    assert_equal(samples[382], "SAMEA2586986")
    first_index = df.index[0]
    assert_equal(first_index, 283970)

    g = genotype[first_index].compute()
    probs = g["probs"]
    assert_equal(probs.shape, (415, 3))
    assert_allclose(probs[410, :], [0.0, 0.4268863965819791, 0.5731136034180209])
    assert_equal(g["phased"], False)
    assert_equal(g["missing"][3], False)
    assert_equal(g["ploidy"][3], 2)
Esempio n. 9
0
def test_custom_meta_path():
    """After ``custom_meta_path``, both readers leave their files in that folder.

    The folder is expected to hold exactly two entries after ``read_bgen`` and
    ``open_bgen`` ran; both are removed again at the end of the test.
    """
    filepath = example_filepath2("example.bgen")
    custom_path = Path(BGEN_READER_CACHE_HOME, "submeta")

    # An already existing folder is fine.
    try:
        os.mkdir(custom_path)
    except FileExistsError:
        pass

    custom_meta_path(custom_path)
    read_bgen(filepath)
    open_bgen(filepath)

    written = list(Path(custom_path).iterdir())
    assert len(written) == 2, "Failed to write files to custom directory"

    for suffix in (".metadata2.mmm", ".metafile"):
        os.remove(Path(custom_path, filepath.name + suffix))
Esempio n. 10
0
def test_bgen_reader():
    """Check variant and sample tables read from ``example.32bits.bgen`` (bytes path)."""
    here = os.path.dirname(os.path.abspath(__file__)).encode()
    bgen = read_bgen(os.path.join(here, b"example.32bits.bgen"), verbose=False)
    variants = bgen['variants']
    samples = bgen['samples']
    # Not used below; the lookup itself fails if the key is absent.
    genotype = bgen['genotype']

    columns = ('chrom', 'id', 'nalleles', 'allele_ids', 'pos', 'rsid')

    def check_variant(index, values):
        # Compare one row of the variants table, column by column.
        for column, value in zip(columns, values):
            assert_equal(variants.loc[index, column], value)

    check_variant(0, ('01', 'SNPID_2', 2, "A,G", 2000, 'RSID_2'))
    check_variant(7, ('01', 'SNPID_9', 2, "A,G", 9000, 'RSID_9'))
    check_variant(variants.shape[0] - 1,
                  ('01', 'SNPID_200', 2, "A,G", 100001, 'RSID_200'))

    assert_equal(samples.loc[0, 'id'], 'sample_001')
    assert_equal(samples.loc[7, 'id'], 'sample_008')
    last_sample = samples.shape[0] - 1
    assert_equal(samples.loc[last_sample, 'id'], 'sample_500')
Esempio n. 11
0
    def process_metadata(self, metadata_file, show_progress=False):
        """Register non-withdrawn individuals and record which samples to keep.

        Reads the withdrawn sample IDs from ``ukbb_withdrawn.csv`` (one integer
        per line, relative to the working directory) and the metadata table from
        ``metadata_file``. For every metadata row whose ``Order`` is not null and
        whose ``SampleID`` is not withdrawn, an individual with ``ploidy=2`` is
        added to ``self.samples`` and the indices ``2 * order`` and
        ``2 * order + 1`` are appended to the keep list.

        Sets ``self.num_samples`` and ``self.keep_index``.

        Parameters
        ----------
        metadata_file
            CSV path or buffer accepted by ``pandas.read_csv``; must provide the
            ``Order`` and ``SampleID`` columns.
        show_progress : bool
            When true, a tqdm progress bar is displayed while iterating rows.
        """
        # TODO Should make this an explicit requirement rather than hardcoding.
        with open("ukbb_withdrawn.csv") as f:
            # Blank lines (e.g. a trailing newline) are skipped rather than
            # letting int() raise ValueError on them.
            withdrawn_ids = {int(line) for line in f if line.strip()}

        # The sample IDs aren't in the BGEN file so we have to match by the Order
        # field, which gives the order that each sample is at in the BGEN (0 based).
        metadata_df = (
            pd.read_csv(metadata_file).sort_values(by="Order").set_index("Order")
        )

        # The returned data is not used in this method; the call is kept so that
        # the BGEN file is still opened exactly as before.
        # NOTE(review): drop this call if opening the file here is not required.
        bgen_reader.read_bgen(self.data_file, verbose=False)

        keep_samples = []
        row_iter = tqdm.tqdm(metadata_df.iterrows(),
                             total=len(metadata_df),
                             disable=not show_progress)
        for index, row in row_iter:
            if pd.isnull(index):
                continue
            order = int(index)
            if int(row.SampleID) in withdrawn_ids:
                continue
            # Two consecutive entries per individual (ploidy 2).
            keep_samples.extend([2 * order, 2 * order + 1])
            metadata = {
                k: (None if pd.isnull(v) else str(v)) for k, v in row.items()
            }
            self.samples.add_individual(ploidy=2, metadata=metadata)
        self.num_samples = len(keep_samples)
        self.keep_index = np.array(keep_samples, dtype=int)
Esempio n. 12
0
def read_bgen(filename: str, verbose: bool = False) -> Dict:
    """Wrapper of bgen_reader.read_bgen that checks if sample ids make sense.

    If the first sample id is the generic ``'sample_0'``, the BGEN file is read
    again together with a sibling ``.sample`` file, and ``bgen['samples']`` is
    replaced by the ``ID_2`` column of that file (its first data row skipped)
    as a NumPy array.

    Parameters
    ----------
    filename
        Path to the BGEN file. The ``.sample`` path is derived by replacing the
        last four characters with ``'sample'``, so the name is assumed to end
        in ``bgen``.
    verbose
        Forwarded to ``bgen_reader.read_bgen``.

    Returns
    -------
    The dictionary returned by ``bgen_reader.read_bgen``, possibly with the
    ``'samples'`` entry replaced.

    Raises
    ------
    FileNotFoundError
        If the sample ids are generic and the derived ``.sample`` file is missing.
    """
    bgen = bgen_reader.read_bgen(filename, verbose=verbose)
    if bgen['samples'][0] != 'sample_0':
        return bgen

    print(
        'WARNING: Sample ids in bgen file are generic. Trying to read the corresponding .sample file ...'
    )
    samples_filepath = filename[:-4] + 'sample'
    if not path.exists(samples_filepath):
        raise FileNotFoundError(f'{samples_filepath} does not exist.')
    bgen = bgen_reader.read_bgen(filename,
                                 verbose=verbose,
                                 samples_filepath=samples_filepath)
    # Raw string: '\s' is not a valid escape in a normal string literal and
    # triggers a warning on recent Python versions.
    sample_table = pd.read_csv(samples_filepath, sep=r'\s+')
    bgen['samples'] = sample_table['ID_2'][1:].to_numpy()
    return bgen
Esempio n. 13
0
def test_bgen_reader_convert_to_dosage():
    """``convert_to_dosage`` reproduces two known dosage values of the example file."""
    here = os.path.dirname(os.path.abspath(__file__)).encode()
    bgen = read_bgen(os.path.join(here, b"example.32bits.bgen"), verbose=False)

    dosage = convert_to_dosage(bgen['genotype'], verbose=False)

    observed = dosage[0, 1:3]
    assert_allclose(observed, array([1.93575854, 1.91558579]), rtol=1e-5)
Esempio n. 14
0
def read(filepath,
         size=50,
         verbose=True,
         metadata_file=True,
         sample_file=None):
    r"""Print a notice and read a BGEN file through ``bgen_reader.read_bgen``.

    All arguments are forwarded unchanged to ``bgen_reader.read_bgen``.

    Parameters
    ----------
    filepath : str
        A BGEN file path.
    size : float, optional
        Chunk size in megabytes. Defaults to ``50``.
    verbose : bool, optional
        ``True`` to show progress; ``False`` otherwise.
    metadata_file : bool, str, optional
        Source of the variants metadata. Defaults to ``True``.

        - ``True``: try to read it from ``filepath + ".metadata"``; if that is
          not possible, read it from the BGEN file itself. If the metadata file
          does not exist, try to create it under that name to speed up reads.
        - ``False``: read it only from the BGEN file.
        - a file path: the given metadata file is assumed to be valid and
          readable, and the metadata is read from that file only.
    sample_file : str, optional
        A sample file in `GEN format <http://www.stats.ox.ac.uk/~marchini/software/gwas/file_format.html>`_.
        When given, sample IDs are read from it; otherwise they are read from
        the BGEN file itself if present. Defaults to ``None``.

    Returns
    -------
    variants : :class:`pandas.DataFrame`
        Variant position, chromosomes, RSIDs, etc.
    samples : :class:`pandas.DataFrame`
        Sample identifications.
    genotype : :class:`dask.array.Array`
        Array of genotype references.
    X : :class:`dask.array.Array`
        Allele probabilities.

    Note
    ----
    A metadata file can make later reads much faster, but users often lack
    write permission for the default location ``filepath + ".metadata"``.
    :func:`limix.io.bgen.create_metadata_file` creates one at a chosen path.
    """
    from bgen_reader import read_bgen

    print("Reading {}...".format(filepath))
    options = dict(
        size=size,
        verbose=verbose,
        metadata_file=metadata_file,
        sample_file=sample_file,
    )
    return read_bgen(filepath, **options)
Esempio n. 15
0
def test_read_bgem_interface():
    """The result of ``read_bgen`` is a dict whose entries have the expected types."""
    bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    assert isinstance(bgen, dict)

    expected_types = (
        ("variants", dd.DataFrame),
        ("samples", Series),
        ("genotype", list),
    )
    for key, expected in expected_types:
        assert isinstance(bgen[key], expected)

    # Each genotype entry is a lazy (delayed) object.
    assert isinstance(bgen["genotype"][0], Delayed)
Esempio n. 16
0
def test_allele_frequency_interface():
    """Exercise ``allele_frequency`` and the error paths around it.

    Covers: ``allele_expectation`` rejecting variant 1 of ``complex.23bits.bgen``,
    the frequency computed from the expectation of variant 3, the same
    frequency computed from a literal expectation matrix, and
    ``allele_frequency`` rejecting a one-dimensional input.
    """
    filepath = example_filepath("complex.23bits.bgen")

    # Read outside the `raises` block so that only allele_expectation may
    # satisfy the expected ValueError; a failure while reading must not make
    # this test pass.
    bgen = read_bgen(filepath, verbose=False)
    with pytest.raises(ValueError):
        allele_expectation(bgen, 1)

    bgen = read_bgen(filepath, verbose=False)
    expec = allele_expectation(bgen, 3)
    freq = allele_frequency(expec)
    assert_allclose(freq, [1.33333333333, 1.0, 0.0])

    freq = allele_frequency([[1.0, 0.0, 0.0], [2.0, 0.0, 0.0], [1.0, 1.0, 0.0],
                             [0.0, 2.0, 0.0]])
    assert_allclose(freq, [1.33333333333, 1.0, 0.0])

    with pytest.raises(ValueError):
        allele_frequency([2, 3, 1])
Esempio n. 17
0
def make_dosage(f):
    """Read BGEN file ``f`` and return ``(dosage, bgen)``.

    The genotype array is restricted along its second axis with the
    module-level ``mask`` and cast to the module-level ``dtype``. The dosage is
    the entry at index 1 of the last axis plus twice the entry at index 2.
    The sample file comes from the module-level ``samplefile``.
    """
    bgen = bgen_reader.read_bgen(
        f, size=200, sample_file=samplefile, verbose=False)
    probs = bgen['genotype'][:, mask, :].astype(dtype)
    one_copy = probs[:, :, 1]
    two_copies = probs[:, :, 2]
    return one_copy + 2. * two_copies, bgen
Esempio n. 18
0
def bgen_file_geno_lines(file, variant_mapping = None, force_colon = False, use_rsid=False, whitelist=None, skip_palindromic=False, liftover_conversion=None):
    """Yield one dosage record per retained variant of a BGEN file.

    Parameters
    ----------
    file
        Path passed to ``bgen_reader.read_bgen``.
    variant_mapping : dict, optional
        Maps the variant id to the id to report. Variants absent from the
        mapping are skipped. A truthy non-dict value raises ``RuntimeError``.
    force_colon : bool
        Replace every ``"_"`` in the variant id with ``":"``.
    use_rsid : bool
        Use the variant's ``rsid`` instead of its ``id``.
    whitelist : container, optional
        When truthy, only (possibly remapped) ids contained in it are yielded.
    skip_palindromic : bool
        Skip variants for which ``Genomics.is_palindromic`` is true.
    liftover_conversion : callable, optional
        Called as ``liftover_conversion(chrom, pos)`` and must return the new
        ``(chrom, pos)``; variants for which either is ``"NA"`` are skipped.

    Yields
    ------
    tuple
        ``(varid, chrom, pos, allele_0, allele_1, mean(dosage) / 2)`` followed
        by the per-sample dosages.

    Notes
    -----
    Variants with more than two alleles are skipped (and logged). For phased
    genotypes the dosage is ``probs[:, 1] + probs[:, 3]``; otherwise it is
    ``probs[:, 1] + 2 * probs[:, 2]``.
    """
    logging.log(9, "Processing bgen %s", file)
    bgen = bgen_reader.read_bgen(file)
    variants = bgen["variants"]
    # isinstance also accepts dict subclasses (e.g. OrderedDict) as mappings.
    dict_mapping = isinstance(variant_mapping, dict)
    for variant in variants.itertuples():
        varid = variant.rsid if use_rsid else variant.id

        if force_colon:
            varid = varid.replace("_", ":")

        alleles = variant.allele_ids.split(",")
        if len(alleles) > 2:
            logging.info("variant %s is multiallelic, skipping", varid)
            continue
        allele_0, allele_1 = alleles[0], alleles[1]
        if skip_palindromic and Genomics.is_palindromic(allele_0, allele_1):
            continue

        pos = variant.pos
        chrom = variant.chrom
        if liftover_conversion:
            chrom, pos = liftover_conversion(chrom, pos)
            if chrom == "NA" or pos == "NA":
                continue

        if variant_mapping:
            if not dict_mapping:
                raise RuntimeError("BGEN code doessn't support variant mapping through a method")
            if varid not in variant_mapping:
                continue
            varid = variant_mapping[varid]
        # subtlety: even though we replace the variant id,
        # the alleles in the genotype might be swapped respect the variant in the mapping
        # You should verify if you must match it

        if whitelist and varid not in whitelist:
            continue

        v = bgen["genotype"][variant.Index].compute()
        # `numpy.float` was removed from NumPy (1.24); the builtin `float` is
        # the same dtype it used to alias.
        probs = numpy.array(v["probs"], dtype=float)
        # Column arithmetic replaces the per-row lambda; the values are the same.
        if v["phased"]:
            d = probs[:, 1] + probs[:, 3]
        else:
            d = probs[:, 1] + probs[:, 2] * 2

        yield (varid, chrom, pos, allele_0, allele_1, numpy.mean(d)/2) + tuple(d)
Esempio n. 19
0
def test_bgen_reader_without_metadata():
    """Read ``example.32bits.bgen`` and spot-check one variant and the last sample."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)
    variants = bgen["variants"].compute()
    samples = bgen["samples"]

    assert "genotype" in bgen
    assert_equal(variants.loc[7, "allele_ids"], "A,G")

    last_sample = samples.shape[0] - 1
    assert_equal(samples.loc[last_sample], "sample_500")
Esempio n. 20
0
def test_bgen_samples_specify_samples_file():
    """Sample ids come from the explicitly supplied ``complex.sample`` file."""
    bgen_path = example_filepath("complex.23bits.bgen")
    sample_path = example_filepath("complex.sample")
    data = read_bgen(bgen_path, samples_filepath=sample_path, verbose=False)

    expected = Series(
        ["sample_0", "sample_1", "sample_2", "sample_3"],
        dtype=str,
        name="id",
    )
    assert all(data["samples"] == expected)
Esempio n. 21
0
def run(path):
    """Compute every variant's genotype of the BGEN at ``path`` and count entries.

    Prints the number of variants found, then the accumulated first-axis
    length of all ``probs`` arrays.
    """
    bgen = read_bgen(path, verbose=False)
    n = len(bgen["genotype"])
    print(f'Found {n} variants')

    ct = 0
    for i in tqdm.tqdm(range(n), total=n):
        probs = bgen["genotype"][i].compute()['probs']
        # geno['probs'] is (n_samples, n_genotypes), i.e. (486443, 3) for UKB
        assert probs.ndim == 2
        ct += probs.shape[0]
    print(f'Number of entries read: {ct}')
Esempio n. 22
0
def test_dosage_example_32bits():
    """Check values, NaN handling and shape of ``allele_expectation`` results."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)

    # Known expectation of sample 7 at variant 5.
    at_variant_5 = allele_expectation(bgen, 5)
    assert_allclose(at_variant_5[7], [1.9556273911044997, 0.044372608895500334])

    # Sample 0 of variant 0 yields only NaNs.
    at_variant_0 = allele_expectation(bgen, 0)
    assert all(isnan(at_variant_0[0]))

    # Shape check on a fresh evaluation of the same variant.
    at_variant_0 = allele_expectation(bgen, 0)
    assert_equal(at_variant_0.shape, (500, 2))
Esempio n. 23
0
def test_bgen_reader_complex():
    """Check variants, samples, probabilities, ploidy and phasing of ``complex.23bits.bgen``."""
    bgen = read_bgen(example_filepath("complex.23bits.bgen"), verbose=False)
    variants = bgen["variants"].compute()
    samples = bgen["samples"]
    assert "genotype" in bgen

    columns = ("chrom", "id", "nalleles", "allele_ids", "pos", "rsid")

    def check_variant(index, values):
        # Compare one row of the variants table, column by column.
        for column, value in zip(columns, values):
            assert_equal(variants.loc[index, column], value)

    check_variant(0, ("01", "", 2, "A,G", 1, "V1"))
    check_variant(7, ("01", "", 7, "A,G,GT,GTT,GTTT,GTTTT,GTTTTT", 8, "M8"))
    check_variant(variants.shape[0] - 1, ("01", "", 2, "A,G", 10, "M10"))

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[3], "sample_3")

    genotype = bgen["genotype"]

    # First variant, first sample: two values followed by NaN.
    probs = genotype[0].compute()["probs"][0]
    assert_allclose(probs[:2], [1, 0])
    assert isnan(probs[2])

    # First variant, second sample.
    probs = genotype[0].compute()["probs"][1]
    assert_allclose(probs[:3], [1, 0, 0])

    # Last variant, last sample.
    probs = genotype[-1].compute()["probs"][-1]
    assert_allclose(probs[:5], [0, 0, 0, 1, 0])

    for index, expected_ploidy in ((0, [1, 2, 2, 2]), (-1, [4, 4, 4, 4])):
        assert_allclose(genotype[index].compute()["ploidy"], expected_ploidy)

    phased = array(
        [genotype[i].compute()["phased"] for i in range(len(variants))]
    )
    assert_equal(phased.dtype.name, "bool")
    ideal = array([False, True, True, False, True, True, True, True, False, False])
    assert array_equal(phased, ideal)
Esempio n. 24
0
    def process_sites(self, show_progress=False, max_sites=None):
        """Convert the BGEN variants into sites of ``self.samples``.

        Variant metadata (allele count, position, rsid, allele ids) is read via
        ``bgen_reader``; probabilities are read per variant through
        ``simplebgen.BgenReader``. Variants without an ancestral state are
        ignored. The others are either counted in one of
        ``self.num_non_biallelic``, ``self.num_indels``, ``self.num_invariant``,
        ``self.num_singletons``, ``self.num_nmo_tons`` or added as a site
        (restricted to ``self.keep_index``). ``self.report()`` is called at the end.

        Parameters
        ----------
        show_progress : bool
            When true, a tqdm progress bar is displayed. Previously this
            argument was ignored and the bar was always shown.
        max_sites : int, optional
            The loop stops after the variant whose index equals ``max_sites``.
            NOTE(review): this visits ``max_sites + 1`` variants — confirm
            whether that is intended.
        """
        bgen = bgen_reader.read_bgen(self.data_file, verbose=False)
        num_alleles = np.array(bgen["variants"]["nalleles"])
        position = np.array(bgen["variants"]["pos"])
        rsid = np.array(bgen["variants"]["rsid"])
        allele_id = np.array(bgen["variants"]["allele_ids"])
        # Only the arrays above are needed from here on.
        del bgen

        bg = simplebgen.BgenReader(self.data_file)
        N = 2 * bg.num_samples
        # Honour show_progress, as process_metadata does.
        for j in tqdm.tqdm(range(bg.num_variants), disable=not show_progress):
            ancestral_state = self.get_ancestral_state(position[j])
            if ancestral_state is not None:
                alleles = allele_id[j].split(",")
                if num_alleles[j] != 2 or ancestral_state not in alleles:
                    self.num_non_biallelic += 1
                elif any(len(allele) != 1 for allele in alleles):
                    self.num_indels += 1
                else:
                    # The probabilities for each site is a (num_diploids, 4) array,
                    # in the form (n0_a0, n0_a1, n1_a0, n1_a1). These are always zero
                    # or one for the different alleles. We first flatten this array so
                    # that it's (N, 2) and then generate the genotypes based on that.
                    P = bg.get_probabilities(j).astype(np.int8).reshape((N, 2))
                    genotypes = np.zeros(N, dtype=np.int8)
                    # REF is the first allele listed in the file in both cases.
                    ref = alleles[0]
                    if ancestral_state == alleles[0]:
                        genotypes[P[:, 1] == 1] = 1
                    else:
                        genotypes[P[:, 0] == 1] = 1
                        # Put the ancestral allele first.
                        alleles = alleles[::-1]

                    freq = np.sum(genotypes)
                    if freq == self.num_samples or freq == 0:
                        self.num_invariant += 1
                    elif freq == 1:
                        self.num_singletons += 1
                    elif freq == self.num_samples - 1:
                        self.num_nmo_tons += 1
                    else:
                        metadata = {"ID": rsid[j], "REF": ref}
                        self.samples.add_site(
                            position=float(position[j]),
                            genotypes=genotypes[self.keep_index],
                            alleles=alleles,
                            metadata=metadata)
            # Checked for every variant, including those skipped above.
            if j == max_sites:
                break
        self.report()
Esempio n. 25
0
def test_bgen_reader_variants_info():
    """Check variants, samples and a few probabilities of ``example.32bits.bgen``."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)
    variants = bgen["variants"]
    samples = bgen["samples"]
    assert "genotype" in bgen

    variants = variants.compute()
    columns = ("chrom", "id", "nalleles", "allele_ids", "pos", "rsid")

    def check_variant(index, values):
        # Compare one row of the variants table, column by column.
        for column, value in zip(columns, values):
            assert_equal(variants.loc[index, column], value)

    check_variant(0, ("01", "SNPID_2", 2, "A,G", 2000, "RSID_2"))
    check_variant(7, ("01", "SNPID_9", 2, "A,G", 9000, "RSID_9"))
    check_variant(variants.shape[0] - 1,
                  ("01", "SNPID_200", 2, "A,G", 100001, "RSID_200"))

    assert_equal(samples.loc[0], "sample_001")
    assert_equal(samples.loc[7], "sample_008")
    last_sample = samples.shape[0] - 1
    assert_equal(samples.loc[last_sample], "sample_500")

    genotype = bgen["genotype"]

    # Sample 0 of the first variant holds only NaNs.
    probs = genotype[0].compute()["probs"]
    assert all(isnan(probs[0, :]))

    # Sample 1 of the first variant.
    probs = genotype[0].compute()["probs"]
    assert_allclose(
        probs[1, :],
        [0.027802362811705648, 0.00863673794284387, 0.9635608992454505],
    )

    # Sample 2 of the second variant.
    expected = [
        0.97970582847010945215516,
        0.01947019668749305418287,
        0.00082397484239749366197,
    ]
    probs = genotype[1].compute()["probs"]
    assert_allclose(probs[2, :], expected)
def check():
    """Cross-check ``simplebgen`` against ``bgen_reader`` for the file in ``sys.argv[1]``.

    Asserts that both readers agree on the number of variants and samples and
    that, for every variant, ``np.testing.assert_equal`` accepts the two
    readers' outputs. Prints ``all good`` on success.
    """
    filename = sys.argv[1]

    reference = bgen_reader.read_bgen(filename, verbose=False)
    variant_total = len(reference["variants"])
    sample_total = len(reference["samples"])

    reader = simplebgen.BgenReader(filename)
    assert reader.num_variants == variant_total
    assert reader.num_samples == sample_total

    for index in range(variant_total):
        np.testing.assert_equal(
            reader.get_probabilities(index),
            np.array(reference["genotype"][index].compute()),
        )

    print("all good")
Esempio n. 27
0
def test_compute_dosage():
    """``compute_dosage`` reproduces the first five known dosages of variant 2."""
    with example_files("example.32bits.bgen") as filepath:
        bgen = read_bgen(filepath, verbose=False)
        expectation = allele_expectation(bgen, 2)
        dosage = compute_dosage(expectation)
        expected_head = [
            0.015502935046214363,
            0.9938354277968955,
            1.9793395833064196,
            0.9956054727070978,
            1.978790270625332,
        ]
        assert_allclose(dosage[:5], expected_head)
Esempio n. 28
0
def test_bgen_reader_complex_sample_file():
    """Same checks as the complex-file test, with an explicit ``complex.sample`` file."""
    bgen_path = example_filepath("complex.23bits.bgen")
    sample_path = example_filepath("complex.sample")
    bgen = read_bgen(bgen_path, samples_filepath=sample_path, verbose=False)
    variants = bgen["variants"].compute()
    samples = bgen["samples"]
    assert "genotype" in bgen

    columns = ("chrom", "id", "nalleles", "allele_ids", "pos", "rsid")

    def check_variant(index, values):
        # Compare one row of the variants table, column by column.
        for column, value in zip(columns, values):
            assert_equal(variants.loc[index, column], value)

    check_variant(0, ("01", "", 2, "A,G", 1, "V1"))
    check_variant(7, ("01", "", 7, "A,G,GT,GTT,GTTT,GTTTT,GTTTTT", 8, "M8"))
    check_variant(variants.shape[0] - 1, ("01", "", 2, "A,G", 10, "M10"))

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[3], "sample_3")

    genotype = bgen["genotype"]
    # Everything is computed first and asserted afterwards.
    ploidy = genotype[2].compute()["ploidy"]
    missing = genotype[2].compute()["missing"]
    phased = [genotype[i].compute()["phased"] for i in range(len(variants))]

    assert_allclose(ploidy, [1, 2, 2, 2])
    assert_allclose(missing, [0, 0, 0, 0])
    assert_allclose(phased, [0, 1, 1, 0, 1, 1, 1, 1, 0, 0])
Esempio n. 29
0
def test_bgen_reader_file_notfound():
    """A path built for ``example.33bits.bgen`` makes ``read_bgen`` raise ``FileNotFoundError``."""
    here = os.path.dirname(os.path.abspath(__file__)).encode()
    missing_path = os.path.join(here, b"example.33bits.bgen")
    with pytest.raises(FileNotFoundError):
        read_bgen(missing_path, verbose=False)
Esempio n. 30
0
from bgen_reader import read_bgen

if __name__ == '__main__':
    # Demo: read "example.bgen" and print a preview of each part of the result.
    data = read_bgen("example.bgen", verbose=False)
    variants = data['variants']
    samples = data['samples']
    genotype = data['genotype']

    print(variants.head())
    print(samples.head())
    print(len(genotype))
    # Genotype entries are lazy; compute() evaluates the first one.
    print(genotype[0].compute())