def test_write_nan_properties(shared_datadir, tmp_path):
    with open_bed(shared_datadir / "small.bed") as bed:
        val = bed.read()
        properties = bed.properties

        chrom = bed.chromosome.copy()
        chrom[bed.chromosome == "Y"] = 0
        chrom = np.array(chrom, dtype="float")
        chrom2 = chrom.copy()
        chrom2[chrom2 == 0] = np.nan

        cm_p = bed.cm_position.copy()
        cm_p[cm_p < 3000] = 0
        cm_p2 = cm_p.copy()
        cm_p2[cm_p == 0] = np.nan

        properties["chromosome"] = chrom2
        properties["cm_position"] = cm_p2

        output_file = tmp_path / "nan.bed"
        to_bed(output_file, val, properties=properties)
        with open_bed(output_file) as bed2:
            assert np.array_equal(bed2.chromosome, ["1.0", "1.0", "5.0", "0"])
            assert np.array_equal(bed2.cm_position, cm_p)

    with open_bed(
        shared_datadir / "small.bed",
        properties={"chromosome": chrom2, "cm_position": cm_p2},
    ) as bed3:
        assert np.array_equal(bed3.chromosome, ["1.0", "1.0", "5.0", "0"])
        assert np.array_equal(bed3.cm_position, cm_p)
def test_writes_small(tmp_path):
    output_file = tmp_path / "small.bed"
    val = [[1.0, 0, np.nan, 0], [2, 0, np.nan, 2], [0, 1, 2, 0]]
    properties = {
        "fid": ["fid1", "fid1", "fid2"],
        "iid": ["iid1", "iid2", "iid3"],
        "father": ["iid23", "iid23", "iid22"],
        "mother": ["iid34", "iid34", "iid33"],
        "sex": [1, 2, 0],
        "pheno": ["red", "red", "blue"],
        "chromosome": ["1", "1", "5", "Y"],
        "sid": ["sid1", "sid2", "sid3", "sid4"],
        "cm_position": [100.4, 2000.5, 4000.7, 7000.9],
        "bp_position": [1, 100, 1000, 1004],
        "allele_1": ["A", "T", "A", "T"],
        "allele_2": ["A", "C", "C", "G"],
    }
    to_bed(output_file, val, properties=properties)

    with open_bed(output_file) as bed:
        assert np.allclose(bed.read(), val, equal_nan=True)
        for key, value in bed.properties.items():
            assert np.array_equal(value, properties[key]) or np.allclose(
                value, properties[key]
            )
def test_bed_int8(tmp_path, shared_datadir):
    with open_bed(shared_datadir / "some_missing.bed") as bed:
        for force_python_only in [False, True]:
            for order in ["F", "C"]:
                val = bed.read(
                    dtype="int8", force_python_only=force_python_only, order=order
                )
                assert val.dtype == np.int8
                assert (val.flags["C_CONTIGUOUS"] and order == "C") or (
                    val.flags["F_CONTIGUOUS"] and order == "F"
                )
                ref_val = reference_val(shared_datadir)
                # int8 reads encode missing values as -127
                ref_val[ref_val != ref_val] = -127
                ref_val = ref_val.astype("int8")
                assert np.array_equal(ref_val, val)

                output = str(tmp_path / "int8.bed")
                for count_A1 in [False, True]:
                    to_bed(
                        output,
                        ref_val,
                        count_A1=count_A1,
                        force_python_only=force_python_only,
                    )
                    with open_bed(output, count_A1=count_A1) as bed2:
                        assert np.array_equal(
                            bed2.read(dtype="int8", force_python_only=force_python_only),
                            ref_val,
                        )
def test_write12(tmp_path):
    # ===================================
    # Starting main function
    # ===================================
    logging.info("starting 'test_writes'")
    np.random.seed(0)
    output_template = str(tmp_path / "writes.{0}.bed")
    i = 0
    for row_count in [0, 5, 2, 1]:
        for col_count in [0, 4, 2, 1]:
            val = np.random.randint(0, 4, size=(row_count, col_count)) * 1.0
            val[val == 3] = np.nan

            row0 = ["0", "1", "2", "3", "4"][:row_count]
            row1 = ["0", "1", "2", "3", "4"][:row_count]
            col = ["s0", "s1", "s2", "s3", "s4"][:col_count]
            for is_none in [True, False]:
                properties = {"fid": row0, "iid": row1, "sid": col}
                if is_none:
                    col_prop012 = [x for x in range(5)][:col_count]
                    properties["chromosome"] = col_prop012
                    properties["bp_position"] = col_prop012
                    properties["cm_position"] = col_prop012
                else:
                    col_prop012 = None

                filename = output_template.format(i)
                logging.info(filename)
                i += 1
                to_bed(filename, val, properties=properties)

                for subsetter in [None, np.s_[::2, ::3]]:
                    with open_bed(filename) as bed:
                        val2 = bed.read(index=subsetter, order="C", dtype="float32")
                        if subsetter is None:
                            expected = val
                        else:
                            expected = val[subsetter[0], :][:, subsetter[1]]
                        assert np.allclose(val2, expected, equal_nan=True)
                        assert np.array_equal(bed.fid, np.array(row0, dtype="str"))
                        assert np.array_equal(bed.iid, np.array(row1, dtype="str"))
                        assert np.array_equal(bed.sid, np.array(col, dtype="str"))
                        if col_prop012 is not None:
                            assert np.array_equal(
                                bed.chromosome, np.array(col_prop012, dtype="str")
                            )
                            assert np.array_equal(
                                bed.bp_position, np.array(col_prop012)
                            )
                            assert np.array_equal(
                                bed.cm_position, np.array(col_prop012)
                            )

                try:
                    os.remove(filename)
                except Exception:
                    pass
    logging.info("done with 'test_writes'")
def test_write1_bed_f64cpp(tmp_path, shared_datadir):
    with open_bed(shared_datadir / "some_missing.bed") as bed:
        for iid_index in [0, 1, 5]:
            for force_python_only in [False, True]:
                val = bed.read(
                    np.s_[0:iid_index, :],
                    order="F",
                    dtype=np.float64,
                    force_python_only=force_python_only,
                )
                assert val.shape == (iid_index, 100)
                output = str(tmp_path / f"toydata.F64cpp.{iid_index}")
                to_bed(output, val, count_A1=False)
                val2 = open_bed(output, count_A1=False).read(dtype="float64")
                assert np.allclose(val, val2, equal_nan=True)
def test_nones(shared_datadir, tmp_path):
    properties = {
        "father": None,
        "mother": None,
        "sex": None,
        "pheno": None,
        "allele_1": None,
        "allele_2": None,
    }
    with open_bed(shared_datadir / "small.bed", properties=properties) as bed:
        assert np.array_equal(bed.iid, ["iid1", "iid2", "iid3"])
        assert bed.father is None

    val = [[1.0, 0, np.nan, 0], [2, 0, np.nan, 2], [0, 1, 2, 0]]
    out_file = tmp_path / "testnones.bed"
    to_bed(out_file, val, properties=properties)
def test_coverage3(shared_datadir, tmp_path):
    with open_bed(
        shared_datadir / "small.bed", properties={"sex": [1.0, np.nan, 1.0, 2.0]}
    ) as bed:
        assert np.array_equal(bed.sex, np.array([1, 0, 1, 2]))
    with open_bed(
        shared_datadir / "small.bed",
        properties={"cm_position": [1000.0, np.nan, 2000.0, 3000.0]},
    ) as bed:
        assert np.array_equal(bed.cm_position, np.array([1000, 0, 2000, 3000]))

    val_list = [1.0, 0, np.nan, 0]
    output_file = tmp_path / "1d.bed"
    with pytest.raises(ValueError):
        to_bed(output_file, val_list)  # 1-D values are rejected
    to_bed(output_file, np.array([val_list], dtype=np.float16))
    with open_bed(output_file) as bed:
        assert np.allclose(bed.read(), [val_list], equal_nan=True)
def test_write1_x_x_cpp(tmp_path, shared_datadir):
    for count_A1 in [False, True]:
        with open_bed(shared_datadir / "some_missing.bed", count_A1=count_A1) as bed:
            for order in ["C", "F"]:
                for dtype in [np.float32, np.float64]:
                    val = bed.read(order=order, dtype=dtype)
                    properties = bed.properties
                    val[-1, 0] = float("NAN")
                    output = str(
                        tmp_path
                        / "toydata.{0}{1}.cpp".format(
                            order, "32" if dtype == np.float32 else "64"
                        )
                    )
                    to_bed(output, val, properties=properties, count_A1=count_A1)
                    val2 = open_bed(output, count_A1=count_A1).read(dtype=dtype)
                    assert np.allclose(val, val2, equal_nan=True)
def test_coverage2(shared_datadir, tmp_path):
    with open_bed(
        shared_datadir / "plink_sim_10s_100v_10pmiss.bed", properties={"iid": None}
    ) as bed:
        assert bed.iid is None
    with pytest.raises(ValueError):
        open_bed(
            shared_datadir / "plink_sim_10s_100v_10pmiss.bed",
            properties={"iid": [1, 2, 3], "mother": [1, 2]},
        )

    val = np.zeros((3, 5))[::2]
    assert not val.flags["C_CONTIGUOUS"] and not val.flags["F_CONTIGUOUS"]
    with pytest.raises(ValueError):
        to_bed(tmp_path / "ignore", val)

    val = np.zeros((3, 5), dtype=np.str_)
    with pytest.raises(ValueError):
        to_bed(tmp_path / "ignore", val)
def test_zero_files(tmp_path):
    for force_python_only in [False, True]:
        for iid_count in [3, 0]:
            for sid_count in [0, 5]:
                for dtype in [np.int8, np.float32, np.float64]:
                    val = np.zeros((iid_count, sid_count), dtype=dtype)
                    if iid_count * sid_count > 0:
                        val[0, 0] = 2
                        val[0, 1] = -127 if np.dtype(dtype) == np.int8 else np.nan
                    filename = str(tmp_path / "zero_files.bed")

                    # Write
                    to_bed(filename, val, force_python_only=force_python_only)

                    # Read
                    with open_bed(filename) as bed2:
                        val2 = bed2.read(dtype=dtype)
                        assert np.allclose(val, val2, equal_nan=True)
                        properties2 = bed2.properties
                        for prop in properties2.values():
                            assert len(prop) in {iid_count, sid_count}

                    # Change properties and write again
                    if iid_count > 0:
                        properties2["iid"][0] = "iidx"
                    if sid_count > 0:
                        properties2["sid"][0] = "sidx"
                    to_bed(
                        filename,
                        val2,
                        properties=properties2,
                        force_python_only=force_python_only,
                    )

                    # Read again
                    with open_bed(filename) as bed3:
                        val3 = bed3.read(dtype=dtype)
                        assert np.allclose(val, val3, equal_nan=True)
                        properties3 = bed3.properties
                        for key2, value_list2 in properties2.items():
                            value_list3 = properties3[key2]
                            assert np.array_equal(value_list2, value_list3)
def test_fam_bim_filepath(shared_datadir, tmp_path):
    with open_bed(shared_datadir / "small.bed") as bed:
        val = bed.read()
        properties = bed.properties

    output_file = tmp_path / "small.deb"
    fam_file = tmp_path / "small.maf"
    bim_file = tmp_path / "small.mib"
    to_bed(
        output_file,
        val,
        properties=properties,
        fam_filepath=fam_file,
        bim_filepath=bim_file,
    )
    assert output_file.exists() and fam_file.exists() and bim_file.exists()

    with open_bed(output_file, fam_filepath=fam_file, bim_filepath=bim_file) as deb:
        val2 = deb.read()
        properties2 = deb.properties
        assert np.allclose(val, val2, equal_nan=True)
        for key in properties:
            assert np.array_equal(properties[key], properties2[key])
def test_write1(tmp_path, shared_datadir):
    in_file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed"
    out_file = tmp_path / "out.bed"
    with open_bed(in_file) as bed:
        val0 = bed.read()
        properties0 = {
            "fid": bed.fid,
            "iid": bed.iid,
            "sid": bed.sid,
            "chromosome": bed.chromosome,
            "cm_position": bed.cm_position,
            "bp_position": bed.bp_position,
        }
        to_bed(out_file, val0, properties=properties0)

        with open_bed(out_file) as bed1:
            assert np.allclose(val0, bed1.read(), equal_nan=True)
            assert np.array_equal(bed1.fid, properties0["fid"])
            assert np.array_equal(bed1.iid, properties0["iid"])
            assert np.array_equal(bed1.sid, properties0["sid"])
            assert np.issubdtype(bed1.sid.dtype, np.str_)
            assert np.array_equal(bed1.chromosome, properties0["chromosome"])
            assert np.allclose(bed1.cm_position, properties0["cm_position"])
            assert np.allclose(bed1.bp_position, properties0["bp_position"])

    val_float = val0.astype("float")
    val_float[0, 0] = 0.5  # only 0, 1, 2, and NaN are valid float genotype values
    for force_python_only in [False, True]:
        with pytest.raises(ValueError):
            to_bed(
                out_file,
                val_float,
                properties=properties0,
                force_python_only=force_python_only,
            )

    val_int8 = val0.astype("int8")
    val_int8[0, 0] = -1  # for int8, only 0, 1, 2, and -127 (missing) are valid
    for force_python_only in [False, True]:
        with pytest.raises(ValueError):
            to_bed(
                out_file,
                val_int8,
                properties=properties0,
                force_python_only=force_python_only,
            )
def write_bed_file(
    simulated_individuals,
    bim_SNP_names,
    output_name,
    bim_SNP_complete_pos,
    bim_SNP_nucleotides,
    population_ID,
    test_functionality,
):
    """
    Purpose
    -------
    To write the simulated data into realistic (bed, bim, fam) filesets.
    Does not include phenotypes at this point.

    Parameters
    ----------
    simulated_individuals: an Nx(B+1)xS numpy array containing N sets of B whole
        chromosomes, each of which has S SNPs. Each row in the ith (B+1)xS subset
        will contribute one genomic segment. Those (B+1) genomic segments will be
        concatenated to comprise the ith simulated individual.

    bim_SNP_names: a list of the SNPs' rsIDs from the input bed file.

    output_name: name of the output bed file, which annotates the chromosome that
        it belongs to.

    bim_SNP_complete_pos: an Sx3 numpy array. Columns comprise the first, third,
        and fourth columns from the input bim file.

    bim_SNP_nucleotides: an Sx2 numpy array. Columns comprise the fifth and sixth
        columns from the input bim file (i.e. major and minor alleles). CAUTION:
        minor alleles with frequencies near 50% may become the major allele after
        the simulation because the simulated allele frequency always deviates from
        the real allele frequency by a small amount. This makes plink flip the sign
        of r values for simulated SNP pairs relative to real SNP pairs if plink's
        --keep-allele-order flag is not used when computing the r values with plink.

    population_ID: an input argument that is concatenated with each sample's row
        index to comprise columns 1 and 2 of the output fam file. If no input
        argument is selected, then it includes the population ID from the
        1000 Genomes input plink fileset. If the input plink files are custom,
        then it includes an empty string as the population_ID.

    Returns
    -------
    It returns nothing. It only writes the simulated data into plink files.
    """
    simulated_IDs = np.array(
        [
            population_ID + "_" + str(i)
            for i in range(1, len(simulated_individuals) + 1)
        ]
    )
    metadata = {
        "fid": simulated_IDs,
        "iid": simulated_IDs,
        "sex": np.array([2] * len(simulated_IDs)),
        "pheno": np.array([-9] * len(simulated_IDs)),
        "chromosome": bim_SNP_complete_pos.T[0],
        "sid": bim_SNP_names,
        "cm_position": bim_SNP_complete_pos.T[1],
        "bp_position": bim_SNP_complete_pos.T[2],
        "allele_1": bim_SNP_nucleotides.T[0],
        "allele_2": bim_SNP_nucleotides.T[1],
    }
    to_bed(output_name, simulated_individuals, properties=metadata, count_A1=True)

    if test_functionality == "test_units":
        bed_reader = open_bed(output_name, count_A1=True, num_threads=1)
        output_bed_file = bed_reader.read(dtype="int8")
        output_bim_file = (
            pd.read_csv(
                output_name[:-4] + ".bim", delimiter="\t", header=None, dtype=str
            )
            .to_numpy()
            .astype("str")
        )
        output_fam_file = (
            pd.read_csv(
                output_name[:-4] + ".fam", delimiter=" ", header=None, dtype=str
            )
            .to_numpy()
            .astype("str")
        )
        unit_tester(output_bed_file, "correct_write_bed_file_output.bed", None)
        unit_tester(output_bim_file, "correct_write_bed_file_output.bim", None)
        unit_tester(output_fam_file, "correct_write_bed_file_output.fam", None)
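# A minimal usage sketch (not part of the original module). It assumes the
# genotype matrix passed to write_bed_file has already been flattened to a
# 2-D (samples x SNPs) array, since to_bed receives it unchanged, and that
# test_functionality is anything other than "test_units" so the unit_tester
# path is skipped. The file name, population code, and array values below
# are hypothetical.
import numpy as np


def _example_write_bed_file_usage():
    simulated_individuals = np.array(
        [[0, 1, 2], [2, 1, 0]], dtype="float64"
    )  # 2 samples x 3 SNPs, dosage values in {0, 1, 2}
    bim_SNP_names = ["rs1", "rs2", "rs3"]
    # Columns: chromosome, centimorgan position, base-pair position.
    bim_SNP_complete_pos = np.array([[1, 0.0, 100], [1, 0.0, 200], [1, 0.0, 300]])
    # Columns: allele 1, allele 2.
    bim_SNP_nucleotides = np.array([["A", "G"], ["C", "T"], ["G", "T"]])
    write_bed_file(
        simulated_individuals,
        bim_SNP_names,
        "example_output.bed",
        bim_SNP_complete_pos,
        bim_SNP_nucleotides,
        population_ID="ACB",
        test_functionality="",
    )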
def write(
    filename,
    snpdata,
    count_A1=False,
    force_python_only=False,
    _require_float32_64=True,
    num_threads=None,
    reverse_chrom_map={},
):
    """Writes a :class:`SnpData` to Bed format and returns the :class:`.Bed`.

    :param filename: the name of the file to create
    :type filename: string
    :param snpdata: The in-memory data that should be written to disk.
    :type snpdata: :class:`SnpData`
    :param count_A1: Tells if it should count the number of A1 alleles (the PLINK standard) or the number of A2 alleles.
        False is the current default, but in the future the default will change to True.
    :type count_A1: bool
    :param force_python_only: Defaults to False. Tells the function to use Python code rather than faster Rust code. (Might be useful for debugging.)
    :type force_python_only: bool
    :param _require_float32_64: Defaults to True. Requires that snpdata's dtype is float32 or float64. (False is useful for writing int8 data.)
    :type _require_float32_64: bool
    :param num_threads: Maximum number of threads to use. Currently ignored and the number of threads is 1.
    :type num_threads: int
    :param reverse_chrom_map: Dictionary from chromosome number to chromosome string to write in the \\*.bim file. Defaults to an empty dictionary.
    :type reverse_chrom_map: dictionary

    :rtype: :class:`.Bed`

    Any :attr:`pos` values of NaN will be written as 0, the PLINK standard for missing chromosome and position values.

    >>> from pysnptools.snpreader import Pheno, Bed
    >>> import pysnptools.util as pstutil
    >>> from pysnptools.util import example_file # Download and return local file name
    >>> bed_fn = example_file("pysnptools/examples/toydata.5chrom.bed")
    >>> snpdata = Bed(bed_fn)[:,::2].read() # Read every-other SNP
    >>> pstutil.create_directory_if_necessary("tempdir/everyother.bed")
    >>> Bed.write("tempdir/everyother.bed",snpdata,count_A1=False)   # Write data in Bed format
    Bed('tempdir/everyother.bed',count_A1=False)
    >>> # Can write from an int8 array, too.
    >>> snpdata_int = SnpData(val=np.int_(snpdata.val).astype('int8'),iid=snpdata.iid,sid=snpdata.sid,pos=snpdata.pos,_require_float32_64=False)
    >>> snpdata_int.val.dtype
    dtype('int8')
    >>> Bed.write("tempdir/everyother.bed",snpdata_int,count_A1=False,_require_float32_64=False)
    Bed('tempdir/everyother.bed',count_A1=False)
    """
    if isinstance(filename, SnpData) and isinstance(snpdata, str):
        # For backwards compatibility, reverse inputs if necessary
        warnings.warn(
            "write statement should have filename before data to write",
            DeprecationWarning,
        )
        filename, snpdata = snpdata, filename

    if count_A1 is None:
        warnings.warn(
            "'count_A1' was not set. For now it will default to 'False', but in the future it will default to 'True'",
            FutureWarning,
        )
        count_A1 = False

    filename = SnpReader._name_of_other_file(
        filename, remove_suffix="bed", add_suffix="bed"
    )

    # Map numeric chromosome codes back to their string labels, if requested.
    chromosome = snpdata.pos[:, 0]
    intersection = reverse_chrom_map.keys() & chromosome
    if len(intersection) > 0:
        chromosome = chromosome.astype("object")
        for key in intersection:
            chromosome[chromosome == key] = reverse_chrom_map[key]

    to_bed(
        filename,
        val=snpdata.val,
        properties={
            "fid": snpdata.iid[:, 0],
            "iid": snpdata.iid[:, 1],
            "sid": snpdata.sid,
            "chromosome": chromosome,
            "cm_position": snpdata.pos[:, 1],
            "bp_position": snpdata.pos[:, 2],
        },
        count_A1=count_A1,
        force_python_only=force_python_only,
        num_threads=num_threads,
    )
    return Bed(filename, count_A1=count_A1)
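# A minimal sketch (not from the pysnptools documentation) of how the
# reverse_chrom_map parameter defined above can restore string chromosome
# labels when writing. It assumes X and Y were encoded numerically as 23 and
# 24 in snpdata.pos; the output path and SnpData values are made up for
# illustration.
import numpy as np
import pysnptools.util as pstutil
from pysnptools.snpreader import Bed, SnpData


def _example_reverse_chrom_map():
    snpdata = SnpData(
        iid=[["fam0", "iid0"], ["fam0", "iid1"]],
        sid=["snp0", "snp1", "snp2"],
        val=np.array([[0.0, 1.0, 2.0], [2.0, 1.0, 0.0]]),
        pos=np.array([[1, 0, 100], [23, 0, 200], [24, 0, 300]]),
    )
    pstutil.create_directory_if_necessary("tempdir/chrom_mapped.bed")
    # Chromosomes 23 and 24 are written to the .bim file as "X" and "Y".
    Bed.write(
        "tempdir/chrom_mapped.bed",
        snpdata,
        count_A1=False,
        reverse_chrom_map={23: "X", 24: "Y"},
    )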