Exemple #1
0
def set_snp_ids(memory_location, snps_to_id, gen_path, write_dir, file_name):
    """
    Look up the positional index of each named snp inside a bgen file and write those indexes out to a csv.

    The snp names are read from element ``[0]`` of the CsvObject loaded from snps_to_id (presumably the first column of
    that csv - confirm against CsvObject). Any name whose lookup raises a KeyError is silently left out of the output.
    Only the named-snp csv route is handled in this body; no random / int based selection takes place here.

    :param memory_location: Path that is validated and handed to custom_meta_path before the bgen file is opened
    :type memory_location: Path | str

    :param snps_to_id: Path to the csv holding the snp names to look up
    :type snps_to_id: Path | str

    :param gen_path: Path to the bgen genetic file
    :type gen_path: Path | str

    :param write_dir: Directory passed to write_csv for the snp index csv
    :type write_dir: Path | str

    :param file_name: Name passed to write_csv for the snp index csv
    :type file_name: str

    :return: Nothing, the indexes are written to a csv and a summary line is printed
    :rtype: None
    """

    # The meta path must be registered before the Bgen reader is created, so keep this call first
    custom_meta_path(validate_path(memory_location))
    bgen_file = validate_path(gen_path).absolute()
    gen = Bgen(str(bgen_file))

    # Each sid is a "variant_id,rs_id" string; map rs_id -> variant_id so the full sid can be rebuilt from an rs_id.
    # When an rs_id occurs more than once the last variant_id seen is the one kept.
    variant_of_rsid = {}
    for sid in gen.sid:
        parts = sid.split(",")
        variant_of_rsid[parts[1]] = parts[0]

    # Names of the snps the caller wants indexed
    snp_csv = CsvObject(validate_path(snps_to_id), set_columns=True)
    requested = snp_csv[0]

    # One output row per snp that could be located; each row is the list form of the sid_to_index result
    index_rows = []
    for rs_id in requested:
        try:
            full_sid = f"{variant_of_rsid[rs_id]},{rs_id}"
            located = gen.sid_to_index([full_sid])
        except KeyError:
            # Snp is not in this genetic file, so it is skipped without any message
            continue
        index_rows.append(located.tolist())

    # Persist the rows under a single "Snp" header, then report how many were found
    write_csv(write_dir, f"{file_name}", ["Snp"], index_rows)
    summary = f"Constructed snp id list of length {len(index_rows)} for {gen_path} at {terminal_time()}"
    print(summary)
Exemple #2
0
    file = ResOut(validate_path(output_directory), "ResidualsOVERIDE")

    # Construct IO stream to write out to and write the header of Snp + [IID1, IID2, ... IID(N)].
    # Bgen files store [variant id, rs_id], we just want the rs_id hence the [1]; see https://bit.ly/2J0C1kC
    # Loader will utilise load_variants of the same file as referenced in load gen line 158 which makes it bgen, bed
    # and non .mmm file compliant.
    file.write_from_list(["Snp"] + [iid for fid, iid in gen.iid])

    # For each snp in the current chromosome file
    for i in range(gen.sid_count):

        if i % 100 == 0:
            print(f"{i}/{gen.sid_count}: {terminal_time()}")

        # Instance the memory for all individuals (:) for snp i
        current_snp = gen[:, gen.sid_to_index([gen.sid[i]])]

        # Transform bgen dosage of [0, 1, 0] -> 0, 1, or 2 respectively.
        dosage = sum(
            np.array([
                snp * i
                for i, snp in enumerate(current_snp.read(dtype=np.int8).val.T)
            ],
                     dtype=np.int8))

        # Set the dosage data into an array with covariant's
        df = pd.DataFrame([dosage[0]]).T
        df.columns = ["Dosage"]
        df["Gender"] = gender
        df["District"] = district