def parse_smn_file(lines):
    """Parse a SMNCopyNumberCaller TSV file.

    The first line is used as the tab-separated header. Every following
    non-blank line is treated as one sample and is translated into a dict
    with fixed, lower-case keys.

    Args:
        lines(iterable(str)): the lines of the TSV file, header line first

    Returns:
        individuals(list(dict)): one dict per sample with the keys
            sample_id, is_sma, is_sma_carrier, smn1_cn, smn2_cn,
            smn2delta78_cn and smn_27134_cn

    Raises:
        KeyError: if a data line does not provide one of the expected columns
    """
    # (output key, source column) pairs, listed in the order they are read.
    bool_columns = (
        ("is_sma", "isSMA"),
        ("is_sma_carrier", "isCarrier"),
    )
    number_columns = (
        ("smn1_cn", "SMN1_CN"),
        ("smn2_cn", "SMN2_CN"),
        ("smn2delta78_cn", "SMN2delta7-8_CN"),
        ("smn_27134_cn", "g.27134T>G_CN"),
    )

    individuals = []
    header = []
    for i, line in enumerate(lines):
        line = line.rstrip()
        if i == 0:
            # Header line
            header = line.split("\t")
            continue
        if not line:
            # A blank line (e.g. at the end of the file) carries no sample;
            # parsing it would only fail on the first missing column.
            continue

        ind_info = dict(zip(header, line.split("\t")))

        smn_ind_info = {"sample_id": ind_info["Sample"]}
        for key, column in bool_columns:
            smn_ind_info[key] = make_bool(ind_info[column])
        for key, column in number_columns:
            smn_ind_info[key] = convert_number(ind_info[column])

        individuals.append(smn_ind_info)

    return individuals
def test_make_bool_YES():
    """The upper-case string "YES" should be converted to True."""
    ## GIVEN the affirmative string "YES"
    affirmative = "YES"

    ## WHEN converting it to a boolean
    outcome = make_bool(affirmative)

    ## THEN assert the outcome is exactly True
    assert outcome is True
def test_make_bool_nonsense():
    """A string with no boolean meaning should be converted to False."""
    ## GIVEN a string that does not represent a boolean
    gibberish = "nonsense asdlkfjalk"

    ## WHEN converting it to a boolean
    outcome = make_bool(gibberish)

    ## THEN assert the outcome is exactly False
    assert outcome is False
def test_make_bool_empty():
    """An empty string should be converted to False."""
    ## GIVEN an empty string
    blank = ""

    ## WHEN converting it to a boolean
    outcome = make_bool(blank)

    ## THEN assert the outcome is exactly False
    assert outcome is False
def test_make_bool_zero():
    """The string "0" should be converted to False."""
    ## GIVEN the string "0"
    zero_text = "0"

    ## WHEN converting it to a boolean
    outcome = make_bool(zero_text)

    ## THEN assert the outcome is exactly False
    assert outcome is False
def test_make_bool_one():
    """The string "1" should be converted to True."""
    ## GIVEN the string "1"
    one_text = "1"

    ## WHEN converting it to a boolean
    outcome = make_bool(one_text)

    ## THEN assert the outcome is exactly True
    assert outcome is True
def parse_peddy_sex_check(lines):
    """Parse a peddy .sex_check.csv file

    The first line is used as the comma-separated header (a leading "#" is
    removed). Every following non-blank line becomes one dict keyed by the
    header names; the columns listed below are converted in place, all other
    columns are kept as the strings read from the file.

    Args:
        lines(iterable(str)): the lines of the CSV file, header line first

    Returns:
        sex_check(list(dict)): one dict per data line

    Raises:
        KeyError: if a data line lacks one of the numeric columns
    """
    # Columns passed through convert_number, in the order they are converted:
    #   hom_alt_count: number of homozygous-alternate calls
    #   hom_ref_count: number of homozygous-reference calls
    #   het_count:     number of heterozygote calls
    #   het_ratio:     ratio of het_count / hom_alt_count.
    #                  Low for males, high for females
    number_columns = ("hom_alt_count", "hom_ref_count", "het_count", "het_ratio")

    sex_check = []
    header = []
    for i, line in enumerate(lines):
        line = line.rstrip()
        if i == 0:
            # Header line
            header = line.lstrip("#").split(",")
            continue
        if not line:
            # A blank line (e.g. at the end of the file) holds no individual;
            # parsing it would only fail on the first missing column.
            continue

        ind_info = dict(zip(header, line.split(",")))

        # boolean indicating whether there is a mismatch between X
        # genotypes and ped sex. Read with .get() so that a missing column
        # is handed to make_bool as None instead of raising.
        ind_info["error"] = make_bool(ind_info.get("error"))

        for column in number_columns:
            ind_info[column] = convert_number(ind_info[column])

        sex_check.append(ind_info)

    return sex_check
def parse_peddy_sex_check(lines):
    """Parse a peddy .sex_check.csv file

    The first line is used as the comma-separated header (a leading "#" is
    removed). Every following non-blank line becomes one dict keyed by the
    header names; the columns listed below are converted in place, all other
    columns are kept as the strings read from the file.

    Args:
        lines(iterable(str)): the lines of the CSV file, header line first

    Returns:
        sex_check(list(dict)): one dict per data line

    Raises:
        KeyError: if a data line lacks one of the numeric columns
    """
    # Columns passed through convert_number, in the order they are converted:
    #   hom_alt_count: number of homozygous-alternate calls
    #   hom_ref_count: number of homozygous-reference calls
    #   het_count:     number of heterozygote calls
    #   het_ratio:     ratio of het_count / hom_alt_count.
    #                  Low for males, high for females
    number_columns = ('hom_alt_count', 'hom_ref_count', 'het_count', 'het_ratio')

    sex_check = []
    header = []
    for i, line in enumerate(lines):
        line = line.rstrip()
        if i == 0:
            # Header line
            header = line.lstrip('#').split(',')
            continue
        if not line:
            # A blank line (e.g. at the end of the file) holds no individual;
            # parsing it would only fail on the first missing column.
            continue

        ind_info = dict(zip(header, line.split(',')))

        # boolean indicating whether there is a mismatch between X
        # genotypes and ped sex. Read with .get() so that a missing column
        # is handed to make_bool as None instead of raising.
        ind_info['error'] = make_bool(ind_info.get('error'))

        for column in number_columns:
            ind_info[column] = convert_number(ind_info[column])

        sex_check.append(ind_info)

    return sex_check
def parse_peddy_ped_check(lines):
    """Parse a peddy .ped_check.csv file

    The first line is used as the comma-separated header (a leading "#" is
    removed). Every following non-blank line describes one pair of samples
    and becomes one dict keyed by the header names; the columns listed below
    are converted in place, all other columns are kept as the strings read
    from the file.

    Args:
        lines(iterable(str)): the lines of the CSV file, header line first

    Returns:
        ped_check(list(dict)): one dict per sample pair

    Raises:
        KeyError: if a data line lacks one of the numeric columns
    """
    # Columns passed through convert_number, in the order they are converted:
    #   hets_a: the number of sites at which sample_a was heterozygous
    #   hets_b: the number of sites at which sample_b was heterozygous
    #   ibs0:   the number of sites at which the 2 samples shared no alleles
    #           (should approach 0 for parent-child pairs).
    #   ibs2:   the number of sites at which the 2 samples were both
    #           hom-ref, both het, or both hom-alt.
    #   n:      the number of sites that was used to predict the relatedness.
    #   rel:    the relatedness calculated from the genotypes.
    #   pedigree_relatedness: the relatedness reported in the ped file.
    #   rel_difference: difference between the preceding 2 columns.
    #   shared_hets: the number of sites at which both samples were hets.
    number_columns = (
        'hets_a',
        'hets_b',
        'ibs0',
        'ibs2',
        'n',
        'rel',
        'pedigree_relatedness',
        'rel_difference',
        'shared_hets',
    )
    # Columns passed through make_bool, in the order they are converted:
    #   pedigree_parents: this pair is a parent-child pair according to the
    #           ped file.
    #   predicted_parents: this pair is expected to be a parent-child pair
    #           according to the ibs0 (< 0.012) calculated from the genotypes.
    #   parent_error: the preceding 2 columns do not match
    #   sample_duplication_error: rel > 0.75 and ibs0 < 0.012
    bool_columns = (
        'pedigree_parents',
        'predicted_parents',
        'parent_error',
        'sample_duplication_error',
    )

    ped_check = []
    header = []
    for i, line in enumerate(lines):
        line = line.rstrip()
        if i == 0:
            # Header line
            header = line.lstrip('#').split(',')
            continue
        if not line:
            # A blank line (e.g. at the end of the file) holds no pair;
            # parsing it would only fail on the first missing column.
            continue

        pair_info = dict(zip(header, line.split(',')))

        for column in number_columns:
            pair_info[column] = convert_number(pair_info[column])

        # Read with .get() so that a missing column is handed to make_bool
        # as None instead of raising.
        for column in bool_columns:
            pair_info[column] = make_bool(pair_info.get(column))

        ped_check.append(pair_info)

    return ped_check
def parse_peddy_ped_check(lines):
    """Parse a peddy .ped_check.csv file

    The first line is used as the comma-separated header (a leading "#" is
    removed). Every following non-blank line describes one pair of samples
    and becomes one dict keyed by the header names; the columns listed below
    are converted in place, all other columns are kept as the strings read
    from the file.

    Args:
        lines(iterable(str)): the lines of the CSV file, header line first

    Returns:
        ped_check(list(dict)): one dict per sample pair

    Raises:
        KeyError: if a data line lacks one of the numeric columns
    """
    # Columns passed through convert_number, in the order they are converted:
    #   hets_a: the number of sites at which sample_a was heterozygous
    #   hets_b: the number of sites at which sample_b was heterozygous
    #   ibs0:   the number of sites at which the 2 samples shared no alleles
    #           (should approach 0 for parent-child pairs).
    #   ibs2:   the number of sites at which the 2 samples were both
    #           hom-ref, both het, or both hom-alt.
    #   n:      the number of sites that was used to predict the relatedness.
    #   rel:    the relatedness calculated from the genotypes.
    #   pedigree_relatedness: the relatedness reported in the ped file.
    #   rel_difference: difference between the preceding 2 columns.
    #   shared_hets: the number of sites at which both samples were hets.
    number_columns = (
        "hets_a",
        "hets_b",
        "ibs0",
        "ibs2",
        "n",
        "rel",
        "pedigree_relatedness",
        "rel_difference",
        "shared_hets",
    )
    # Columns passed through make_bool, in the order they are converted:
    #   pedigree_parents: this pair is a parent-child pair according to the
    #           ped file.
    #   predicted_parents: this pair is expected to be a parent-child pair
    #           according to the ibs0 (< 0.012) calculated from the genotypes.
    #   parent_error: the preceding 2 columns do not match
    #   sample_duplication_error: rel > 0.75 and ibs0 < 0.012
    bool_columns = (
        "pedigree_parents",
        "predicted_parents",
        "parent_error",
        "sample_duplication_error",
    )

    ped_check = []
    header = []
    for i, line in enumerate(lines):
        line = line.rstrip()
        if i == 0:
            # Header line
            header = line.lstrip("#").split(",")
            continue
        if not line:
            # A blank line (e.g. at the end of the file) holds no pair;
            # parsing it would only fail on the first missing column.
            continue

        pair_info = dict(zip(header, line.split(",")))

        for column in number_columns:
            pair_info[column] = convert_number(pair_info[column])

        # Read with .get() so that a missing column is handed to make_bool
        # as None instead of raising.
        for column in bool_columns:
            pair_info[column] = make_bool(pair_info.get(column))

        ped_check.append(pair_info)

    return ped_check