def test_load_families(self):
    """ test that load_families builds the expected list of Family objects
    """

    # build by hand the family that the ped file at self.path should give
    # (presumably self.tempfile backs self.path - set up outside this test)
    expected = Family("fam_ID")
    expected.add_child("proband", 'dad', 'mom', 'F', '2', "/path/to/proband_vcf.gz")
    expected.add_mother("mom", '0', '0', 'F', '1', "/path/to/mom_vcf.gz")
    expected.add_father("dad", '0', '0', 'M', '1', "/path/to/dad_vcf.gz")

    # a single family in the ped file should give a single-item list
    self.assertEqual(load_families(self.path), [expected])

    # append a second family to the ped file, this one with two children
    extra_lines = [
        "fam_ID2 proband2 dad2 mom2 F 2 /path/to/proband2_vcf.gz\n",
        "fam_ID2 dad2 0 0 M 1 /path/to/dad2_vcf.gz\n",
        "fam_ID2 mom2 0 0 F 1 /path/to/mom2_vcf.gz\n",
        "fam_ID2 sib dad2 mom2 F 2 /path/to/sib_vcf.gz\n",
    ]
    for ped_line in extra_lines:
        self.tempfile.write(ped_line)
    self.tempfile.flush()

    # build the matching Family object for the second family
    expected2 = Family("fam_ID2")
    expected2.add_child("proband2", 'dad2', 'mom2', 'F', '2', "/path/to/proband2_vcf.gz")
    expected2.add_child("sib", 'dad2', 'mom2', 'F', '2', "/path/to/sib_vcf.gz")
    expected2.add_mother("mom2", '0', '0', 'F', '1', "/path/to/mom2_vcf.gz")
    expected2.add_father("dad2", '0', '0', 'M', '1', "/path/to/dad2_vcf.gz")

    # both sides are sorted before comparing, so the order in which the
    # families are returned does not matter
    self.assertEqual(sorted(load_families(self.path)), sorted([expected, expected2]))
def test_load_families(self):
    """ test that load_families builds the expected dict of Family objects
    """

    # build by hand the family that the ped file at self.path should give
    # (presumably self.tempfile backs self.path - set up outside this test)
    expected = Family("fam_ID")
    expected.add_child("proband", "/path/to/proband_vcf.gz", "2", "F")
    expected.add_mother("mom", "/path/to/mom_vcf.gz", "1", "F")
    expected.add_father("dad", "/path/to/dad_vcf.gz", "1", "M")

    # a single family should come back keyed by its family ID
    loaded = load_families(self.path)
    self.assertEqual(loaded, {"fam_ID": expected})

    # append a second family to the ped file, this one with two children
    extra_lines = [
        "fam_ID2 proband2 dad2 mom2 F 2 /path/to/proband2_vcf.gz\n",
        "fam_ID2 dad2 0 0 M 1 /path/to/dad2_vcf.gz\n",
        "fam_ID2 mom2 0 0 F 1 /path/to/mom2_vcf.gz\n",
        "fam_ID2 sib dad2 mom2 F 2 /path/to/sib_vcf.gz\n",
    ]
    for ped_line in extra_lines:
        self.tempfile.write(ped_line)
    self.tempfile.flush()

    # build the matching Family object for the second family
    expected2 = Family("fam_ID2")
    expected2.add_child("proband2", "/path/to/proband2_vcf.gz", "2", "F")
    expected2.add_child("sib", "/path/to/sib_vcf.gz", "2", "F")
    expected2.add_mother("mom2", "/path/to/mom2_vcf.gz", "1", "F")
    expected2.add_father("dad2", "/path/to/dad2_vcf.gz", "1", "M")

    # compare as sets, so that the order of the families does not matter
    loaded = load_families(self.path)
    self.assertEqual(set(loaded.values()), {expected, expected2})
def split_pedigree_file(tempdir, ped_path, number_of_jobs, exclude_parents, use_singletons_with_parents):
    """ split the ped file into multiple smaller ped files

    The output files are written into tempdir as "1.ped", "2.ped", ... with
    every member of a family kept in the same file.

    Args:
        tempdir: path to the directory to write the smaller ped files into
        ped_path: path to pedigree file
        number_of_jobs: how many computational jobs to split the families
            over. This is a target rather than a guarantee, since families
            are never split across files.
        exclude_parents: true/false for whether to exclude parents from the
            run. Currently unused by this function.
        use_singletons_with_parents: if false, families for which
            is_singleton_without_parents() returns true are dropped before
            splitting.

    Returns:
        The number of files that the cohort has been split across (which
        will now be the number of jobs to run). This is 1 when no families
        remain, although no file is written in that case.

    Raises:
        ZeroDivisionError: if number_of_jobs is zero.
    """

    families = load_families(ped_path)

    if not use_singletons_with_parents:
        families = [x for x in families if not is_singleton_without_parents(x)]

    # figure out how many families to include per file, in order to make the
    # correct number of jobs. Each file must hold at least one family,
    # otherwise (with more jobs than families) the very first family would
    # bump the file counter, so that numbering started at "2.ped" and the
    # returned count was one higher than the number of files written.
    max_families = max(1.0, float(len(families)) / float(number_of_jobs))

    files_n = 1
    families_n = 0
    output_file = None
    try:
        for family in families:
            families_n += 1

            # the current file is full, so move on to the next one
            if families_n > max_families:
                files_n += 1
                families_n = 1

            if families_n == 1:
                # close the previous file before opening the next, rather
                # than leaving open handles to be cleaned up later
                if output_file is not None:
                    output_file.close()
                ped_name = os.path.join(tempdir, "{}.ped".format(files_n))
                output_file = open(ped_name, "w")

            for person in family:
                # family members can be None (e.g. an absent parent)
                if person is None:
                    continue

                line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    person.family_id, person.get_id(), person.dad_id,
                    person.mom_id, person.get_gender(),
                    person.get_affected_status(), person.get_path())
                output_file.write(line)
    finally:
        # make sure the final file is flushed and closed, even on error
        if output_file is not None:
            output_file.close()

    return files_n
def split_pedigree_file(tempdir, ped_path, number_of_jobs, exclude_parents, use_singletons_with_parents):
    """ split the ped file into multiple smaller ped files

    Families are grouped into consecutive batches, and each batch is written
    to its own file in tempdir ("1.ped", "2.ped", ...). A family is never
    split across files.

    Args:
        tempdir: path to the directory to write the smaller ped files into
        ped_path: path to pedigree file
        number_of_jobs: how many computational jobs to split the families
            over. This is a target rather than a guarantee, since families
            are never split across files.
        exclude_parents: true/false for whether to exclude parents from the
            run. Currently unused by this function.
        use_singletons_with_parents: if false, families for which
            is_singleton_without_parents() returns true are dropped before
            splitting.

    Returns:
        The number of files that the cohort has been split across (which
        will now be the number of jobs to run). This is 1 when no families
        remain, although no file is written in that case.

    Raises:
        ZeroDivisionError: if number_of_jobs is zero.
    """

    families = load_families(ped_path)

    if not use_singletons_with_parents:
        families = [x for x in families if not is_singleton_without_parents(x)]

    # figure out how many families to include per file, in order to make the
    # correct number of jobs. Clamp to at least one family per file: without
    # this, asking for more jobs than there are families meant the first
    # family was written to "2.ped" and the returned count was off by one.
    max_families = max(1.0, float(len(families)) / float(number_of_jobs))

    # group the families into per-file batches, starting a new batch when
    # adding another family would exceed the per-file limit
    batches = []
    for family in families:
        if not batches or len(batches[-1]) + 1 > max_families:
            batches.append([])
        batches[-1].append(family)

    # write each batch inside a with block, so every file gets closed
    for file_n, batch in enumerate(batches, 1):
        ped_name = os.path.join(tempdir, "{}.ped".format(file_n))
        with open(ped_name, "w") as output_file:
            for family in batch:
                for person in family:
                    # family members can be None (e.g. an absent parent)
                    if person is None:
                        continue

                    line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        person.family_id, person.get_id(), person.dad_id,
                        person.mom_id, person.get_gender(),
                        person.get_affected_status(), person.get_path())
                    output_file.write(line)

    # an empty cohort has always been reported as a single job
    return max(1, len(batches))
def load_trio_paths(self):
    """ sets self.families from the command line options

    With a ped file given, the families are loaded from it. Otherwise a
    single family is built from the individual child/mother/father options
    and stored in a dict keyed by its family ID.
    """

    opts = self.options

    # a ped file takes priority over the individual options
    if opts.ped is not None:
        self.families = ped.load_families(opts.ped)
        return

    trio = ped.Family("blank_family_ID")
    trio.add_child("child", opts.child, "2", opts.gender)

    # the parents are optional, so only add the ones that were given
    if opts.mother is not None:
        trio.add_mother("mother", opts.mother, opts.mom_aff, "2")
    if opts.father is not None:
        trio.add_father("father", opts.father, opts.dad_aff, "1")

    self.families = {trio.family_id: trio}
def get_families(args):
    """ loads a list of Family objects for multiple families, or a single trio

    Args:
        args: parsed command line options. If args.ped is set, the families
            are loaded from that ped file. Otherwise a single family is
            built from args.child, args.gender, and the optional
            args.mother/args.mom_aff and args.father/args.dad_aff.

    Returns:
        list of Family objects
    """

    if args.ped is not None:
        return load_families(args.ped)

    # The child's parental IDs must match the IDs the parents are added
    # under below ('father' and 'mother'), with '0' for an absent parent, as
    # in a ped file. Previously the parents' VCF paths were passed here, and
    # in mother/father order.
    # NOTE(review): assumes add_child takes (person ID, father ID, mother
    # ID, sex, affected status, path), as the other six-argument calls in
    # this codebase suggest - confirm against the Family class.
    dad_id = 'father' if args.father is not None else '0'
    mom_id = 'mother' if args.mother is not None else '0'

    family = Family('blank_family_ID')
    family.add_child('child', dad_id, mom_id, args.gender, '2', args.child)

    # the parents are optional, so only add the ones that were given
    if args.mother is not None:
        family.add_mother('mother', '0', '0', '2', args.mom_aff, args.mother)
    if args.father is not None:
        family.add_father('father', '0', '0', '1', args.dad_aff, args.father)

    return [family]
def load_ped(ped_path, proband_id):
    """ loads the pedigree lines for the family of a proband

    Args:
        ped_path: path to pedigree file for cohort
        proband_id: person ID of the proband of interest

    Returns:
        list of tab-separated, newline-terminated ped lines, one for each
        member (excluding None entries) of the first family that has a child
        with the given ID.

    Raises:
        IndexError: if no family has a child with the given ID.
    """

    # collect every family that includes the proband among its children
    matches = []
    for candidate in load_families(ped_path):
        for child in candidate.children:
            if child.get_id() == proband_id:
                matches.append(candidate)

    family = matches[0]

    def as_ped_line(person):
        """ format a family member as a line for a ped file """
        fields = (person.family_id, person.get_id(), person.dad_id,
            person.mom_id, person.get_gender(),
            person.get_affected_status(), person.get_path())
        return '{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(*fields)

    return [as_ped_line(member) for member in family if member is not None]