Ejemplo n.º 1
0
def test_combine_families_sex_mismatch():
    """Combining pedigree A with pedigree D (not forced) must assert.

    NOTE(review): judging by the test name the two fixtures disagree on the
    sex of a shared person -- confirm against the fixture files.
    """
    ped_a, ped_d = (
        FamiliesLoader.load_pedigree_file(relative_to_this_test_folder(path))
        for path in ("fixtures/pedigree_A.ped", "fixtures/pedigree_D.ped")
    )
    with pytest.raises(AssertionError):
        FamiliesData.combine(ped_a, ped_d, forced=False)
Ejemplo n.º 2
0
    def __init__(self, pheno_id: str, phenotype_data: Iterable[PhenotypeData]):
        """
        Build a phenotype group from several phenotype data sources.

        :param pheno_id: identifier forwarded to the parent constructor.
        :param phenotype_data: the phenotype data objects to group; each one
        must expose an ``instruments`` attribute. Any iterable is accepted,
        including one-shot generators.
        """
        super(PhenotypeGroup, self).__init__(pheno_id)
        # The members are walked twice below (once for the families, once for
        # the instruments), so a one-shot iterable has to be materialized
        # first or the second pass would silently see nothing.
        self.phenotype_data = list(phenotype_data)
        self.families = FamiliesData.combine_studies(self.phenotype_data)
        instruments, measures = self._merge_instruments(
            [ph.instruments for ph in self.phenotype_data])
        self._instruments.update(instruments)
        self._measures.update(measures)
Ejemplo n.º 3
0
 def _build_families(self):
     """Fetch family details for the remote study and store them.

     Every entry returned by the REST client is expected to carry a
     ``family_id`` and a ``members`` list of keyword-argument dicts for
     ``Person``. The result is stored in ``self._families``.
     """
     details = self.rest_client.get_all_family_details(
         self._remote_study_id)
     built = {}
     for entry in details:
         fam_id = entry["family_id"]
         members = [Person(**attrs) for attrs in entry["members"]]
         built[fam_id] = Family.from_persons(members)
     self._families = FamiliesData.from_families(built)
Ejemplo n.º 4
0
    def load_pedigree_file(pedigree_filename, pedigree_format=None):
        """
        Read a pedigree file and build the families it describes.

        :param pedigree_filename: path of the pedigree file to read.
        :param pedigree_format: optional dict of pedigree reading options.
        It is forwarded as keyword arguments to ``flexible_pedigree_read``
        and is also handed to the layout and role builders. When omitted, a
        fresh empty dict is used for this call. A dict supplied by the
        caller is normalized in place: ``ped_no_role`` and ``ped_no_header``
        are set to the result of ``str2bool`` (defaulting to ``False``).

        :return: the families object produced by
        ``FamiliesData.from_pedigree_df``, after layouts and roles were built.
        """
        # A ``{}`` default would be one dict shared by every call that omits
        # the argument, and this function writes into it; create a new one.
        if pedigree_format is None:
            pedigree_format = {}

        for flag in ("ped_no_role", "ped_no_header"):
            pedigree_format[flag] = str2bool(pedigree_format.get(flag, False))

        ped_df = FamiliesLoader.flexible_pedigree_read(pedigree_filename,
                                                       **pedigree_format)
        families = FamiliesData.from_pedigree_df(ped_df)

        FamiliesLoader._build_families_layouts(families, pedigree_format)
        FamiliesLoader._build_families_roles(families, pedigree_format)

        return families
Ejemplo n.º 5
0
def test_combine_families():
    """Combining pedigrees A and B yields family f1 with all five members."""
    load = FamiliesLoader.load_pedigree_file
    first = load(relative_to_this_test_folder("fixtures/pedigree_A.ped"))
    second = load(relative_to_this_test_folder("fixtures/pedigree_B.ped"))
    combined = FamiliesData.combine(first, second, forced=False)

    expected_members = {"f1.mom", "f1.dad", "f1.p1", "f1.s1", "f1.s2"}
    assert set(combined["f1"].persons.keys()) == expected_members
Ejemplo n.º 6
0
def test_ped2parquet_patition(
    pedigree, temp_filename, global_dae_fixtures_dir
):
    """Run the tool with a partition descriptor and check ``family_bin``.

    The produced parquet file must contain a ``family_bin`` column, and
    every member of family f1 / f2 must carry bin 9 / 6 respectively.
    """
    pedigree_path = f"{global_dae_fixtures_dir}/pedigrees/{pedigree}"
    assert os.path.exists(pedigree_path)

    descriptor_path = (
        f"{global_dae_fixtures_dir}/partition_descriptor/"
        f"partition_description.conf"
    )

    main([pedigree_path, "-o", temp_filename, "--pd", descriptor_path])

    assert os.path.exists(temp_filename)

    parquet_file = pq.ParquetFile(temp_filename)
    parquet_schema = parquet_file.schema
    assert "family_bin" in parquet_schema.names
    print(parquet_schema)

    ped_df = parquet_file.read().to_pandas()
    print(ped_df)
    families = FamiliesData.from_pedigree_df(ped_df)

    # Same membership + bin check for each family, in the original order.
    for family_id, expected_bin in (("f1", 9), ("f2", 6)):
        assert family_id in families
        bins = [p.family_bin for p in families[family_id].persons.values()]
        print(bins)
        assert all(b == expected_bin for b in bins)
Ejemplo n.º 7
0
def test_families_data_families_by_type(quad_persons,
                                        multigenerational_persons,
                                        simplex_persons, multiplex_persons):
    """Each sample family must be indexed under its own family type."""
    # NOTE(review): trio_persons is called directly instead of being injected
    # like the other arguments -- presumably a plain helper; confirm.
    persons_by_family = {
        "trio_family": trio_persons(),
        "quad_family": quad_persons,
        "multigenerational_family": multigenerational_persons,
        "simplex_family": simplex_persons,
        "multiplex_family": multiplex_persons,
    }
    families_data = FamiliesData.from_families({
        family_id: Family.from_persons(persons)
        for family_id, persons in persons_by_family.items()
    })

    expected = {
        FamilyType.QUAD: {"quad_family"},
        FamilyType.TRIO: {"trio_family"},
        FamilyType.MULTIGENERATIONAL: {"multigenerational_family"},
        FamilyType.SIMPLEX: {"simplex_family"},
        FamilyType.MULTIPLEX: {"multiplex_family"},
    }
    assert families_data.families_by_type == expected
Ejemplo n.º 8
0
 def _build_families(self):
     """Return what ``FamiliesData.combine_studies`` builds from the studies."""
     combined = FamiliesData.combine_studies(self.studies)
     return combined
Ejemplo n.º 9
0
def fake_families(fixture_dirname):
    """Build families from the ``denovo_import/fake_pheno.ped`` fixture.

    ``fixture_dirname`` is called with the relative path to resolve where
    the pedigree file actually lives.
    """
    pedigree_path = fixture_dirname("denovo_import/fake_pheno.ped")
    pedigree_df = FamiliesLoader.flexible_pedigree_read(pedigree_path)
    return FamiliesData.from_pedigree_df(pedigree_df)
Ejemplo n.º 10
0
    def load_simple_families_file(infile, ped_sep="\t"):
        """
        Load a "simple" families file and build the families it describes.

        The file is read as delimiter-separated text; lines starting with
        ``#`` are treated as comments. The ``role`` and ``gender``/``sex``
        columns are converted through ``Role.from_name`` and
        ``Sex.from_name``. Camel-case column names are renamed to their
        snake-case equivalents and ``gender`` becomes ``sex``.

        Status is derived from the role: rows with the ``Role.prb`` role get
        value 2, all other rows get 1, and the value is then mapped through
        ``Status.from_value``. Parent IDs are not read from the file: they
        are reset to ``"0"`` for everyone, then every non-parent member gets
        the person IDs of the ``Role.mom`` / ``Role.dad`` members of the
        same family (``None`` when the family has no such member).

        :param infile: path or file-like object accepted by ``pd.read_csv``.
        :param ped_sep: column separator, tab by default.

        :return: the result of ``FamiliesData.from_family_persons`` for the
        persons grouped by family ID.
        """
        column_renames = {
            "gender": "sex",
            "personId": "person_id",
            "familyId": "family_id",
            "momId": "mom_id",
            "dadId": "dad_id",
            "sampleId": "sample_id",
        }
        fam_df = pd.read_csv(
            infile,
            sep=ped_sep,
            index_col=False,
            skipinitialspace=True,
            converters={
                "role": Role.from_name,
                "gender": Sex.from_name,
                "sex": Sex.from_name,
            },
            dtype={"familyId": str, "personId": str},
            comment="#",
        ).rename(columns=column_renames)

        # Start everybody at 1, bump the probands to 2, then convert.
        fam_df["status"] = 1
        fam_df.loc[fam_df.role == Role.prb, "status"] = 2
        fam_df["status"] = fam_df["status"].apply(Status.from_value)

        # Any parent IDs present in the file are discarded here; the real
        # ones are filled in per family further down.
        fam_df["mom_id"] = "0"
        fam_df["dad_id"] = "0"

        if "sample_id" not in fam_df.columns:
            fam_df["sample_id"] = pd.Series(data=fam_df["person_id"].values)

        records_by_family = defaultdict(list)
        for record in fam_df.to_dict(orient="records"):
            records_by_family[record["family_id"]].append(record)

        child_roles = {Role.prb, Role.sib}
        persons_by_family = defaultdict(list)
        for family_id, records in records_by_family.items():
            mother = father = None
            offspring = []
            for record in records:
                role = record["role"]
                if role == Role.mom:
                    mother = record["person_id"]
                elif role == Role.dad:
                    father = record["person_id"]
                else:
                    assert role in child_roles
                    offspring.append(record)

            # Parents keep the "0" placeholders; only children are linked.
            for record in offspring:
                record["mom_id"] = mother
                record["dad_id"] = father

            persons_by_family[family_id] = [Person(**r) for r in records]

        return FamiliesData.from_family_persons(persons_by_family)
Ejemplo n.º 11
0
 def _load_families(self):
     """Group the persons by family ID and store the resulting families.

     Persons come from ``self.get_persons()``; the grouped mapping is passed
     to ``FamiliesData.from_family_persons`` and kept in ``self.families``.
     """
     members_by_family = defaultdict(list)
     for person in self.get_persons().values():
         members_by_family[person.family_id].append(person)
     self.families = FamiliesData.from_family_persons(members_by_family)
Ejemplo n.º 12
0
    def _flexible_denovo_load_internal(
            cls,
            filepath: str,
            genome: Genome,
            families: FamiliesData,
            denovo_location: Optional[str] = None,
            denovo_variant: Optional[str] = None,
            denovo_chrom: Optional[str] = None,
            denovo_pos: Optional[str] = None,
            denovo_ref: Optional[str] = None,
            denovo_alt: Optional[str] = None,
            denovo_person_id: Optional[str] = None,
            denovo_family_id: Optional[str] = None,
            denovo_best_state: Optional[str] = None,
            denovo_genotype: Optional[str] = None,
            denovo_sep: str = "\t",
            adjust_chrom_prefix=None,
            **kwargs) -> tuple:
        """
        Read a text file containing variants in the form
        of delimiter-separated values and produce a dataframe.

        The text file may use different names for the columns
        containing the relevant data - these are specified
        with the provided parameters.

        When the caller does not identify the relevant columns, defaults
        are used: ``"location"`` if neither ``denovo_location`` nor both
        ``denovo_chrom`` and ``denovo_pos`` are given; ``"variant"`` if
        neither ``denovo_variant`` nor both ``denovo_ref`` and ``denovo_alt``
        are given; ``"familyId"`` and ``"bestState"`` if neither
        ``denovo_person_id`` nor ``denovo_family_id`` together with a best
        state or genotype column are given.

        :param str filepath: The path to the DSV file to read.

        :param genome: A reference genome object.

        :param families: An instance of the FamiliesData class for the
        pedigree of the relevant study.

        :param str denovo_location: The label or index of the column containing
        the CSHL-style location of the variant.

        :param str denovo_variant: The label or index of the column containing
        the CSHL-style representation of the variant.

        :param str denovo_chrom: The label or index of the column containing
        the chromosome upon which the variant is located.

        :param str denovo_pos: The label or index of the column containing the
        position upon which the variant is located.

        :param str denovo_ref: The label or index of the column containing the
        variant's reference allele.

        :param str denovo_alt: The label or index of the column containing the
        variant's alternative allele.

        :param str denovo_person_id: The label or index of the column
        containing either a singular person ID or a comma-separated
        list of person IDs.

        :param str denovo_family_id: The label or index of the column
        containing a singular family ID.

        :param str denovo_best_state: The label or index of the column
        containing the best state for the variant.

        :param str denovo_genotype: The label or index of the column
        containing the genotype; it is parsed with ``str2gt`` and only used
        when no best state column is in effect.

        :param str denovo_sep: The column separator; ``None`` falls back to
        a tab.

        :param adjust_chrom_prefix: Optional callable applied to every
        chromosome name.

        :param kwargs: Accepted but not used inside this function.

        :type genome: An instance of Genome.

        :return: A tuple of two items. The first is a dataframe containing
        the variants, with the columns 'chrom', 'position', 'reference',
        'alternative', 'family_id', 'genotype' and 'best_state', followed by
        any extra attribute columns. The second is the list of the extra
        attribute column names, i.e. the input columns that were not named
        by any of the ``denovo_*`` parameters.

        :rtype: tuple of (pandas DataFrame, list).
        """

        assert families is not None
        assert isinstance(
            families, FamiliesData
        ), "families must be an instance of FamiliesData!"
        assert genome, "You must provide a genome object!"

        # Fall back to the default column names for whatever the caller
        # left unspecified.
        if not (denovo_location or (denovo_chrom and denovo_pos)):
            denovo_location = "location"

        if not (denovo_variant or (denovo_ref and denovo_alt)):
            denovo_variant = "variant"

        if not (denovo_person_id or
                (denovo_family_id and
                    (denovo_best_state or denovo_genotype))):
            denovo_family_id = "familyId"
            denovo_best_state = "bestState"

        if denovo_sep is None:
            denovo_sep = "\t"

        # Passing both `converters` and `dtype` below makes pandas emit a
        # ParserWarning; it is silenced for the duration of the read.
        with warnings.catch_warnings(record=True) as _:
            warnings.filterwarnings(
                "ignore",
                category=pd.errors.ParserWarning,
                message="Both a converter and dtype were specified",
            )

            # NOTE(review): denovo_pos is None when a location column is
            # used, so the converters dict is then keyed by None -- confirm
            # pandas ignores such a key.
            raw_df = pd.read_csv(
                filepath,
                sep=denovo_sep,
                converters={
                    denovo_pos: lambda p: int(p) if p else None,
                },
                dtype=str,
                comment="#",
                encoding="utf-8",
                na_filter=False)

        # Chromosome and position either come from one combined location
        # column or from two separate columns.
        if denovo_location:
            chrom_col, pos_col = zip(
                *map(cls.split_location, raw_df[denovo_location])
            )
        else:
            chrom_col = raw_df.loc[:, denovo_chrom]
            pos_col = raw_df.loc[:, denovo_pos]

        if adjust_chrom_prefix is not None:
            chrom_col = tuple(map(adjust_chrom_prefix, chrom_col))

        # Reference/alternative either get derived from the variant column
        # (which also replaces the positions) or are read directly.
        if denovo_variant:
            variant_col = raw_df.loc[:, denovo_variant]
            ref_alt_tuples = [
                dae2vcf_variant(
                    variant_tuple[0], variant_tuple[1], variant_tuple[2],
                    genome.get_genomic_sequence()
                ) for variant_tuple in zip(chrom_col, pos_col, variant_col)
            ]
            pos_col, ref_col, alt_col = zip(*ref_alt_tuples)

        else:
            ref_col = raw_df.loc[:, denovo_ref]
            alt_col = raw_df.loc[:, denovo_alt]

        # Every input column not named by a denovo_* parameter is carried
        # over as an extra attribute.
        extra_attributes_cols = raw_df.columns.difference([
            denovo_location, denovo_variant, denovo_chrom, denovo_pos,
            denovo_ref, denovo_alt, denovo_person_id, denovo_family_id,
            denovo_best_state, denovo_genotype
        ])

        if denovo_person_id:
            temp_df = pd.DataFrame(
                {
                    "chrom": chrom_col,
                    "pos": pos_col,
                    "ref": ref_col,
                    "alt": alt_col,
                    "person_id": raw_df.loc[:, denovo_person_id],
                }
            )

            # Rows describing the same variant are collapsed into one group.
            grouped = temp_df.groupby(["chrom", "pos", "ref", "alt"])

            result = []

            # TODO Implement support for multiallelic variants
            for variant, variants_indices in grouped.groups.items():
                # Here we join and then split again by ',' to handle cases
                # where the person IDs are actually a list of IDs, separated
                # by a ','
                # NOTE(review): the group indices are index labels while
                # .iloc is positional; this presumably relies on the default
                # 0..n-1 index of temp_df -- confirm.
                person_ids = ",".join(
                    temp_df.iloc[variants_indices].loc[:, "person_id"]
                ).split(",")

                variant_families = families.families_query_by_person_ids(
                    person_ids)

                # TODO Implement support for multiallelic variants

                # One output record per (variant, family) pair.
                for family_id, family in variant_families.items():
                    family_dict = {
                        "chrom": variant[0],
                        "position": variant[1],
                        "reference": variant[2],
                        "alternative": variant[3],
                        "family_id": family_id,
                        "genotype": cls.produce_genotype(
                            variant[0],
                            variant[1],
                            genome,
                            family,
                            person_ids,
                        ),
                        "best_state": None,
                    }
                    # Extra attributes are taken from the first row of the
                    # group only.
                    record = raw_df.loc[variants_indices[0]]
                    extra_attributes = record[extra_attributes_cols].to_dict()

                    result.append({**family_dict, **extra_attributes})

            denovo_df = pd.DataFrame(result)

        else:
            family_col = raw_df.loc[:, denovo_family_id]
            if denovo_best_state:
                # Best state given: parse it and leave the genotype empty.
                best_state_col = list(
                    map(
                        lambda bs: str2mat(bs, col_sep=" "),  # type: ignore
                        raw_df[denovo_best_state],
                    )
                )
                # genotype_col = list(map(best2gt, best_state_col))

                denovo_df = pd.DataFrame(
                    {
                        "chrom": chrom_col,
                        "position": pos_col,
                        "reference": ref_col,
                        "alternative": alt_col,
                        "family_id": family_col,
                        "genotype": None,
                        "best_state": best_state_col,
                    }
                )
            else:
                # Genotype given: parse it and leave the best state empty.
                assert denovo_genotype
                genotype_col = list(
                    map(
                        lambda gts: str2gt(gts),  # type: ignore
                        raw_df[denovo_genotype],
                    )
                )
                # genotype_col = list(map(best2gt, best_state_col))

                denovo_df = pd.DataFrame(
                    {
                        "chrom": chrom_col,
                        "position": pos_col,
                        "reference": ref_col,
                        "alternative": alt_col,
                        "family_id": family_col,
                        "genotype": genotype_col,
                        "best_state": None,
                    }
                )

            # Rows map one-to-one to the input rows here, so the extra
            # attribute columns are attached by index.
            extra_attributes_df = raw_df[extra_attributes_cols]
            denovo_df = denovo_df.join(extra_attributes_df)

        return (denovo_df, extra_attributes_cols.tolist())