Esempio n. 1
0
def test_extra_attributes_serialization_deserialization(
        fixtures_gpf_instance, fixture_dirname):
    """Round-trip a family variant through ``AlleleParquetSerializer``.

    The first family variant yielded by the de novo loader is serialized
    (summary, scores, family-variant and extra-attributes blobs) and then
    deserialized again; the ``someAttr`` attribute must come back as
    ``"asdf"``.
    """
    pedigree_path = fixture_dirname("backends/iossifov_extra_attrs.ped")
    families_data = FamiliesLoader.load_simple_families_file(pedigree_path)

    denovo_path = fixture_dirname("backends/iossifov_extra_attrs.tsv")
    loader = DenovoLoader(
        families_data, denovo_path, fixtures_gpf_instance.get_genome())

    # The serializer is configured from the loader's own schema attributes.
    annotation_schema = loader.get_attribute("annotation_schema")
    extra_attribute_names = loader.get_attribute("extra_attributes")
    serializer = AlleleParquetSerializer(
        annotation_schema, extra_attribute_names)

    # Each iterator entry is indexed as entry[1][0] to reach the first
    # family variant.
    first_entry = next(loader.full_variants_iterator())
    variant = first_entry[1][0]
    print(variant.gt)

    summary_blobs = serializer.serialize_summary_data(variant.alleles)
    scores_blob = serializer.serialize_scores_data(variant.alleles)
    variant_blob = serializer.serialize_family_variant(
        variant.alleles, summary_blobs, scores_blob)
    extra_blob = serializer.serialize_extra_attributes(variant)

    restored = serializer.deserialize_family_variant(
        variant_blob, variant.family, extra_blob)

    assert restored.get_attribute("someAttr")[0] == "asdf"
Esempio n. 2
0
def test_denovo_loader_avoids_duplicates(
    genome_2013,
    fixture_dirname,
    fake_families,
):
    """Check the variant counts produced for a file with duplicated rows.

    Loads ``denovo_import/variants_VCF_style_dup.tsv`` with explicit column
    mappings and expects 3 summary variants and 4 family variants overall.
    """
    denovo_filename = fixture_dirname(
        "denovo_import/variants_VCF_style_dup.tsv")
    params = {
        "denovo_chrom": "chrom",
        "denovo_pos": "pos",
        "denovo_ref": "ref",
        "denovo_alt": "alt",
        "denovo_family_id": "familyId",
        "denovo_best_state": "bestState",
    }
    variants_loader = DenovoLoader(fake_families,
                                   denovo_filename,
                                   genome=genome_2013,
                                   params=params)

    summary_variants = []
    family_variants = []
    for summary_variant, variant_group in \
            variants_loader.full_variants_iterator():
        # Debug output: the summary variant together with the family
        # variants that belong to it (not the running accumulator).
        print(summary_variant, variant_group)
        summary_variants.append(summary_variant)
        family_variants.extend(variant_group)

    assert len(summary_variants) == 3
    assert len(family_variants) == 4
Esempio n. 3
0
def test_extra_attributes_loading_with_person_id(
        fixtures_gpf_instance, fixture_dirname):
    """Load a de novo file keyed by person id and check ``StudyName``.

    The file ``backends/denovo-db-person-id.tsv`` must yield 17 entries,
    and the first family variant of entries 0-3 must carry the expected
    ``StudyName`` extra attribute.
    """
    pedigree_path = fixture_dirname("backends/denovo-db-person-id.ped")
    families_data = FamiliesLoader(pedigree_path).load()

    column_mapping = {
        "denovo_chrom": "Chr",
        "denovo_pos": "Position",
        "denovo_ref": "Ref",
        "denovo_alt": "Alt",
        "denovo_person_id": "SampleID"
    }
    loader = DenovoLoader(
        families_data,
        fixture_dirname("backends/denovo-db-person-id.tsv"),
        fixtures_gpf_instance.get_genome(),
        params=column_mapping,
    )

    variants = list(loader.full_variants_iterator())
    assert len(variants) == 17

    # First family variant of every (summary, family variants) entry.
    family_variants = [entry[1][0] for entry in variants]

    expected_studies = [
        "Turner_2017", "Turner_2017", "Turner_2017", "Lelieveld2016",
    ]
    for family_variant, study_name in zip(family_variants, expected_studies):
        assert family_variant.get_attribute("StudyName")[0] == study_name

    for family_variant in family_variants:
        print(family_variant)
Esempio n. 4
0
def test_families_instance_type_assertion():
    """``flexible_denovo_load`` must reject a non-FamiliesData ``families``.

    A plain string is passed as ``families``; the call is expected to fail
    with an ``AssertionError`` carrying the exact message checked below.
    """
    load_kwargs = {
        "denovo_location": "foo",
        "denovo_variant": "bar",
        "denovo_person_id": "baz",
        "families": "bla",
    }

    with pytest.raises(AssertionError) as excinfo:
        DenovoLoader.flexible_denovo_load(None, None, **load_kwargs)

    assert str(excinfo.value) == \
        "families must be an instance of FamiliesData!"
Esempio n. 5
0
def test_read_variants_genome_assertion(fixture_dirname, fake_families):
    """``flexible_denovo_load`` must fail when the genome argument is None.

    The expected failure is an ``AssertionError`` with the exact message
    checked at the end of the test.
    """
    denovo_path = fixture_dirname("denovo_import/variants_DAE_style.tsv")
    load_kwargs = {
        "families": fake_families,
        "denovo_location": "location",
        "denovo_variant": "variant",
        "denovo_family_id": "familyId",
        "denovo_best_state": "bestState",
    }

    with pytest.raises(AssertionError) as excinfo:
        # The second positional argument (the genome) is deliberately None.
        DenovoLoader.flexible_denovo_load(denovo_path, None, **load_kwargs)

    expected_message = "You must provide a genome object!"
    assert str(excinfo.value) == expected_message
Esempio n. 6
0
def test_families_genotypes_decorator_broken_x(fixture_dirname, genome_2013):
    """Every family variant from the broken-X file must be ``X_broken``.

    Families are read from ``backends/denovo_families.txt`` in the
    ``simple`` pedigree format and variants from
    ``backends/denovo_X_broken.txt``.
    """
    families = FamiliesLoader(
        fixture_dirname("backends/denovo_families.txt"),
        ped_file_format="simple",
    ).load()

    denovo_path = fixture_dirname("backends/denovo_X_broken.txt")
    variants_loader = DenovoLoader(families, denovo_path, genome_2013)

    for _summary_variant, family_variants in \
            variants_loader.full_variants_iterator():
        for family_variant in family_variants:
            print(family_variant, family_variant.genetic_model)
            assert family_variant.genetic_model == GeneticModel.X_broken
Esempio n. 7
0
def test_read_variants_DAE_style(genome_2013, fixture_dirname, fake_families):
    """Load a DAE-style de novo file and compare it with the expected frame.

    The file is read through ``location``/``variant`` columns plus family id
    and best state columns; the result is compared to a hand-written frame
    via ``compare_variant_dfs``. The expected ``genotype`` column is all
    ``None`` and ``best_state`` holds one array per row.
    """
    res_df = DenovoLoader.flexible_denovo_load(
        fixture_dirname("denovo_import/variants_DAE_style.tsv"),
        genome_2013,
        families=fake_families,
        denovo_location="location",
        denovo_variant="variant",
        denovo_family_id="familyId",
        denovo_best_state="bestState",
    )

    expected_best_states = [
        np.array([[2, 2, 1, 2, 1], [0, 0, 1, 0, 1]]),
        np.array([[2, 2, 1, 2, 2], [0, 0, 1, 0, 0]]),
        np.array([[2, 2, 2, 1], [0, 0, 0, 1]]),
        np.array([[1], [1]]),
        np.array([[1, 1], [1, 1]]),
    ]
    expected_df = pd.DataFrame({
        "chrom": ["1", "2", "2", "3", "4"],
        "position": [123, 234, 234, 345, 456],
        "reference": ["A", "T", "G", "G", "G"],
        "alternative": ["G", "A", "A", "A", "A"],
        "family_id": ["f1", "f1", "f2", "f3", "f4"],
        "genotype": [None] * 5,
        "best_state": expected_best_states,
    })

    assert compare_variant_dfs(res_df, expected_df)
Esempio n. 8
0
def test_produce_genotype(fake_families, genome_2013):
    """Check ``produce_genotype`` for family ``f1`` with two listed members.

    With ``f1.p1`` and ``f1.s2`` passed as the member list, the result must
    equal the 2x5 array below and have dtype ``GENOTYPE_TYPE``.
    """
    members_with_variant = ["f1.p1", "f1.s2"]
    genotype = DenovoLoader.produce_genotype(
        "1", 123123, genome_2013, fake_families["f1"], members_with_variant)

    expected_genotype = np.array([
        [0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1],
    ])
    assert np.array_equal(genotype, expected_genotype)
    assert genotype.dtype == GENOTYPE_TYPE
Esempio n. 9
0
def dae_denovo(dae_denovo_config, genome_2013, annotation_pipeline_internal):
    """Build in-memory variants from the configured de novo study files.

    Families are loaded in the ``simple`` pedigree format, the de novo
    loader is wrapped in an ``AnnotationPipelineDecorator`` and the result
    is returned as a ``RawMemoryVariants`` instance.
    """
    families = FamiliesLoader(
        dae_denovo_config.family_filename,
        ped_file_format="simple",
    ).load()

    denovo_loader = DenovoLoader(
        families, dae_denovo_config.denovo_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)

    return RawMemoryVariants([annotated_loader])
Esempio n. 10
0
def iossifov2014_loader(dae_iossifov2014_config, genome_2013,
                        annotation_pipeline_internal):
    """Build the annotated de novo loader for the iossifov2014 config.

    Returns:
        A ``(variants_loader, families_loader)`` tuple, where the variants
        loader is a ``DenovoLoader`` wrapped in an
        ``AnnotationPipelineDecorator``.
    """
    families_loader = FamiliesLoader(dae_iossifov2014_config.family_filename)
    families = families_loader.load()

    denovo_loader = DenovoLoader(
        families, dae_iossifov2014_config.denovo_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)

    return annotated_loader, families_loader
Esempio n. 11
0
def denovo_extra_attr_loader(
        fixture_dirname, genome_2013, annotation_pipeline_internal):
    """Build an annotated de novo loader for the extra-attributes fixtures.

    Uses ``backends/iossifov_extra_attrs.ped`` for the families and
    ``backends/iossifov_extra_attrs.tsv`` for the variants, and returns the
    loader wrapped in an ``AnnotationPipelineDecorator``.
    """
    families_filename = fixture_dirname("backends/iossifov_extra_attrs.ped")
    variants_filename = fixture_dirname("backends/iossifov_extra_attrs.tsv")

    families = FamiliesLoader.load_simple_families_file(families_filename)
    denovo_loader = DenovoLoader(families, variants_filename, genome_2013)

    return AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)
Esempio n. 12
0
    def builder(path, params=None):
        """Build the annotated variant loaders for the VCF test data at path.

        Args:
            path: passed to ``vcf_loader_data`` to obtain the pedigree, VCF
                and optional de novo file locations.
            params: parameters forwarded to ``VcfLoader``. When omitted (or
                ``None``) a fresh copy of the default parameter set below
                is used, so no dict is shared between calls.

        Returns:
            A list of ``AnnotationPipelineDecorator`` instances: the de novo
            loader first (only when ``config.denovo`` is set), followed by
            the VCF loader.
        """
        if params is None:
            # Built per call: a dict literal used as a default argument
            # would be a single object shared by every invocation.
            params = {
                "vcf_include_reference_genotypes": True,
                "vcf_include_unknown_family_genotypes": True,
                "vcf_include_unknown_person_genotypes": True,
                "vcf_denovo_mode": "denovo",
                "vcf_omission_mode": "omission",
            }

        config = vcf_loader_data(path)

        families_loader = FamiliesLoader(config.pedigree)
        families = families_loader.load()

        loaders = []

        if config.denovo:
            denovo_loader = DenovoLoader(families,
                                         config.denovo,
                                         genomes_db_2013.get_genome(),
                                         params={
                                             "denovo_genotype": "genotype",
                                             "denovo_family_id": "family",
                                             "denovo_chrom": "chrom",
                                             "denovo_pos": "pos",
                                             "denovo_ref": "ref",
                                             "denovo_alt": "alt",
                                         })
            loaders.append(
                AnnotationPipelineDecorator(denovo_loader,
                                            default_annotation_pipeline))

        vcf_loader = VcfLoader(families, [config.vcf],
                               genomes_db_2013.get_genome(),
                               params=params)

        loaders.append(
            AnnotationPipelineDecorator(vcf_loader,
                                        default_annotation_pipeline))

        return loaders
Esempio n. 13
0
def test_read_variants_person_ids(genome_2013, filename, fake_families,
                                  fixture_dirname):
    """Load a de novo file keyed by person id and compare the genotypes.

    Both the loaded and the expected frame are sorted by chromosome,
    position and reference (with the index reset) before being compared
    through ``compare_variant_dfs``. The expected ``best_state`` column is
    all ``None`` and ``genotype`` holds one array per row.
    """
    denovo_path = fixture_dirname(filename)
    res_df = DenovoLoader.flexible_denovo_load(
        denovo_path,
        genome_2013,
        families=fake_families,
        denovo_chrom="chrom",
        denovo_pos="pos",
        denovo_ref="ref",
        denovo_alt="alt",
        denovo_person_id="personId",
    )

    expected_genotypes = [
        np.array([[0, 0, 0, 0, 0], [0, 0, 1, 0, 1]]),
        np.array([[0, 0, 0, 0, 0], [0, 0, 1, 0, 0]]),
        np.array([[0, 0, 0, 0], [0, 0, 0, 1]]),
        np.array([[0], [1]]),
        np.array([[0, 0], [1, 1]]),
    ]
    expected_df = pd.DataFrame({
        "chrom": ["1", "2", "2", "3", "4"],
        "position": [123, 234, 235, 345, 456],
        "reference": ["A", "T", "G", "G", "G"],
        "alternative": ["G", "A", "A", "A", "A"],
        "family_id": ["f1", "f1", "f2", "f3", "f4"],
        "genotype": expected_genotypes,
        "best_state": [None] * 5,
    })

    # Frames as loaded / as written.
    print(res_df)
    print(expected_df)

    # Bring both frames into the same row order before comparing.
    sort_columns = ["chrom", "position", "reference"]
    res_df = res_df.sort_values(sort_columns).reset_index(drop=True)
    expected_df = expected_df.sort_values(sort_columns).reset_index(drop=True)

    # Frames after sorting.
    print(res_df)
    print(expected_df)

    assert compare_variant_dfs(res_df, expected_df)
Esempio n. 14
0
def test_produce_genotype_no_people_with_variants(fake_families, genome_2013):
    """Check ``produce_genotype`` for family ``f1`` with an empty member list.

    The result must be an all-zero 2x5 array of dtype ``GENOTYPE_TYPE``.
    """
    genotype = DenovoLoader.produce_genotype(
        "1", 123123, genome_2013, fake_families["f1"], [])

    expected_genotype = np.array([
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
    ])
    assert np.array_equal(genotype, expected_genotype)
    assert genotype.dtype == GENOTYPE_TYPE
Esempio n. 15
0
def test_denovo_loader(genome_2013, fixture_dirname, fake_families, filename,
                       params):
    """Check per-member inheritance for the first five loaded variants.

    For each of the first five entries, the first alt allele of the first
    family variant is inspected: selected member positions are spot-checked
    for ``Inheritance.denovo`` and then the whole ``inheritance_in_members``
    list is compared with the expected one.
    """
    denovo_filename = fixture_dirname(f"denovo_import/{filename}")
    variants_loader = DenovoLoader(fake_families,
                                   denovo_filename,
                                   genome=genome_2013,
                                   params=params)

    vs = list(variants_loader.full_variants_iterator())
    print(vs)

    unknown = Inheritance.unknown
    denovo = Inheritance.denovo
    missing = Inheritance.missing

    # (entry index, member positions spot-checked for denovo,
    #  full expected inheritance list)
    expectations = [
        (0, (2, 4), [unknown, unknown, denovo, missing, denovo]),
        (1, (2,), [unknown, unknown, denovo, missing, missing]),
        (2, (3,), [unknown, unknown, missing, denovo]),
        (3, (0,), [denovo]),
        (4, (0,), [denovo, denovo]),
    ]

    for index, denovo_positions, expected_inheritance in expectations:
        # First alt allele of the first family variant of this entry.
        fa = vs[index][1][0].alt_alleles[0]
        print(fa, fa.variant_in_members, fa.inheritance_in_members)

        for position in denovo_positions:
            assert fa.inheritance_in_members[position] == Inheritance.denovo
        assert fa.inheritance_in_members == expected_inheritance
Esempio n. 16
0
    def build_backend(self, study_config, genomes_db):
        """Build the in-memory variants backend for a study.

        When the study configuration lists no genotype storage files, the
        study is loaded from ``<id>.vcf`` and ``<id>.ped`` located in the
        directory returned by ``self.get_data_dir(study_config.id, "data")``.
        Otherwise the configured pedigree is loaded with its parameters and
        one loader is created per configured variants file, chosen by the
        file's ``format`` (``vcf``, ``denovo``, ``dae`` or ``cnv``). Every
        loader is wrapped with ``StoredAnnotationDecorator.decorate``.

        Args:
            study_config: study configuration; ``id`` and
                ``genotype_storage.files`` are read from it.
            genomes_db: object providing ``get_genome()``.

        Returns:
            A ``RawMemoryVariants`` built from the loaders and the families.

        Raises:
            ValueError: if a configured variants file has a format other
                than ``vcf``, ``denovo``, ``dae`` or ``cnv``.
        """
        if not study_config.genotype_storage.files:
            data_dir = self.get_data_dir(study_config.id, "data")
            vcf_filename = os.path.join(data_dir, f"{study_config.id}.vcf")
            ped_filename = os.path.join(data_dir, f"{study_config.id}.ped")

            families_loader = FamiliesLoader(ped_filename)
            families = families_loader.load()
            variants_loader = VcfLoader(families, [vcf_filename],
                                        genomes_db.get_genome())
            variants_loader = StoredAnnotationDecorator.decorate(
                variants_loader, vcf_filename)

            return RawMemoryVariants([variants_loader], families)

        start = time.time()
        ped_params = \
            study_config.genotype_storage.files.pedigree.params.to_dict()
        ped_filename = study_config.genotype_storage.files.pedigree.path
        logger.debug(f"pedigree params: {ped_filename}; {ped_params}")

        families_loader = FamiliesLoader(ped_filename, **ped_params)
        families = families_loader.load()
        elapsed = time.time() - start
        logger.info(f"families loaded in in {elapsed:.2f} sec")
        logger.debug(f"{families.ped_df.head()}")

        # Loaders that take a single variants filename; "vcf" is handled
        # separately because it accepts a list of filenames.
        single_file_loaders = {
            "denovo": DenovoLoader,
            "dae": DaeTransmittedLoader,
            "cnv": CNVLoader,
        }

        loaders = []
        for file_conf in study_config.genotype_storage.files.variants:
            variants_filename = file_conf.path
            variants_params = file_conf.params.to_dict()
            logger.debug(
                f"variant params: {variants_filename}; {variants_params}")

            annotation_filename = variants_filename
            if file_conf.format == "vcf":
                # A VCF entry may list several space-separated filenames;
                # the first one is used as the annotation filename.
                variants_filenames = [
                    fn.strip() for fn in variants_filename.split(" ")
                ]
                variants_loader = VcfLoader(
                    families,
                    variants_filenames,
                    genomes_db.get_genome(),
                    params=variants_params,
                )
                annotation_filename = variants_filenames[0]
            elif file_conf.format in single_file_loaders:
                loader_class = single_file_loaders[file_conf.format]
                variants_loader = loader_class(
                    families,
                    variants_filename,
                    genomes_db.get_genome(),
                    params=variants_params,
                )
            else:
                # Fail loudly instead of hitting an unbound local or
                # silently re-registering the previous file's loader.
                raise ValueError(
                    f"unsupported variants file format "
                    f"{file_conf.format!r} for {variants_filename}")

            variants_loader = StoredAnnotationDecorator.decorate(
                variants_loader, annotation_filename)
            loaders.append(variants_loader)

        return RawMemoryVariants(loaders, families)
Esempio n. 17
0
    def build(dirname):
        """Import every VCF study found under ``fixtures/<dirname>``.

        The test database is created if ``check_database`` reports it as
        missing. For each collected config, the study is skipped when
        ``reimport`` is false and both of its tables already exist;
        otherwise the pedigree, the optional de novo file and the VCF file
        are loaded, wrapped in ``AnnotationPipelineDecorator`` and handed to
        ``impala_genotype_storage.simple_study_import``.
        """
        if not impala_helpers.check_database(impala_test_dbname()):
            impala_helpers.create_database(impala_test_dbname())

        vcfdirname = relative_to_this_test_folder(
            os.path.join("fixtures", dirname))

        for config in collect_vcf(vcfdirname):
            logger.debug(f"importing: {config}")

            # Table names are derived from the pedigree file's base name.
            pedigree_basename = os.path.basename(config.pedigree)
            study_id = os.path.splitext(pedigree_basename)[0]
            variant_table, pedigree_table = \
                impala_genotype_storage.study_tables(
                    FrozenBox({"id": study_id}))

            if not reimport:
                tables_exist = (
                    impala_helpers.check_table(
                        impala_test_dbname(), variant_table)
                    and impala_helpers.check_table(
                        impala_test_dbname(), pedigree_table)
                )
                if tables_exist:
                    continue

            # The id used for the import itself comes from
            # study_id_from_path.
            study_id = study_id_from_path(config.pedigree)
            study_temp_dirname = os.path.join(temp_dirname, study_id)

            families_loader = FamiliesLoader(config.pedigree)
            families = families_loader.load()
            genome = gpf_instance_2013.genomes_db.get_genome()

            loaders = []
            if config.denovo:
                denovo_params = {
                    "denovo_genotype": "genotype",
                    "denovo_family_id": "family",
                    "denovo_chrom": "chrom",
                    "denovo_pos": "pos",
                    "denovo_ref": "ref",
                    "denovo_alt": "alt",
                }
                denovo_loader = DenovoLoader(
                    families, config.denovo, genome, params=denovo_params)
                loaders.append(
                    AnnotationPipelineDecorator(
                        denovo_loader, annotation_pipeline))

            vcf_params = {
                "vcf_include_reference_genotypes": True,
                "vcf_include_unknown_family_genotypes": True,
                "vcf_include_unknown_person_genotypes": True,
                "vcf_multi_loader_fill_in_mode": "reference",
                "vcf_denovo_mode": "denovo",
                "vcf_omission_mode": "omission",
            }
            vcf_loader = VcfLoader(
                families,
                [config.vcf],
                genome,
                regions=None,
                params=vcf_params,
            )
            loaders.append(
                AnnotationPipelineDecorator(vcf_loader, annotation_pipeline))

            impala_genotype_storage.simple_study_import(
                study_id,
                families_loader=families_loader,
                variant_loaders=loaders,
                output=study_temp_dirname,
                include_reference=True)