def test_parquet_region_bin(fam1, gt, chromosomes, region_length,
                            summary_alleles, expected):
    sv = SummaryVariant(summary_alleles)
    fv = FamilyVariant(sv, fam1, gt, None)
    pd = ParquetPartitionDescriptor(chromosomes, region_length)
    region_bin = pd._evaluate_region_bin(fv)
    assert region_bin == expected

    for fa in fv.alleles:
        assert (pd.variant_filename(fa) ==
                f"region_bin={region_bin}/variants_region_bin_{region_bin}"
                f".parquet")
def test_parquet_frequency_bin(fam1, gt, attributes, rare_boundary, expected):
    summary_alleles = [
        SummaryAllele("1", 11539, "T", None, 0, 0, attributes=attributes)
    ] * 3
    sv = SummaryVariant(summary_alleles)
    fv = FamilyVariant(sv, fam1, gt, None)
    pd = ParquetPartitionDescriptor(["1"], 1000, rare_boundary=rare_boundary)

    for fa in fv.alleles:
        assert pd._evaluate_frequency_bin(fa) == expected
        assert (pd.variant_filename(fa) ==
                f"region_bin=1_11/frequency_bin={expected}/"
                f"variants_region_bin_1_11_frequency_bin_{expected}.parquet")
def test_parquet_family_bin(fam1, fam2, gt):
    sv = SummaryVariant(summary_alleles_chr1)
    fv1 = FamilyVariant(sv, fam1, gt, None)
    fv2 = FamilyVariant(sv, fam2, gt, None)

    family_bin_size = 10
    pd = ParquetPartitionDescriptor(["1"], 1000, family_bin_size)
    for fa1, fa2 in zip(fv1.alleles, fv2.alleles):
        assert pd._evaluate_family_bin(fa1) == 9
        assert pd._evaluate_family_bin(fa2) == 6
        assert (pd.variant_filename(fa1) == "region_bin=1_11/family_bin=9/"
                "variants_region_bin_1_11_family_bin_9.parquet")
        assert (pd.variant_filename(fa2) == "region_bin=1_11/family_bin=6/"
                "variants_region_bin_1_11_family_bin_6.parquet")
def test_region_partition(vcf_variants_loaders, temp_dirname):
    fvars = vcf_variants_loaders("backends/partition")[0]

    partition_desc = ParquetPartitionDescriptor(["1", "2"],
                                                10000,
                                                root_dirname=temp_dirname)

    parquet_writer = VariantsParquetWriter(fvars, partition_desc)

    assert parquet_writer is not None

    parquet_writer.write_dataset()

    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_86"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_87"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_90"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_122"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_86"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_87"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_90"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_122"))
    assert os.path.exists(
        os.path.join(temp_dirname,
                     "region_bin=1_86/variants_region_bin_1_86.parquet"))
    assert os.path.exists(
        os.path.join(temp_dirname,
                     "region_bin=2_87/variants_region_bin_2_87.parquet"))
Example #5
def test_partition_descriptor(global_dae_fixtures_dir):
    pd_filename = (
        f"{global_dae_fixtures_dir}/"
        f"partition_descriptor/partition_description.conf"
    )
    pd = ParquetPartitionDescriptor.from_config(pd_filename)
    assert pd is not None
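
The tests load descriptors from .conf files but never show their contents.
One plausible shape, kept here as a Python string and mirroring the
constructor arguments used throughout these examples; the section and key
names are assumptions:

PARTITION_CONF_SKETCH = """
[region_bin]
chromosomes = 1, 2
region_length = 10000

[family_bin]
family_bin_size = 10

[frequency_bin]
rare_boundary = 30

[coding_bin]
coding_effect_types = missense, nonsense, synonymous, frame-shift
"""
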
def test_frequency_partition_3(vcf_variants_loaders, temp_dirname):
    fvars = vcf_variants_loaders("backends/partition")[0]

    partition_desc = ParquetPartitionDescriptor(["1", "2"],
                                                10000000,
                                                rare_boundary=100,
                                                root_dirname=temp_dirname)

    parquet_writer = VariantsParquetWriter(fvars, partition_desc)

    assert parquet_writer is not None

    parquet_writer.write_dataset()

    assert os.path.exists(
        os.path.join(temp_dirname, "region_bin=1_0", "frequency_bin=2"))
    assert os.path.exists(
        os.path.join(temp_dirname, "region_bin=2_0", "frequency_bin=2"))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_0",
            "frequency_bin=2",
            "variants_region_bin_1_0_frequency_bin_2.parquet",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_0",
            "frequency_bin=2",
            "variants_region_bin_2_0_frequency_bin_2.parquet",
        ))

def test_target_generator_del_chrom_prefix_target_chrom(
        region_length, targets, genomes_db_2019, mocker):

    mocker.patch.object(
        GenomicSequence,
        "get_all_chrom_lengths",
        return_value=[
            ("1", 100_000_000),
            ("2", 200_000_000),
            ("3", 300_000_000),
            ("4", 400_000_000),
        ],
    )

    partition_descriptor = ParquetPartitionDescriptor(["1", "2"],
                                                      region_length)

    generator = MakefilePartitionHelper(
        partition_descriptor,
        genomes_db_2019.get_genome(),
        del_chrom_prefix="chr",
    )
    print(generator.chromosome_lengths)
    assert len(generator.chromosome_lengths) == 4

    result = generator.generate_variants_targets(["1", "2"])
    print(result)
    assert set(result.keys()) == targets
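
The prefix helpers pair a partition descriptor's chromosome names with a
reference genome whose contig names may or may not carry a "chr" prefix. A
minimal sketch of the renaming implied by add_chrom_prefix /
del_chrom_prefix; the real MakefilePartitionHelper may do more than this:

def apply_chrom_prefix(chrom, add_prefix=None, del_prefix=None):
    # add_chrom_prefix: "1" -> "chr1"; del_chrom_prefix: "chr1" -> "1"
    if add_prefix is not None and not chrom.startswith(add_prefix):
        return f"{add_prefix}{chrom}"
    if del_prefix is not None and chrom.startswith(del_prefix):
        return chrom[len(del_prefix):]
    return chrom
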
def test_coding_partition_3(vcf_variants_loaders, temp_dirname):
    fvars = vcf_variants_loaders("backends/partition")[0]

    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"],
        10000000,
        coding_effect_types=["asdfghjkl"],
        root_dirname=temp_dirname,
    )

    parquet_writer = VariantsParquetWriter(fvars, partition_desc)

    assert parquet_writer is not None

    parquet_writer.write_dataset()

    assert os.path.exists(
        os.path.join(temp_dirname, "region_bin=1_0", "coding_bin=0"))
    assert os.path.exists(
        os.path.join(temp_dirname, "region_bin=2_0", "coding_bin=0"))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_0",
            "coding_bin=0",
            "variants_region_bin_1_0_coding_bin_0.parquet",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_0",
            "coding_bin=0",
            "variants_region_bin_2_0_coding_bin_0.parquet",
        ))

def test_makefile_generator_bucket_numbering(region_length, targets,
                                             genomes_db_2019, mocker):

    mocker.patch.object(
        GenomicSequence,
        "get_all_chrom_lengths",
        return_value=[
            ("chr1", 100_000_000),
            ("chr2", 200_000_000),
            ("chr3", 300_000_000),
            ("chr4", 400_000_000),
        ],
    )

    partition_descriptor = ParquetPartitionDescriptor(["chr1", "chr2"],
                                                      region_length)

    generator = MakefilePartitionHelper(
        partition_descriptor,
        genomes_db_2019.get_genome(),
        add_chrom_prefix="chr",
    )
    print(generator.chromosome_lengths)
    assert len(generator.chromosome_lengths) == 4

    for (region_bin, bucket_index) in targets:
        assert bucket_index == generator.bucket_index(region_bin)

def test_makefile_generator_regions_add_chrom_prefix(region_length, targets,
                                                     genomes_db_2019, mocker):

    mocker.patch.object(
        GenomicSequence,
        "get_all_chrom_lengths",
        return_value=[
            ("chr1", 100_000_000),
            ("chr2", 200_000_000),
            ("chr3", 300_000_000),
            ("chr4", 400_000_000),
        ],
    )

    partition_descriptor = ParquetPartitionDescriptor(["chr1", "chr2"],
                                                      region_length)

    generator = MakefilePartitionHelper(
        partition_descriptor,
        genomes_db_2019.get_genome(),
        add_chrom_prefix="chr",
    )

    print(generator.chromosome_lengths)
    assert len(generator.chromosome_lengths) == 4

    variants_targets = generator.generate_variants_targets(
        ["1", "2", "3", "4"])

    for (region_bin, regions) in targets:
        assert region_bin in variants_targets
        assert regions == variants_targets[region_bin]
Example #11
def test_denovo2parquet_denovo_partition(
        fixture_dirname, dae_denovo_config, temp_dirname):

    partition_description = fixture_dirname(
        "backends/example_partition_configuration.conf"
    )

    argv = [
        "--ped-file-format",
        "simple",
        "--pd",
        partition_description,
        "-o",
        temp_dirname,
        dae_denovo_config.family_filename,
        dae_denovo_config.denovo_filename,
    ]

    main(argv)

    pd = ParquetPartitionDescriptor.from_config(partition_description)
    file_glob = os.path.join(temp_dirname, pd.generate_file_access_glob())
    partition_files = glob.glob(file_glob)

    assert len(partition_files) == 5
    for file in partition_files:
        assert "frequency_bin=0" in file
Example #12
    def impala_load_dataset(self, study_id, variants_dir, pedigree_file):
        if variants_dir is None:
            partition_description = None
            variants_schema = None
        else:
            partition_config_file = os.path.join(variants_dir,
                                                 "_PARTITION_DESCRIPTION")
            if os.path.exists(partition_config_file):
                partition_description = ParquetPartitionDescriptor.from_config(
                    partition_config_file, root_dirname=variants_dir)
            else:
                partition_description = NoPartitionDescriptor(
                    root_dirname=variants_dir)

            variants_schema_file = os.path.join(variants_dir,
                                                "_VARIANTS_SCHEMA")
            variants_schema = None
            if os.path.exists(variants_schema_file):
                with open(variants_schema_file, "rt") as infile:
                    content = infile.read()
                    schema = toml.loads(content)
                    variants_schema = schema["variants_schema"]

        variants_hdfs_dir, variants_hdfs_path, pedigree_hdfs_path = \
            self.hdfs_upload_dataset(
                study_id, variants_dir, pedigree_file, partition_description)

        return self.impala_import_dataset(
            study_id,
            pedigree_hdfs_path,
            variants_hdfs_dir,
            partition_description=partition_description,
            variants_schema=variants_schema,
            variants_sample=variants_hdfs_path)
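
Both this method and the import tool below read a _VARIANTS_SCHEMA TOML file
and keep its "variants_schema" key. One plausible shape for that file; the
column names and type strings are illustrative assumptions:

import toml

VARIANTS_SCHEMA_SKETCH = """
[variants_schema]
chromosome = "string"
position = "int32"
reference = "string"
alternative = "string"
"""

variants_schema = toml.loads(VARIANTS_SCHEMA_SKETCH)["variants_schema"]
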
def test_target_generator_region_bins_count(region_length, chrom, bins_count,
                                            genomes_db_2019):

    partition_descriptor = ParquetPartitionDescriptor(["1", "2"],
                                                      region_length)

    generator = MakefilePartitionHelper(partition_descriptor,
                                        genomes_db_2019.get_genome())
    assert generator is not None
    assert generator.region_bins_count(chrom) == bins_count
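
region_bins_count presumably counts how many region_length windows cover the
chromosome, i.e. a ceiling division of the chromosome length. A one-line
sketch under that assumption:

def region_bins_count_sketch(chrom_length, region_length):
    # ceil(chrom_length / region_length) without floating point
    return -(-chrom_length // region_length)
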
def test_target_generator_chrom_1(region_length, targets, genomes_db_2019):

    partition_descriptor = ParquetPartitionDescriptor(["1", "2"],
                                                      region_length)

    generator = MakefilePartitionHelper(partition_descriptor,
                                        genomes_db_2019.get_genome())

    result = generator.generate_variants_targets(["1"])
    print(result)
    assert set(result.keys()) == targets
def test_target_generator_other_0(region_length, target_chroms, targets,
                                  genomes_db_2019):

    partition_descriptor = ParquetPartitionDescriptor(["1", "2"],
                                                      region_length)

    generator = MakefilePartitionHelper(partition_descriptor,
                                        genomes_db_2019.get_genome())

    result = generator.generate_variants_targets(target_chroms)
    print(result)
    assert result["other_0"] == targets

def test_parquet_coding_bin(fam1, gt, eff1, eff2, eff3, coding_effect_types,
                            expected):
    summary_alleles = [
        SummaryAllele("1", 11539, "T", None, 0, 0),
        SummaryAllele(
            "1", 11539, "T", "G", 0, 1, attributes={"effects": eff1}),
        SummaryAllele(
            "1", 11539, "T", "C", 0, 2, attributes={"effects": eff2}),
        SummaryAllele(
            "1", 11539, "T", "A", 0, 3, attributes={"effects": eff3}),
    ]
    # Override the gt fixture so each member carries one of the three
    # alternative alleles.
    gt = np.array([[0, 1, 0], [2, 0, 3]], dtype="int8")
    sv = SummaryVariant(summary_alleles)
    fv = FamilyVariant(sv, fam1, gt, None)
    pd = ParquetPartitionDescriptor(["1"],
                                    1000,
                                    coding_effect_types=coding_effect_types)
    for fa, ex in zip(fv.alleles, expected):
        assert pd._evaluate_coding_bin(fa) == ex
        assert (pd.variant_filename(fa) ==
                f"region_bin=1_11/coding_bin={ex}/"
                f"variants_region_bin_1_11_coding_bin_{ex}.parquet")
def test_target_generator_region_bins(region_length, chrom, targets,
                                      genomes_db_2019):

    partition_descriptor = ParquetPartitionDescriptor(["1", "2"],
                                                      region_length)

    generator = MakefilePartitionHelper(partition_descriptor,
                                        genomes_db_2019.get_genome())

    assert generator is not None
    result = generator.generate_chrom_targets(chrom)
    print(result)
    assert targets == result

def test_coding_partition_2(vcf_variants_loaders, temp_dirname):
    fvars = vcf_variants_loaders("backends/partition")[0]

    temp_dirname = "/tmp/dataset-partition-test"

    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"],
        10000000,
        coding_effect_types=[
            "missense",
            "nonsense",
            "synonymous",
            "frame-shift",
        ],
        root_dirname=temp_dirname,
    )

    parquet_writer = VariantsParquetWriter(fvars, partition_desc)

    assert parquet_writer is not None

    parquet_writer.write_dataset()

    assert os.path.exists(
        os.path.join(temp_dirname, "region_bin=1_0", "coding_bin=1"))
    assert os.path.exists(
        os.path.join(temp_dirname, "region_bin=2_0", "coding_bin=0"))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_0",
            "coding_bin=1",
            "variants_region_bin_1_0_coding_bin_1.parquet",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_0",
            "coding_bin=0",
            "variants_region_bin_2_0_coding_bin_0.parquet",
        ))
Example #19
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    partition_descriptor = None
    if argv.variants and os.path.exists(argv.variants):
        partition_config_file = os.path.join(argv.variants,
                                             "_PARTITION_DESCRIPTION")

        if os.path.isdir(argv.variants) and \
                os.path.exists(partition_config_file):
            partition_descriptor = ParquetPartitionDescriptor.from_config(
                partition_config_file, root_dirname=argv.variants)

    if partition_descriptor is None:
        partition_descriptor = NoPartitionDescriptor(
            root_dirname=argv.variants)

    genotype_storage.hdfs_upload_dataset(argv.study_id, argv.variants,
                                         argv.pedigree, partition_descriptor)

def test_region_partition_small_region(vcf_variants_loaders, temp_dirname):
    fvars = vcf_variants_loaders("backends/partition")[0]

    partition_desc = ParquetPartitionDescriptor(["1", "2"],
                                                10,
                                                root_dirname=temp_dirname)

    parquet_writer = VariantsParquetWriter(fvars, partition_desc)

    assert parquet_writer is not None

    parquet_writer.write_dataset()

    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_86558"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_86562"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_86566"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_86569"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_87810"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_90192"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_90595"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=1_122251"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_86558"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_86562"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_86566"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_86569"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_87810"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_90192"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_90595"))
    assert os.path.exists(os.path.join(temp_dirname, "region_bin=2_122251"))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_90595/variants_region_bin_1_90595.parquet",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_122251/variants_region_bin_2_122251.parquet",
        ))

def test_region_family_frequency(vcf_variants_loaders, temp_dirname):
    fvars = vcf_variants_loaders("backends/partition")[0]

    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"],
        100000,
        family_bin_size=100,
        rare_boundary=30,
        root_dirname=temp_dirname,
    )

    parquet_writer = VariantsParquetWriter(fvars, partition_desc)

    assert parquet_writer is not None

    parquet_writer.write_dataset()

    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_8",
            "frequency_bin=2",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_8",
            "frequency_bin=2",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_9",
            "frequency_bin=2",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_9",
            "frequency_bin=2",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_12",
            "frequency_bin=3",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_12",
            "frequency_bin=3",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_8",
            "frequency_bin=2",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_8",
            "frequency_bin=2",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_9",
            "frequency_bin=2",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_9",
            "frequency_bin=2",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_12",
            "frequency_bin=3",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_12",
            "frequency_bin=3",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_9",
            "frequency_bin=2",
            "family_bin=6",
            "variants_region_bin_1_9_frequency_bin_2_family_bin_6.parquet",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_12",
            "frequency_bin=3",
            "family_bin=6",
            "variants_region_bin_2_12_frequency_bin_3_family_bin_6.parquet",
        ))

def test_all(vcf_variants_loaders, temp_dirname):
    fvars = vcf_variants_loaders("backends/partition")[0]

    partition_desc = ParquetPartitionDescriptor(
        ["1", "2"],
        100000,
        family_bin_size=100,
        coding_effect_types=[
            "missense",
            "nonsense",
            "frame-shift",
            "synonymous",
        ],
        rare_boundary=30,
        root_dirname=temp_dirname,
    )

    parquet_writer = VariantsParquetWriter(fvars, partition_desc)

    assert parquet_writer is not None

    parquet_writer.write_dataset()

    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_8",
            "frequency_bin=2",
            "coding_bin=1",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_8",
            "frequency_bin=2",
            "coding_bin=1",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_9",
            "frequency_bin=2",
            "coding_bin=1",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_9",
            "frequency_bin=2",
            "coding_bin=1",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_12",
            "frequency_bin=3",
            "coding_bin=1",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_12",
            "frequency_bin=3",
            "coding_bin=1",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_8",
            "frequency_bin=2",
            "coding_bin=0",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_8",
            "frequency_bin=2",
            "coding_bin=0",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_9",
            "frequency_bin=2",
            "coding_bin=0",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_9",
            "frequency_bin=2",
            "coding_bin=0",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_12",
            "frequency_bin=3",
            "coding_bin=0",
            "family_bin=6",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_12",
            "frequency_bin=3",
            "coding_bin=0",
            "family_bin=69",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=1_9",
            "frequency_bin=2",
            "coding_bin=1",
            "family_bin=6",
            "variants_region_bin_1_9_frequency_bin_2_coding_bin_1"
            "_family_bin_6.parquet",
        ))
    assert os.path.exists(
        os.path.join(
            temp_dirname,
            "region_bin=2_12",
            "frequency_bin=3",
            "coding_bin=0",
            "family_bin=6",
            "variants_region_bin_2_12_frequency_bin_3_coding_bin_0"
            "_family_bin_6.parquet",
        ))
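
test_all exercises every partition key at once and fixes the nesting order:
region_bin, then frequency_bin, then coding_bin, then family_bin, with the
flat filename repeating the same keys. A quick check of that composition
with the hypothetical variant_filename_sketch defined near the top of this
section:

assert variant_filename_sketch({
    "region_bin": "2_12",
    "frequency_bin": 3,
    "coding_bin": 0,
    "family_bin": 6,
}) == (
    "region_bin=2_12/frequency_bin=3/coding_bin=0/family_bin=6/"
    "variants_region_bin_2_12_frequency_bin_3_coding_bin_0"
    "_family_bin_6.parquet"
)
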
Example #23
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)

    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    study_id = argv.study_id

    if argv.variants is not None:
        hdfs_variants_dir = argv.variants
    elif argv.variants_sample or argv.variants_schema:
        hdfs_variants_dir = \
            genotype_storage.default_variants_hdfs_dirname(study_id)
        # if not genotype_storage.hdfs_helpers.exists(hdfs_variants_dir):
        #     hdfs_variants_dir = None
    else:
        hdfs_variants_dir = None

    if argv.pedigree is not None:
        hdfs_pedigree_file = argv.pedigree
    else:
        hdfs_pedigree_file = \
            genotype_storage.default_pedigree_hdfs_filename(study_id)

    logger.info(f"HDFS variants dir: {hdfs_variants_dir}")
    logger.info(f"HDFS pedigree file: {hdfs_pedigree_file}")

    partition_config_file = None
    if argv.partition_description is not None:
        partition_config_file = argv.partition_description
        assert os.path.isfile(partition_config_file), partition_config_file
    logger.info(f"partition_config_file: {partition_config_file}")

    if partition_config_file is not None and \
            os.path.isfile(partition_config_file):
        partition_description = ParquetPartitionDescriptor.from_config(
            partition_config_file)
    else:
        partition_description = NoPartitionDescriptor()

    variants_schema = None
    if argv.variants_schema is not None:
        assert os.path.exists(argv.variants_schema), argv.variants_schema
        assert os.path.isfile(argv.variants_schema), argv.variants_schema
        with open(argv.variants_schema) as infile:
            content = infile.read()
            schema = toml.loads(content)
            variants_schema = schema["variants_schema"]

    genotype_storage.impala_import_dataset(
        argv.study_id,
        hdfs_pedigree_file,
        hdfs_variants_dir,
        partition_description=partition_description,
        variants_sample=argv.variants_sample,
        variants_schema=variants_schema)
Example #24
def main(argv):
    parser = argparse.ArgumentParser()

    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)
    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).parquet])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--study-id",
        type=str,
        default=None,
        dest="study_id",
        metavar="<study id>",
        help="Study ID. "
        "If none specified, the basename of families filename is used to "
        "construct study id [default: basename(families filename)]",
    )
    argv = parser.parse_args(argv)
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    filename, params = FamiliesLoader.parse_cli_arguments(argv)
    if argv.study_id is not None:
        study_id = argv.study_id
    else:
        study_id, _ = os.path.splitext(os.path.basename(filename))

    loader = FamiliesLoader(filename, **params)
    families = loader.load()

    if argv.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            argv.partition_description
        )
        if partition_description.family_bin_size > 0:
            families = partition_description \
                .add_family_bins_to_families(families)

    if not argv.output_filename:
        output_filename, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{output_filename}.parquet"
    else:
        output_filename = argv.output_filename

    ParquetManager.families_to_parquet(families, output_filename)
Example #25
def main(argv, gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)
    VcfLoader.cli_arguments(parser, options_only=True)

    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).ped])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--vcf-files",
        type=str,
        nargs="+",
        metavar="<VCF filename>",
        help="VCF file to import",
    )

    argv = parser.parse_args(argv)
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    filename, params = FamiliesLoader.parse_cli_arguments(argv)
    logger.info(F"PED PARAMS: {params}")

    loader = FamiliesLoader(filename, **params)
    families = loader.load()

    if argv.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            argv.partition_description)
        families = partition_description.add_family_bins_to_families(families)

    variants_filenames, variants_params = \
        VcfLoader.parse_cli_arguments(argv)

    if variants_filenames:
        variants_loader = VcfLoader(
            families,
            variants_filenames,
            params=variants_params,
            genome=gpf_instance.genomes_db.get_genome(),
        )

        families = variants_loader.families

    if families.broken_families:
        for family_id, family in families.broken_families.items():
            if not family.has_members():
                del families[family_id]
                logger.warning(
                    f"family {family_id} does not contain sequenced members "
                    f"and is removed from the pedigree: {family}")

    if not argv.output_filename:
        output_filename, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{output_filename}.ped"
    else:
        output_filename = argv.output_filename

    FamiliesLoader.save_pedigree(families, output_filename)