Beispiel #1
0
def cnv_impala(
        request,
        cnv_loader,
        genomes_db_2013,
        hdfs_host,
        impala_host,
        impala_genotype_storage,
        reimport,
        cleanup,
        data_import):
    """Return the Impala backend for the ``cnv_test`` study.

    The study is imported through ``impala_genotype_storage`` first when
    ``reimport`` is truthy or when either of its tables is not found in
    ``impala_test_db``.

    ``request``, ``cleanup`` and ``data_import`` are not referenced in the
    body; presumably they are requested only for fixture ordering - confirm.
    """
    from dae.backends.impala.impala_helpers import ImpalaHelpers

    study_id = "cnv_test"
    study_config = FrozenBox({"id": study_id})

    helpers = ImpalaHelpers(impala_hosts=[impala_host], impala_port=21050)

    variant_table, pedigree_table = \
        impala_genotype_storage.study_tables(study_config)

    # ``all`` stops at the first missing table, and nothing is checked at
    # all when a reimport is forced.
    if reimport or not all(
            helpers.check_table("impala_test_db", table)
            for table in (variant_table, pedigree_table)):

        from dae.backends.impala.hdfs_helpers import HdfsHelpers

        hdfs_helpers = HdfsHelpers(hdfs_host, 8020)

        import_dir = hdfs_helpers.tempdir(prefix="variants_", suffix="_data")
        hdfs_helpers.mkdir(import_dir)

        families_loader, variants_loader = cnv_loader
        impala_genotype_storage.simple_study_import(
            study_id,
            families_loader=families_loader,
            variant_loaders=[variants_loader],
            output=import_dir,
            include_reference=True)

    return impala_genotype_storage.build_backend(
        study_config, genomes_db_2013)
Beispiel #2
0
 def builder(path):
     """Build the Impala backend for the study named after ``path``.

     The study id is the last path component of ``path``.
     """
     study_config = FrozenBox({"id": os.path.basename(path)})
     return impala_genotype_storage.build_backend(
         study_config,
         genomes_db_2013,
     )
Beispiel #3
0
def test_get_gene_weights(mocker):
    """``get_gene_weights`` returns whatever ``GeneWeight.get_genes`` yields.

    ``GeneWeight.__init__`` and ``GeneWeight.get_genes`` are both patched,
    so no real gene weight data is involved.
    """
    mocker.patch.object(GeneWeight, "__init__", return_value=None)
    mocker.patch.object(
        GeneWeight, "get_genes", return_value="mock_return_value")

    enrichment_config = FrozenBox({"testWeight": "mock value"})
    weights_query = {
        "weight": "testWeight",
        "rangeStart": 12,
        "rangeEnd": 34,
    }

    result = GeneSymsMixin.get_gene_weights(
        enrichment_config, geneWeights=weights_query)

    assert result == "mock_return_value"
Beispiel #4
0
    def __init__(self, study_id, rest_client):
        """Initialise a remote study from the config served by ``rest_client``.

        The dataset config for ``study_id`` is fetched through
        ``rest_client.get_dataset_config``. Its ``id``, ``name``, ``parents``
        and ``studies`` entries are rewritten with the client's
        ``prefix_remote_*`` helpers before the base class is initialised
        with a frozen copy of the config. Families and person set
        collections are built last.

        :param study_id: identifier of the study on the remote instance.
        :param rest_client: client used to fetch and prefix the remote config.
        """
        self._remote_study_id = study_id
        self.rest_client = rest_client

        config = self.rest_client.get_dataset_config(self._remote_study_id)
        config["id"] = self.rest_client.prefix_remote_identifier(study_id)
        config["name"] = self.rest_client.prefix_remote_name(
            config.get("name", self._remote_study_id))

        # Use ``.get`` so a config without a "parents" entry does not raise
        # KeyError; this mirrors how "studies" is handled below.
        parents = config.get("parents")
        if parents:
            config["parents"] = [
                self.rest_client.prefix_remote_identifier(parent)
                for parent in parents
            ]
            self._parents = list(config["parents"])

        self._study_ids = []
        studies = config.get("studies")
        if studies:
            config["studies"] = [
                self.rest_client.prefix_remote_identifier(study)
                for study in studies
            ]
            self._study_ids = config["studies"]

        super().__init__(FrozenBox(config), [self])

        self.is_remote = True

        # Both caches start empty and are filled by the build calls below.
        self._families = None
        self._build_families()

        self._person_set_collections = None
        self._build_person_set_collections()
0
def iossifov_2014_local_config(remote_studies_dir, remote_dir):
    """Return the frozen iossifov_2014 study config with local data paths.

    The study's TOML config is read from
    ``<remote_studies_dir>/iossifov_2014/iossifov_2014.conf``. Its pedigree
    path and first variants path are pointed at files in the study's
    ``data`` directory. The result is merged over
    ``<remote_dir>/defaultConfiguration.conf`` with
    ``recursive_dict_update``.
    """
    study_dir = remote_studies_dir.joinpath("iossifov_2014")

    with open(study_dir.joinpath("iossifov_2014.conf"), "r") as study_file:
        study_config = toml.loads(study_file.read())

    storage_files = study_config["genotype_storage"]["files"]
    storage_files["pedigree"]["path"] = str(
        study_dir.joinpath("data", "IossifovWE2014.ped"))
    storage_files["variants"][0]["path"] = str(
        study_dir.joinpath("data", "IossifovWE2014.tsv"))

    defaults_path = remote_dir.joinpath("defaultConfiguration.conf")
    with open(defaults_path, "r") as defaults_file:
        defaults = toml.loads(defaults_file.read())

    return FrozenBox(recursive_dict_update(defaults, study_config))
Beispiel #6
0
def test_tabix_region_strictness():
    """The tabix reader must skip variants that start before the region."""
    # long_variant.vcf.gz holds 6 variants located before the region
    # 4:47788570, plus 1 variant that starts before the region but
    # overlaps it because of its length. All 7 are expected to be omitted.
    filename = relative_to_this_test_folder("fixtures/long_variant.vcf.gz")

    options = FrozenBox({
        "vcf": True,
        "c": "CHROM",
        "p": "POS",
        "r": "REF",
        "a": "ALT",
        "region": "4:47788570",
    })

    with TSVGzipReader(options, filename) as gzip_reader:
        assert gzip_reader is not None

        total_lines = sum(1 for _ in gzip_reader.lines_read_iterator())

    with TabixReaderVariants(options, filename) as tabix_reader:
        assert tabix_reader is not None

        region_lines = 0
        for record in tabix_reader.lines_read_iterator():
            print(record)
            region_lines += 1
        assert (total_lines - region_lines) == 7
Beispiel #7
0
    def read_and_parse_file_configuration(cls, options, config_file):
        """Load ``config_file`` and return the parsed, frozen pipeline config.

        The file is loaded against ``annotation_conf_schema``. The given
        ``options`` and empty column containers are added, and defaults are
        applied through ``cls._setup_defaults``. Every section that has an
        annotator is then parsed with ``cls.parse_section``; sections whose
        ``annotator`` is ``None`` are dropped.
        """
        raw_config = GPFConfigParser.load_config(
            config_file, annotation_conf_schema
        ).to_dict()

        raw_config.update({
            "options": options,
            "columns": {},
            "native_columns": [],
            "virtual_columns": [],
            "output_columns": [],
        })

        config = cls._setup_defaults(DefaultBox(raw_config))

        # Each section gets the global options as a base, overridden by the
        # section's own values, before it is parsed.
        config["sections"] = [
            cls.parse_section(
                recursive_dict_update(
                    {"options": options}, section.to_dict()
                )
            )
            for section in config.sections
            if section.annotator is not None
        ]

        return FrozenBox(config)
Beispiel #8
0
    def build(options, config_file, genomes_db):
        """Assemble a ``PipelineAnnotator`` from an annotation config file.

        One annotator is created per configured section and added to the
        pipeline. The pipeline's ``output_columns`` are extended with each
        annotator's output columns that were not collected before.
        """
        parsed_config = \
            AnnotationConfigParser.read_and_parse_file_configuration(
                options, config_file
            )
        assert parsed_config.sections

        pipeline = PipelineAnnotator(parsed_config, genomes_db)
        collected_columns = list(pipeline.config.output_columns)

        for section in parsed_config.sections:
            annotator = AnnotatorFactory.make_annotator(section, genomes_db)
            pipeline.add_annotator(annotator)

            # The filter is evaluated in full before anything is appended.
            unseen_columns = [
                column for column in annotator.config.output_columns
                if column not in collected_columns
            ]
            collected_columns += unseen_columns

        # FIXME
        # Hack: the config is an otherwise frozen Box, so the
        # "output_columns" key is changed by rebuilding the config from a
        # plain dict. Fix properly when the annotation pipeline module is
        # refactored.
        unfrozen_config = pipeline.config.to_dict()
        unfrozen_config["output_columns"] = collected_columns
        pipeline.config = FrozenBox(unfrozen_config)

        return pipeline
Beispiel #9
0
def build_families_report(families):
    """Create a ``FamiliesReport`` grouped by affected status.

    A single "status" person set collection is built from ``families``
    using the pedigree ``status`` column and passed to the report.
    """
    affected_set = {
        "id": "affected",
        "name": "affected",
        "values": ["affected"],
        "color": "#e35252"
    }
    unaffected_set = {
        "id": "unaffected",
        "name": "unaffected",
        "values": ["unaffected"],
        "color": "#ffffff"
    }
    fallback_set = {
        "id": "unspecified",
        "name": "unspecified",
        "color": "#aaaaaa"
    }

    collection_config = FrozenBox({
        "id": "status",
        "name": "Affected status",
        "domain": [affected_set, unaffected_set],
        "default": fallback_set,
        "sources": [{"from": "pedigree", "column": "status"}],
    })

    status_collection = PersonSetCollection.from_families(
        collection_config, families)
    return FamiliesReport(families, [status_collection])
Beispiel #10
0
def phenotype_person_sets(variants_impl):
    """Build the "phenotype" person set collection for ``backends/a``.

    Families come from the ``variants_impala`` implementation of the
    ``backends/a`` study. The collection is derived from the pedigree
    ``status`` source.
    """
    backend = variants_impl("variants_impala")("backends/a")

    autism_set = {
        "id": "autism",
        "name": "Autism",
        "values": ["affected"],
        "color": "#ff0000"
    }
    unaffected_set = {
        "id": "unaffected",
        "name": "Unaffected",
        "values": ["unaffected"],
        "color": "#0000ff",
    }
    collection_config = FrozenBox({
        "id": "phenotype",
        "sources": [{"from": "pedigree", "source": "status"}],
        "default": {
            "id": "unknown",
            "name": "Unknown",
            "color": "#aaaaaa",
        },
        "domain": [autism_set, unaffected_set],
    })

    collection = PersonSetCollection.from_families(
        collection_config, backend.families)
    assert collection is not None
    return collection
Beispiel #11
0
def loadNCBIGeneInfo(config):
    """Return a frozen copy of ``config`` enriched with parsed gene info.

    ``config.gene_info`` is parsed with ``_parseNCBIGeneInfo``. The
    resulting ``genes`` and ``ns_tokens`` are stored under the
    ``gene_info`` key of the returned config.
    """
    genes, ns_tokens = _parseNCBIGeneInfo(config.gene_info)

    enriched = config.to_dict()
    gene_info = enriched.setdefault("gene_info", {})
    gene_info["genes"] = genes
    gene_info["ns_tokens"] = ns_tokens

    return FrozenBox(enriched)
Beispiel #12
0
def vcf_io(request):
    """Return a TSV-to-TSV ``IOManager`` reading ``fixtures/vcf_input.tsv``.

    The output file is configured as ``"-"``.
    """
    infile = relative_to_this_test_folder("fixtures/vcf_input.tsv")
    io_config = FrozenBox({"infile": infile, "outfile": "-"})
    return IOManager(io_config, IOType.TSV, IOType.TSV)
Beispiel #13
0
 def build(fixture_name, io_options=None):
     """Return a TSV-to-TSV ``IOManager`` for the given fixture file.

     :param fixture_name: fixture path, resolved with
         ``relative_to_this_test_folder``.
     :param io_options: optional mapping of extra IO options. It is copied,
         never modified; ``infile`` and ``outfile`` always override any
         values it carries.
     :return: ``IOManager`` configured with the merged, frozen options.
     """
     # Work on a private copy. The previous mutable default (``dict()``) was
     # updated in place, so state leaked between calls and a caller's dict
     # was silently modified.
     merged_options = {} if io_options is None else dict(io_options)
     merged_options.update({
         "infile": relative_to_this_test_folder(fixture_name),
         "outfile": "-",
     })
     return IOManager(FrozenBox(merged_options), IOType.TSV, IOType.TSV)
Beispiel #14
0
def variants_io_m(request):
    """Return a TSV-to-TSV ``IOManager`` reading ``fixtures/input_multi.tsv``.

    The output file is configured as ``"-"``.
    """
    infile = relative_to_this_test_folder("fixtures/input_multi.tsv")
    frozen_config = FrozenBox({"infile": infile, "outfile": "-"})
    return IOManager(frozen_config, IOType.TSV, IOType.TSV)
Beispiel #15
0
def test_tabix_reader_header(filename):
    """A tabix reader opened with empty options exposes four column names."""
    fixture_path = relative_to_this_test_folder(filename)
    empty_options = FrozenBox({})

    with TabixReaderVariants(empty_options, fixture_path) as tabix_reader:
        assert tabix_reader is not None

        column_names = tabix_reader.schema.col_names
        assert column_names is not None

        assert len(tabix_reader.schema.col_names) == 4
Beispiel #16
0
def from_prefix_denovo(prefix):
    """Return the frozen denovo file config derived from ``prefix``.

    The variants file is ``<prefix>.txt`` and the families file is
    ``<prefix>_families.ped``.
    """
    return FrozenBox({
        "denovo": {
            "denovo_filename": f"{prefix}.txt",
            "family_filename": f"{prefix}_families.ped",
        }
    })
Beispiel #17
0
def iossifov2014_impala(request, iossifov2014_loader, genomes_db_2013,
                        hdfs_host, impala_host, impala_genotype_storage,
                        reimport):
    """Return the Impala backend for the ``iossifov_we2014_test`` study.

    The study is imported first when ``reimport`` is truthy or when either
    of its tables is not found in ``impala_test_db``. ``request`` is not
    referenced in the body.
    """
    from dae.backends.impala.impala_helpers import ImpalaHelpers

    study_id = "iossifov_we2014_test"
    study_config = FrozenBox({"id": study_id})

    helpers = ImpalaHelpers(impala_hosts=[impala_host], impala_port=21050)

    variant_table, pedigree_table = \
        impala_genotype_storage.study_tables(study_config)

    # ``all`` stops at the first missing table, and nothing is checked at
    # all when a reimport is forced.
    if reimport or not all(
            helpers.check_table("impala_test_db", table)
            for table in (variant_table, pedigree_table)):

        from dae.backends.impala.hdfs_helpers import HdfsHelpers

        hdfs_helpers = HdfsHelpers(hdfs_host, 8020)

        import_root = hdfs_helpers.tempdir(
            prefix="variants_", suffix="_data")
        hdfs_helpers.mkdir(import_root)

        study_output_dir = os.path.join(import_root, study_id)
        variants_loader, families_loader = iossifov2014_loader

        impala_genotype_storage.simple_study_import(
            study_id,
            families_loader=families_loader,
            variant_loaders=[variants_loader],
            output=study_output_dir)

    return impala_genotype_storage.build_backend(
        study_config, genomes_db_2013)
Beispiel #18
0
def from_prefix_dae(prefix):
    """Return the frozen DAE file config derived from ``prefix``.

    The file names are ``<prefix>.txt.gz`` (summary),
    ``<prefix>-TOOMANY.txt.gz`` (too many) and ``<prefix>.families.txt``
    (families).
    """
    return FrozenBox({
        "dae": {
            "summary_filename": f"{prefix}.txt.gz",
            "toomany_filename": f"{prefix}-TOOMANY.txt.gz",
            "family_filename": f"{prefix}.families.txt",
        }
    })
Beispiel #19
0
def test_tsv_reader(filename, header, linecount):
    """``TSVReader`` exposes the expected header and counts every line.

    :param filename: fixture path relative to this test folder.
    :param header: expected column names of the reader schema.
    :param linecount: expected ``reader.linecount`` after a full read.
    """
    infilename = relative_to_this_test_folder(filename)
    # The result of this check used to be discarded; assert it so that a
    # missing fixture fails here with a clear message.
    assert os.path.exists(infilename)

    options = FrozenBox({"region": None, "no_header": None})

    with TSVReader(options, filename=infilename) as reader:
        assert reader is not None
        print(reader.schema.col_names)
        assert reader.schema.col_names == header

        # Consume the whole file so that ``linecount`` is final.
        for line in reader.lines_read_iterator():
            print(line)
        assert reader.linecount == linecount
Beispiel #20
0
def test_create_file_io():
    """``IOManager`` reads 4 lines and a 3-column header from input.tsv."""
    infile = relative_to_this_test_folder("fixtures/input.tsv")
    io_config = FrozenBox({"infile": infile, "outfile": "-"})

    with IOManager(io_config, IOType.TSV, IOType.TSV) as io_manager:
        assert io_manager is not None

        rows = [*io_manager.lines_read_iterator()]
        print(rows)
        assert len(rows) == 4

        print(io_manager.header)
        assert len(io_manager.header) == 3
Beispiel #21
0
def test_get_gene_weights_query():
    """``get_gene_weights_query`` unpacks weight name and range bounds."""
    enrichment_config = FrozenBox({"testWeight": "mock value"})
    weights_query = {
        "weight": "testWeight",
        "rangeStart": 12,
        "rangeEnd": 34,
    }

    result = GeneSymsMixin.get_gene_weights_query(
        enrichment_config, geneWeights=weights_query)

    assert result == ("testWeight", 12, 34)
Beispiel #22
0
def from_prefix_vcf(prefix):
    """Return the frozen VCF study file config derived from ``prefix``.

    ``<prefix>.ped`` must exist (asserted). ``vcf`` is set to
    ``<prefix>.vcf`` if that file exists, otherwise to ``<prefix>.vcf.gz``
    if that exists. ``denovo`` is set to ``<prefix>.tsv`` if it exists.
    """
    pedigree_path = f"{prefix}.ped"
    assert os.path.exists(pedigree_path)

    conf = {"prefix": prefix, "pedigree": pedigree_path}

    # The plain VCF takes precedence over the gzipped one.
    for vcf_candidate in (f"{prefix}.vcf", f"{prefix}.vcf.gz"):
        if os.path.exists(vcf_candidate):
            conf["vcf"] = vcf_candidate
            break

    denovo_path = f"{prefix}.tsv"
    if os.path.exists(denovo_path):
        conf["denovo"] = denovo_path

    return FrozenBox(conf)
Beispiel #23
0
    def annotate_file(self, file_io_manager):
        """
            Annotate every line read through `file_io_manager` and write
            the annotated lines back through it.

            Lines whose first field contains "#" are written out unchanged.
            Errors raised while annotating a line are logged and the line
            is still written.
        """
        self.schema = deepcopy(file_io_manager.reader.schema)
        self.collect_annotator_schema(self.schema)
        file_io_manager.writer.schema = self.schema

        mapper = LineMapper(file_io_manager.header)

        if self.mode == "replace":
            kept_columns = [
                column for column in self.schema.columns
                if column not in self.config.virtual_columns
            ]

            # FIXME
            # Hack to change output_columns: the FrozenBox instances in
            # "sections" do not allow changing attributes the standard way
            # through recursive_dict_update, so the config is rebuilt from
            # a plain dict.
            replaced_config = self.config.to_dict()
            replaced_config["output_columns"] = kept_columns
            self.config = FrozenBox(replaced_config)

        file_io_manager.header_write(self.config.output_columns)

        for raw_line in file_io_manager.lines_read_iterator():
            # TODO How will additional headers behave
            # with column type support (and coercion)?
            is_extra_header = "#" in raw_line[0]
            if is_extra_header:
                file_io_manager.line_write(raw_line)
                continue

            mapped_line = mapper.map(raw_line)

            try:
                self.line_annotation(mapped_line)
            except Exception as ex:
                # Best effort: report the failure and keep going.
                logger.error(f"problems annotating line: {raw_line}")
                logger.error(f"{mapped_line}")
                logger.error(f"{ex}")
                traceback.print_exc(file=sys.stderr)

            output_line = self.build_output_line(mapped_line)
            file_io_manager.line_write(output_line)
Beispiel #24
0
def impala_genotype_storage(hdfs_host, impala_host):
    """Return an ``ImpalaGenotypeStorage`` configured for the test hosts.

    The storage id is ``impala_test_storage`` and the Impala database name
    comes from ``impala_test_dbname()``.
    """
    storage_id = "impala_test_storage"

    impala_section = {
        "hosts": [impala_host],
        "port": 21050,
        "db": impala_test_dbname(),
        "pool_size": 5,
    }
    hdfs_section = {
        "host": hdfs_host,
        "port": 8020,
        "base_dir": "/tmp/test_data"
    }

    storage_config = FrozenBox({
        "id": storage_id,
        "type": "impala",
        "dir": "/tmp",
        "impala": impala_section,
        "hdfs": hdfs_section,
    })

    return ImpalaGenotypeStorage(storage_config, storage_id)
Beispiel #25
0
def extra_attrs_impala(
        request,
        denovo_extra_attr_loader,
        genomes_db_2013,
        hdfs_host,
        impala_genotype_storage):
    """Import the ``denovo_extra_attrs`` study and return its backend.

    Families and variants from ``denovo_extra_attr_loader`` are written as
    parquet files under a temporary directory, loaded into Impala through
    ``impala_genotype_storage`` and exposed via ``build_backend``.
    ``request`` is not referenced in the body.
    """
    from dae.backends.impala.hdfs_helpers import HdfsHelpers

    study_id = "denovo_extra_attrs"

    hdfs_helpers = HdfsHelpers(hdfs_host, 8020)
    work_dir = hdfs_helpers.tempdir(prefix="variants_", suffix="_data")
    hdfs_helpers.mkdir(work_dir)

    parquet_files = ParquetManager.build_parquet_filenames(
        work_dir, bucket_index=2, study_id=study_id
    )
    assert parquet_files is not None

    ParquetManager.families_to_parquet(
        denovo_extra_attr_loader.families, parquet_files.pedigree
    )

    partition = NoPartitionDescriptor(os.path.join(work_dir, "variants"))
    ParquetManager.variants_to_parquet(denovo_extra_attr_loader, partition)

    impala_genotype_storage.impala_load_dataset(
        study_id,
        variants_dir=os.path.dirname(parquet_files.variants),
        pedigree_file=parquet_files.pedigree,
    )

    return impala_genotype_storage.build_backend(
        FrozenBox({"id": study_id}), genomes_db_2013
    )
Beispiel #26
0
def test_datasets_api_get_all_with_selected_restriction(
        admin_client, wdae_gpf_instance):
    """Only the three selected genotype data entries are listed."""
    # FIXME Temporary hack: the dae_config of wdae_gpf_instance is swapped
    # by hand because the mocker fixture does not work for it.
    original_config = wdae_gpf_instance.dae_config

    patched_config = original_config.to_dict()
    patched_config["gpfjs"]["selected_genotype_data"] = [
        "quads_f1", "quads_f2", "f1_group"
    ]
    wdae_gpf_instance.dae_config = FrozenBox(patched_config)

    try:
        response = admin_client.get("/api/v3/datasets")
        assert response
        assert response.status_code == 200

        datasets = response.data["data"]
        assert len(datasets) == 3
    finally:
        # Always put the original config back for the other tests.
        wdae_gpf_instance.dae_config = original_config
Beispiel #27
0
def test_tabix_chrom_prefix(
    filename, has_prefix, region, total_count, check_region
):
    """The reader's chrom prefix flag and region handling match the file."""
    fixture_path = relative_to_this_test_folder(filename)
    options = FrozenBox({"region": region})

    with TabixReaderVariants(options, fixture_path) as tabix_reader:
        assert tabix_reader is not None
        assert tabix_reader.schema.col_names is not None

        assert tabix_reader._has_chrom_prefix == has_prefix

        adjusted_region = handle_chrom_prefix(
            tabix_reader._has_chrom_prefix, region)
        assert adjusted_region == check_region

        lines_seen = sum(1 for _ in tabix_reader.lines_read_iterator())
        assert lines_seen == total_count
Beispiel #28
0
def test_tabix_reader_simple():
    """``TabixReaderVariants`` reads all 20 lines of input3.tsv.gz."""
    filename, header, region, linecount = (
        "fixtures/input3.tsv.gz",
        ["CHROM", "POS", "REF", "ALT"],
        None,
        20,
    )

    infilename = relative_to_this_test_folder(filename)
    # The result of this check used to be discarded; assert it so that a
    # missing fixture fails here with a clear message.
    assert os.path.exists(infilename)

    options = FrozenBox({"region": region})

    with TabixReaderVariants(options, filename=infilename) as reader:
        assert reader is not None
        print(reader.schema.col_names)
        assert reader.schema.col_names == header

        # Consume the whole file so that ``linecount`` is final.
        for line in reader.lines_read_iterator():
            print(line)
        assert reader.linecount == linecount
Beispiel #29
0
def test_frozen_box():
    """Attribute assignment on a ``FrozenBox`` raises ``BoxError``."""
    # Construction and the read check stay outside ``pytest.raises`` so the
    # test can only pass when the assignment itself raises.
    frozen_box = FrozenBox({"a": 123})
    assert frozen_box.a == 123

    with pytest.raises(BoxError):
        frozen_box.a = 456
Beispiel #30
0
    def build(dirname):
        """Import the VCF studies found under ``fixtures/<dirname>``.

        The Impala test database is created if it does not exist. Each
        config returned by ``collect_vcf`` is imported through
        ``impala_genotype_storage.simple_study_import``, unless ``reimport``
        is false and both of the study's tables already exist.

        ``impala_helpers``, ``impala_genotype_storage``, ``reimport``,
        ``temp_dirname``, ``gpf_instance_2013`` and ``annotation_pipeline``
        are taken from the enclosing scope.
        """

        # Make sure the test database is there before checking any table.
        if not impala_helpers.check_database(impala_test_dbname()):
            impala_helpers.create_database(impala_test_dbname())

        vcfdirname = relative_to_this_test_folder(
            os.path.join("fixtures", dirname))
        vcf_configs = collect_vcf(vcfdirname)

        for config in vcf_configs:
            logger.debug(f"importing: {config}")

            # Study id used for the table lookup: the pedigree file name
            # without its extension.
            filename = os.path.basename(config.pedigree)
            study_id = os.path.splitext(filename)[0]

            (variant_table, pedigree_table) = \
                impala_genotype_storage.study_tables(
                    FrozenBox({"id": study_id}))

            # Skip studies that are already imported unless a reimport
            # was requested.
            if (not reimport and impala_helpers.check_table(
                    impala_test_dbname(), variant_table)
                    and impala_helpers.check_table(impala_test_dbname(),
                                                   pedigree_table)):
                continue

            # NOTE(review): study_id is recomputed here with a different
            # helper than the one used for the table check above;
            # presumably both yield the same id - confirm.
            study_id = study_id_from_path(config.pedigree)
            study_temp_dirname = os.path.join(temp_dirname, study_id)

            families_loader = FamiliesLoader(config.pedigree)
            families = families_loader.load()
            genome = gpf_instance_2013.genomes_db.get_genome()

            loaders = []
            # The denovo loader is optional and only added when the config
            # carries a denovo file.
            if config.denovo:
                denovo_loader = DenovoLoader(families,
                                             config.denovo,
                                             genome,
                                             params={
                                                 "denovo_genotype": "genotype",
                                                 "denovo_family_id": "family",
                                                 "denovo_chrom": "chrom",
                                                 "denovo_pos": "pos",
                                                 "denovo_ref": "ref",
                                                 "denovo_alt": "alt",
                                             })
                loaders.append(
                    AnnotationPipelineDecorator(denovo_loader,
                                                annotation_pipeline))

            # NOTE(review): config.vcf is used unconditionally, unlike
            # config.denovo above - confirm every config provides it.
            vcf_loader = VcfLoader(
                families,
                [config.vcf],
                genome,
                regions=None,
                params={
                    "vcf_include_reference_genotypes": True,
                    "vcf_include_unknown_family_genotypes": True,
                    "vcf_include_unknown_person_genotypes": True,
                    "vcf_multi_loader_fill_in_mode": "reference",
                    "vcf_denovo_mode": "denovo",
                    "vcf_omission_mode": "omission",
                },
            )

            loaders.append(
                AnnotationPipelineDecorator(vcf_loader, annotation_pipeline))

            # Both loaders are wrapped with the annotation pipeline before
            # being handed to the import.
            impala_genotype_storage.simple_study_import(
                study_id,
                families_loader=families_loader,
                variant_loaders=loaders,
                output=study_temp_dirname,
                include_reference=True)