def main(gpf_instance=None, argv=None):
    description = "Generate genovo gene sets tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', action='count', default=0)

    parser.add_argument(
        "--show-studies",
        help="This option will print available "
        "genotype studies and groups names",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--studies",
        help="Specify genotype studies and groups "
        "names for generating denovo gene sets. Default to all.",
        default=None,
        action="store",
    )

    args = parser.parse_args(argv)

    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    if gpf_instance is None:
        gpf_instance = GPFInstance()
    denovo_gene_sets_db = gpf_instance.denovo_gene_sets_db

    if args.show_studies:
        for study_id in denovo_gene_sets_db.get_genotype_data_ids():
            print(study_id)
    else:
        if args.studies:
            studies = args.studies.split(",")
        else:
            studies = gpf_instance.get_genotype_data_ids()

        print("generating de Novo gene sets for studies:", studies)
        filter_studies_ids = [
            study_id
            for study_id in denovo_gene_sets_db.get_genotype_data_ids()
            if study_id in studies
        ]
        denovo_gene_sets_db._build_cache(filter_studies_ids)
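The verbose-count-to-log-level mapping above reappears almost verbatim in most of the entry points below. A minimal helper that captures the same mapping (the name setup_logging is hypothetical, not part of GPF):

import logging


def setup_logging(verbose_count):
    # 0 -> ERROR, 1 -> WARNING, 2 -> INFO, 3 or more -> DEBUG
    levels = {0: logging.ERROR, 1: logging.WARNING, 2: logging.INFO}
    logging.basicConfig(level=levels.get(verbose_count, logging.DEBUG))
    # The impala client is noisy below WARNING, so pin it there.
    logging.getLogger("impala").setLevel(logging.WARNING)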
Example 2
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage
    )
    if not genotype_storage or not genotype_storage.is_impala():
        print("missing or non-impala genotype storage")
        return

    assert os.path.exists(argv.variants)

    study_config = genotype_storage.impala_load_dataset(
        argv.study_id, argv.variants, argv.pedigree)

    if argv.study_config:
        input_config = GPFConfigParser.load_config_raw(argv.study_config)
        study_config = recursive_dict_update(study_config, input_config)

    study_config = StudyConfigBuilder(study_config).build_config()
    assert study_config is not None
    save_study_config(
        gpf_instance.dae_config, argv.study_id, study_config,
        force=argv.force)
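recursive_dict_update is imported from elsewhere in the codebase; a minimal sketch of the merge semantics the call above appears to rely on (overrides win, nested dicts merge rather than replace). This is an assumption, not the actual GPF implementation:

def recursive_dict_update(base, overrides):
    # Merge `overrides` into a copy of `base`, recursing into nested dicts.
    result = dict(base)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(result.get(key), dict):
            result[key] = recursive_dict_update(result[key], value)
        else:
            result[key] = value
    return result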
Example 3
def main(gpf_instance=None, argv=None):
    description = "Generate common reports tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', action='count', default=0)

    parser.add_argument(
        "--show-studies",
        help="This option will print available "
        "genotype studies and groups names",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--studies",
        help="Specify genotype studies and groups "
        "names for generating common report. Default to all query objects.",
        default=None,
        action="store",
    )

    args = parser.parse_args(argv)
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    common_report_facade = gpf_instance._common_report_facade

    if args.show_studies:
        for study_id in common_report_facade.get_all_common_report_ids():
            logger.warning(f"study: {study_id}")
    else:
        elapsed = time.time() - start
        logger.info(
            f"started common reports generation after {elapsed:0.2f} sec")
        if args.studies:
            studies = args.studies.split(",")
            logger.info(f"generating common reports for: {studies}")
            common_report_facade.generate_common_reports(studies)
        else:
            logger.info("generating common reports for all studies!!!")
            common_report_facade.generate_all_common_reports()
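A usage sketch for this entry point (the study ids are hypothetical; the two -V flags select INFO-level logging):

main(argv=["-V", "-V", "--studies", "SSC,AGRE"])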
Example 4
    def _create_local_enrichment_builder(self, dataset_id, background_name,
                                         counting_name, gene_syms):
        dataset = self.get_genotype_data(dataset_id)
        enrichment_config = GPFInstance.get_study_enrichment_config(
            self, dataset_id)
        if enrichment_config is None:
            return None
        enrichment_tool = self.get_enrichment_tool(enrichment_config,
                                                   dataset_id, background_name,
                                                   counting_name)
        if enrichment_tool.background is None:
            return None

        builder = EnrichmentBuilder(dataset, enrichment_tool, gene_syms)
        return builder
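A hedged usage sketch; the dataset id, background and counting names below are illustrative, and the real values depend on the instance's enrichment configuration:

builder = gpf_instance._create_local_enrichment_builder(
    "SSC", "coding_len_background", "enrichment_events_counting",
    ["CHD8", "SCN2A", "POGZ"])
if builder is None:
    print("dataset has no enrichment config or no background")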
Example 5
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)
    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    partition_descriptor = None
    if argv.variants and os.path.exists(argv.variants):
        partition_config_file = os.path.join(argv.variants,
                                             "_PARTITION_DESCRIPTION")

        if os.path.isdir(argv.variants) and \
                os.path.exists(partition_config_file):
            partition_descriptor = ParquetPartitionDescriptor.from_config(
                partition_config_file, root_dirname=argv.variants)

    if partition_descriptor is None:
        partition_descriptor = NoPartitionDescriptor(
            root_dirname=argv.variants)

    genotype_storage.hdfs_upload_dataset(argv.study_id, argv.variants,
                                         argv.pedigree, partition_descriptor)
Example 6
def main(argv):

    try:
        # Setup argument parser

        gpf_instance = GPFInstance()
        dae_conf = gpf_instance.dae_config

        parser = pheno_cli_parser()
        args = parser.parse_args(argv)
        if args.instruments is None:
            print("missing instruments directory parameter", file=sys.stderr)
            raise ValueError()
        if args.pedigree is None:
            print("missing pedigree filename", file=sys.stderr)
            raise ValueError()
        if args.pheno_name is None:
            print("missing pheno db name", file=sys.stderr)
            raise ValueError()

        args.pheno_name = verify_phenotype_data_name(args.pheno_name)

        pheno_db_dir = os.path.join(dae_conf.phenotype_data.dir,
                                    args.pheno_name)
        if not os.path.exists(pheno_db_dir):
            os.makedirs(pheno_db_dir)

        args.pheno_db_filename = os.path.join(pheno_db_dir,
                                              "{}.db".format(args.pheno_name))
        if os.path.exists(args.pheno_db_filename):
            if not args.force:
                print("pheno db filename already exists:",
                      args.pheno_db_filename)
                raise ValueError()
            else:
                os.remove(args.pheno_db_filename)

        args.browser_dir = os.path.join(pheno_db_dir, "browser")
        if not os.path.exists(args.browser_dir):
            os.makedirs(args.browser_dir)

        config = parse_phenotype_data_config(args)
        if args.regression:
            regressions = GPFConfigParser.load_config(args.regression,
                                                      regression_conf_schema)
        else:
            regressions = None

        prep = PrepareVariables(config)
        prep.build_pedigree(args.pedigree)
        prep.build_variables(args.instruments, args.data_dictionary)

        build_pheno_browser(
            args.pheno_db_filename,
            args.pheno_name,
            args.browser_dir,
            regressions,
        )

        pheno_conf_path = os.path.join(pheno_db_dir,
                                       "{}.conf".format(args.pheno_name))

        with open(pheno_conf_path, "w") as pheno_conf_file:
            pheno_conf_file.write(
                toml.dumps(generate_phenotype_data_config(args, regressions)))

        return 0
    except KeyboardInterrupt:
        return 0
    except Exception as e:
        traceback.print_exc()

        program_name = "simple_pheno_import.py"
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help\n")
        return 2
Example 7
def main(gpf_instance=None, argv=None):
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', '-v', action='count', default=0)
    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's gene sets"
    )
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    parser.add_argument("--drop", action="store_true")

    args = parser.parse_args(argv)
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    config = gpf_instance._autism_gene_profile_config

    collections_gene_sets = []

    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]

            collections_gene_sets.append(
                (collection_id,
                 gpf_instance.gene_sets_db.get_gene_set(collection_id, gs_id)))

    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")
    has_denovo = False
    has_rare = False
    person_ids = dict()
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = dict()
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None
                )
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = dict()
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    start = time.time()
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(f"Generated {idx}/{gs_count} AGP statistics "
                        f"{elapsed:.2f} secs")

    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None

            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(genes=genes,
                                             inheritance="denovo"))
        logger.info("Done collecting denovo variants")
        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None

            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue
                kwargs = dict()
                kwargs["roles"] = "prb or sib"

                if statistic.effects is not None:
                    kwargs["effect_types"] = \
                        expand_effect_types(statistic.effects)

                if statistic.variant_types:
                    variant_types = [
                        VariantType.from_name(statistic.variant_types).repr()
                    ]
                    kwargs["variant_type"] = " or ".join(variant_types)

                if statistic.scores:
                    scores = []
                    for score in statistic.scores:
                        min_max = (score.min, score.max)
                        score_filter = (score.name, min_max)
                        scores.append(score_filter)
                    kwargs["real_attr_filter"] = scores

                if statistic.roles:
                    roles = [Role.from_name(statistic.roles).repr()]
                    kwargs["roles"] = " or ".join(roles)

                rare_variants[dataset_id].extend(
                    list(
                        genotype_data.query_variants(
                            genes=genes,
                            inheritance=[
                                "not denovo and "
                                "not possible_denovo and not possible_omission",
                                "mendelian or missing"
                            ],
                            frequency_filter=[("af_allele_freq", (None, 1.0))],
                            **kwargs)))
        logger.info("Done collecting rare variants")
        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")
    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True)

    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
Example 8
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    if argv.studies is None:
        study_ids = [
            gd.study_id for gd in gpf_instance.get_all_genotype_data()
            if not gd.is_group
        ]
    else:
        study_ids = [sid.strip() for sid in argv.studies.split(",")]

    logger.info(f"building summary variants tables for studies: {study_ids}")

    for study_id in study_ids:
        study = gpf_instance.get_genotype_data(study_id)
        assert study.study_id == study_id

        study_backend = study._backend
        if not isinstance(study_backend, ImpalaVariants):
            logger.warning(f"not an impala study: {study_id}; skipping...")
            continue

        if study_backend.variants_table is None:
            logger.warning(f"study {study_id} has no variants; skipping...")
            continue

        drop_summary_table(study_id, study_backend)
        partitions = create_summary_table(study_id, study_backend)

        summary_schema = collect_summary_schema(study_backend)
        summary_table = summary_table_name_temp(study_id, study_backend)
        pedigree_table = f"{study_backend.db}.{study_backend.pedigree_table}"
        variants_table = f"{study_backend.db}.{study_backend.variants_table}"

        partition_bins = {}

        logger.info(f"collecting partitions {partitions} from "
                    f"variants table {variants_table}")

        for partition in partitions:
            partition_bins[partition] = variants_parition_bins(
                study_backend, partition)

        logger.info(f"variant table partitions: {partition_bins}")

        impala = study_backend._impala_helpers
        started = time.time()

        region_bin_helpers = RegionBinsHelper(study_backend.table_properties,
                                              gpf_instance.get_genome())
        region_bin_helpers._build_region_bins()

        logger.info(
            f"region bins calculated: {region_bin_helpers.region_bins}")

        assert set(partition_bins["region_bin"]).issubset(
            set(region_bin_helpers.region_bins.keys()))

        all_partitions = list(itertools.product(*partition_bins.values()))
        for index, partition in enumerate(all_partitions):
            partition = dict(zip(partition_bins.keys(), partition))
            logger.info(f"building summary table for partition: "
                        f"{index}/{len(all_partitions)}; "
                        f"{partition} of {study_id}")

            part_started = time.time()
            for q in insert_into_summary_table(pedigree_table, variants_table,
                                               summary_table, summary_schema,
                                               partition,
                                               region_bin_helpers.region_bins,
                                               argv.split_size):
                repeat = 10
                while repeat > 0:
                    try:
                        with closing(impala.connection()) as connection:
                            with connection.cursor() as cursor:
                                logger.debug(
                                    f"going to run summary query: {q}")
                                cursor.execute(q)
                                break
                    except Exception as ex:
                        logger.exception(f"error executing {q}")
                        time.sleep(6)
                        repeat -= 1
                        if repeat == 0:
                            raise ex

            part_elapsed = time.time() - part_started

            logger.info(f"processing partition "
                        f"{index}/{len(all_partitions)} of {study_id} "
                        f"took {part_elapsed:.2f} secs; "
                        f"{partition} ")
            elapsed = time.time() - started
            logger.info(f"processing partition "
                        f"{index}/{len(all_partitions)} of {study_id}; "
                        f"total time {elapsed:.2f} secs")

        rename_summary_table(study_id, study_backend)
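The itertools.product(*partition_bins.values()) call above enumerates every combination of partition bins; a small self-contained illustration (the bin names are made up):

import itertools

partition_bins = {"region_bin": ["chr1_0", "chr1_1"], "frequency_bin": [0, 1]}
for combo in itertools.product(*partition_bins.values()):
    print(dict(zip(partition_bins.keys(), combo)))
# {'region_bin': 'chr1_0', 'frequency_bin': 0}
# {'region_bin': 'chr1_0', 'frequency_bin': 1}
# {'region_bin': 'chr1_1', 'frequency_bin': 0}
# {'region_bin': 'chr1_1', 'frequency_bin': 1}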
Example 9
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    genotype_storage_db = gpf_instance.genotype_storage_db
    genotype_storage = genotype_storage_db.get_genotype_storage(
        argv.genotype_storage)

    if not genotype_storage or not genotype_storage.is_impala():
        logger.error("missing or non-impala genotype storage")
        return

    study_id = argv.study_id

    if argv.variants is not None:
        hdfs_variants_dir = argv.variants
    elif argv.variants_sample or argv.variants_schema:
        hdfs_variants_dir = \
            genotype_storage.default_variants_hdfs_dirname(study_id)
        # if not genotype_storage.hdfs_helpers.exists(hdfs_variants_dir):
        #     hdfs_variants_dir = None
    else:
        hdfs_variants_dir = None

    if argv.pedigree is not None:
        hdfs_pedigree_file = argv.pedigree
    else:
        hdfs_pedigree_file = \
            genotype_storage.default_pedigree_hdfs_filename(study_id)

    logger.info(f"HDFS variants dir: {hdfs_variants_dir}")
    logger.info(f"HDFS pedigree file: {hdfs_pedigree_file}")

    partition_config_file = None
    if argv.partition_description is not None:
        partition_config_file = argv.partition_description
        assert os.path.isfile(partition_config_file), partition_config_file
    logger.info(f"partition_config_file: {partition_config_file}")

    if partition_config_file is not None and \
            os.path.isfile(partition_config_file):
        partition_description = ParquetPartitionDescriptor.from_config(
            partition_config_file)
    else:
        partition_description = NoPartitionDescriptor()

    variants_schema = None
    if argv.variants_schema is not None:
        assert os.path.isfile(argv.variants_schema), argv.variants_schema
        with open(argv.variants_schema) as infile:
            content = infile.read()
            schema = toml.loads(content)
            variants_schema = schema["variants_schema"]

    genotype_storage.impala_import_dataset(
        argv.study_id,
        hdfs_pedigree_file,
        hdfs_variants_dir,
        partition_description=partition_description,
        variants_sample=argv.variants_sample,
        variants_schema=variants_schema)
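The --variants-schema file is parsed as TOML and only its variants_schema table is used; a sketch of the expected shape, with column names and types that are illustrative rather than the actual GPF schema:

import toml

content = """
[variants_schema]
chromosome = "string"
position = "int"
reference = "string"
alternative = "string"
"""
print(toml.loads(content)["variants_schema"]["position"])  # prints: int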
Example 10
#!/bin/env python

import re
import sys
import csv

from dae.gpf_instance.gpf_instance import GPFInstance

gpf_instance = GPFInstance()
genomes_db = gpf_instance.genomes_db

GENOME = genomes_db.get_genome()

subRE = re.compile(r"^sub\(([ACGT])->([ACGT])\)$")
insRE = re.compile(r"^ins\(([ACGT]+)\)$")
delRE = re.compile(r"^del\((\d+)\)$")


def vcfVarFormat(loc, var):
    chrom, pos = loc.split(":")
    pos = int(pos)

    mS = subRE.match(var)
    if mS:
        return chrom, pos, mS.group(1), mS.group(2)

    mI = insRE.match(var)
    if mI:
        sq = mI.group(1)
        rfS = GENOME.get_sequence(chrom, pos - 1, pos - 1)
        return chrom, pos - 1, rfS, rfS + sq

    mD = delRE.match(var)
    if mD:
        # del(N) branch: the scraped example was truncated here; this
        # completion assumes standard VCF left-anchoring and the same
        # inclusive get_sequence() coordinates used for ins() above.
        length = int(mD.group(1))
        rfS = GENOME.get_sequence(chrom, pos - 1, pos + length - 1)
        return chrom, pos - 1, rfS, rfS[0]
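Example inputs in this sub/ins/del notation and the VCF-style (chrom, pos, ref, alt) tuples returned; the ins results depend on the reference base fetched from GENOME:

print(vcfVarFormat("chr1:12345", "sub(A->G)"))
# ('chr1', 12345, 'A', 'G')
print(vcfVarFormat("chr1:12345", "ins(TT)"))
# ('chr1', 12344, <base at 12344>, <base at 12344> + 'TT')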
Example 11
def main(argv=sys.argv[1:], gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    argv = parse_cli_arguments(argv, gpf_instance)

    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    logging.getLogger("impala").setLevel(logging.WARNING)

    if argv.studies is None:
        study_ids = [
            gd.study_id for gd in gpf_instance.get_all_genotype_data()
            if not gd.is_group
        ]
    else:
        study_ids = [sid.strip() for sid in argv.studies.split(",")]

    logger.info(f"computing table stats for studies: {study_ids}")

    for study_id in study_ids:
        study = gpf_instance.get_genotype_data(study_id)
        assert study.study_id == study_id

        study_backend = study._backend
        if not isinstance(study_backend, ImpalaVariants):
            logger.info(f"not an impala study: {study_id}; skipping...")
            continue

        pedigree_compute_stats(study_backend)
        if study_backend.variants_table is None:
            continue

        if "region_bin" not in study_backend.schema:
            variants_compute_stats(study_backend, region_bin=None)
            if study_backend.has_summary_variants_table:
                summary_variants_compute_stats(study_backend, region_bin=None)
        else:
            assert "region_bin" in study_backend.schema
            region_bins = variants_region_bins(study_backend)
            logger.info(
                f"processing {len(region_bins)} region bins; {region_bins}")

            for index, region_bin in enumerate(region_bins):
                start = time.time()
                variants_compute_stats(study_backend, region_bin)

                if study_backend.has_summary_variants_table:
                    summary_variants_compute_stats(study_backend, region_bin)

                elapsed = time.time() - start
                logger.info(
                    f"computing stats {index}/{len(region_bins)} "
                    f"for {study_backend.db}.{study_backend.variants_table}; "
                    f"{elapsed:.2f} secs")
Example 12
def local_gpf_instance(remote_dir):
    return GPFInstance(work_dir=remote_dir)
Example 13
def pipeline_main(argv):
    gpf_instance = GPFInstance()
    dae_config = gpf_instance.dae_config
    genomes_db = gpf_instance.genomes_db

    desc = "Program to annotate variants combining multiple annotating tools"
    parser = argparse.ArgumentParser(
        description=desc,
        conflict_handler="resolve",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--verbose', '-V', action='count', default=0)

    for name, args in main_cli_options(gpf_instance):
        parser.add_argument(name, **args)

    options = parser.parse_args()

    if options.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif options.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif options.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)

    if options.annotation_config is not None:
        config_filename = options.annotation_config
    else:
        config_filename = dae_config.annotation.conf_file

    assert os.path.exists(config_filename), config_filename

    options = Box(
        dict(options._get_kwargs()),
        default_box=True,
        default_box_attr=None,
    )

    # File IO format specification
    reader_type = IOType.TSV
    writer_type = IOType.TSV
    if options.read_parquet:
        reader_type = IOType.Parquet
    if options.write_parquet:
        writer_type = IOType.Parquet

    start = time.time()

    pipeline = PipelineAnnotator.build(
        options,
        config_filename,
        genomes_db,
    )
    assert pipeline is not None

    with IOManager(options, reader_type, writer_type) as io_manager:
        pipeline.annotate_file(io_manager)

    print("# PROCESSING DETAILS:", file=sys.stderr)
    print("#", time.asctime(), file=sys.stderr)
    print("#", " ".join(sys.argv[1:]), file=sys.stderr)

    print(
        "The program was running for [h:m:s]:",
        str(datetime.timedelta(seconds=round(time.time() - start, 0))),
        file=sys.stderr,
    )

    if options.tabix:
        run_tabix(options.outfile)
Example 14
def main(argv, gpf_instance=None):
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)
    VcfLoader.cli_arguments(parser, options_only=True)

    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).ped])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--vcf-files",
        type=str,
        nargs="+",
        metavar="<VCF filename>",
        help="VCF file to import",
    )

    argv = parser.parse_args(argv)
    if argv.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif argv.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif argv.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    filename, params = FamiliesLoader.parse_cli_arguments(argv)
    logger.info(F"PED PARAMS: {params}")

    loader = FamiliesLoader(filename, **params)
    families = loader.load()

    if argv.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            argv.partition_description)
        families = partition_description.add_family_bins_to_families(families)

    variants_filenames, variants_params = \
        VcfLoader.parse_cli_arguments(argv)

    if variants_filenames:

        variants_loader = VcfLoader(
            families,
            variants_filenames,
            params=variants_params,
            genome=gpf_instance.genomes_db.get_genome(),
        )

        families = variants_loader.families

    if families.broken_families:
        for family_id, family in families.broken_families.items():
            if not family.has_members():
                del families[family_id]
                logger.warning(
                    f"family {family_id} does not contain sequenced members "
                    f"and is removed from the pedigree: {family}")

    if not argv.output_filename:
        output_filename, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{output_filename}.ped"
    else:
        output_filename = argv.output_filename

    FamiliesLoader.save_pedigree(families, output_filename)
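A hedged invocation sketch for this tool. The flags appear in the parser above; treating the pedigree path as a positional argument registered by FamiliesLoader.cli_arguments is an assumption:

main([
    "families.ped",                      # pedigree file (assumed positional)
    "--vcf-files", "variants.vcf.gz",
    "--partition-description", "partition_description.conf",
    "-o", "families_with_bins.ped",
])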