コード例 #1
0
def test_get_profiles(real_mongo_adapter, profile_vcf_path, zipped_vcf_path):
    """Check that get_profiles covers every vcf individual with equal-length profiles."""
    # The profile variants are loaded through the adapter before profiling
    load_profile_variants(real_mongo_adapter, profile_vcf_path)

    expected_individuals = check_vcf(zipped_vcf_path)["individuals"]

    # Build the profiles for the samples in the vcf
    sample_profiles = get_profiles(real_mongo_adapter, zipped_vcf_path)

    # Every individual reported by check_vcf must be present, in the same order
    assert list(sample_profiles) == expected_individuals

    # All profiles are compared against the length of the first one
    profile_lengths = [len(profile) for profile in sample_profiles.values()]
    assert all(nr_calls == profile_lengths[0] for nr_calls in profile_lengths)
コード例 #2
0
def load_profile(ctx, load, variant_file, update, stats, profile_threshold,
                 check_vcf):
    """
        Command for profiling of samples. User may upload variants used in profiling
        from a vcf, update the profiles for all samples, and get some stats
        from the profiles in the database.

        Profiling is used to monitor duplicates in the database. The profile is
        based on the variants in the 'profile_variant' collection, assessing
        the genotypes for each sample at the position of these variants.
    """
    # NOTE(review): the docstring above is presumably shown as the command's
    # help text, so parameter notes are kept in comments instead:
    #   ctx: context whose ``obj`` mapping holds 'adapter' and 'genome_build'
    #   load: when truthy, load the variants that profiles are based on
    #   variant_file: vcf to load instead of the default MAF_PATH entry
    #   update: when truthy, run update_profiles on the database
    #   stats: when truthy, print a table of profile statistics
    #   profile_threshold: threshold handed to check_duplicates/profile_stats
    #   check_vcf: vcf whose profiles are checked against the database

    adapter = ctx.obj['adapter']

    LOG.info("Running loqusdb profile")

    if check_vcf:
        LOG.info(f"Check if profile in {check_vcf} has match in database")
        profiles = get_profiles(adapter, check_vcf)
        duplicate = check_duplicates(adapter, profiles, profile_threshold)

        if duplicate is None:
            LOG.info("No duplicates found in the database")
        else:
            click.echo(json.dumps(duplicate))

    if load:
        if variant_file is not None:
            vcf_path = variant_file
        else:
            # Only resolve the default file when it is actually needed, so a
            # genome build without a MAF_PATH entry does not break the
            # command when the user supplied an explicit variant file.
            vcf_path = MAF_PATH[ctx.obj['genome_build']]
        LOG.info(f"Loads variants in {vcf_path} to be used in profiling")
        load_profile_variants(adapter, vcf_path)

    if update:
        LOG.info("Updates profiles in database")
        update_profiles(adapter)

    if stats:
        LOG.info("Prints profile stats")
        distance_dict = profile_stats(adapter, threshold=profile_threshold)
        click.echo(table_from_dict(distance_dict))
コード例 #3
0
ファイル: load.py プロジェクト: Clinical-Genomics/loqusdb
def load_database(
    adapter,
    variant_file=None,
    sv_file=None,
    family_file=None,
    family_type="ped",
    skip_case_id=False,
    gq_treshold=None,
    case_id=None,
    max_window=3000,
    profile_file=None,
    hard_threshold=0.95,
    soft_threshold=0.9,
    genome_build=None,
):
    """Load the database with a case and its variants

    Args:
          adapter: Connection to database
          variant_file(str): Path to variant file
          sv_file(str): Path to sv variant file
          family_file(str): Path to family file
          family_type(str): Format of family file
          skip_case_id(bool): If no case information should be added to variants
          gq_treshold(int): If only quality variants should be considered.
                            When set, every vcf must define GQ in its header
          case_id(str): If different case id than the one in family file should be used
          max_window(int): Specify the max size for sv windows
          profile_file(str): Path to vcf that sample profiles are built from.
                             The profile check is only done if this is given
          hard_threshold(float): Rejects load if hamming distance above this is found
          soft_threshold(float): Stores similar samples if hamming distance above this is found
          genome_build: Forwarded unchanged to load_variants

    Returns:
          nr_inserted(int): Sum of the counts returned by load_variants

    Raises:
          SyntaxError: If gq_treshold is set and a vcf lacks GQ in its header
          Exception: Any error from load_variants is re-raised after the
                     case has been deleted again
    """
    vcf_files = []

    nr_variants = None
    vcf_individuals = None
    if variant_file:
        vcf_info = check_vcf(variant_file)
        nr_variants = vcf_info["nr_variants"]
        vcf_files.append(variant_file)
        # Get the individuals that are present in vcf file
        vcf_individuals = vcf_info["individuals"]

    nr_sv_variants = None
    sv_individuals = None
    if sv_file:
        vcf_info = check_vcf(sv_file, "sv")
        nr_sv_variants = vcf_info["nr_variants"]
        vcf_files.append(sv_file)
        sv_individuals = vcf_info["individuals"]

    profiles = None
    matches = None
    if profile_file:
        profiles = get_profiles(adapter, profile_file)
        # Check if any profile already exists
        matches = profile_match(
            adapter,
            profiles,
            hard_threshold=hard_threshold,
            soft_threshold=soft_threshold,
        )

    # If a gq treshold is used the variants need to have GQ. The headers are
    # only inspected when a treshold is requested, so no vcf is opened in vain.
    if gq_treshold:
        for _vcf_file in vcf_files:
            # Get a cyvcf2.VCF object
            vcf = get_vcf(_vcf_file)

            if not vcf.contains("GQ"):
                LOG.warning("Set gq-treshold to 0 or add info to vcf {0}".format(
                    _vcf_file))
                raise SyntaxError("GQ is not defined in vcf header")

    # Get a ped_parser.Family object from family file
    family = None
    family_id = None
    if family_file:
        LOG.info("Loading family from %s", family_file)
        with open(family_file, "r") as family_lines:
            family = get_case(family_lines=family_lines,
                              family_type=family_type)
            family_id = family.family_id

    # An explicit case_id takes precedence over the id from the family file
    case_id = case_id or family_id
    # Convert information to a loqusdb Case object
    case_obj = build_case(
        case=family,
        case_id=case_id,
        vcf_path=variant_file,
        vcf_individuals=vcf_individuals,
        nr_variants=nr_variants,
        vcf_sv_path=sv_file,
        sv_individuals=sv_individuals,
        nr_sv_variants=nr_sv_variants,
        profiles=profiles,
        matches=matches,
        profile_path=profile_file,
    )
    # Build and load a new case, or update an existing one
    load_case(
        adapter=adapter,
        case_obj=case_obj,
    )

    nr_inserted = 0
    # If case was successfully added we can store the variants
    for file_type, variant_type in (("vcf_path", "snv"), ("vcf_sv_path", "sv")):
        if case_obj.get(file_type) is None:
            continue

        vcf_obj = get_vcf(case_obj[file_type])
        try:
            nr_inserted += load_variants(
                adapter=adapter,
                vcf_obj=vcf_obj,
                case_obj=case_obj,
                skip_case_id=skip_case_id,
                gq_treshold=gq_treshold,
                max_window=max_window,
                variant_type=variant_type,
                genome_build=genome_build,
            )
        except Exception as err:
            # If something went wrong do a rollback
            LOG.warning(err)
            delete(
                adapter=adapter,
                case_obj=case_obj,
            )
            # Bare raise re-raises the active exception with its traceback
            raise
    return nr_inserted