Esempio n. 1
0
def annotate_snvs(adapter, vcf_obj):
    """Stream the variants of a VCF, annotating them in batches.

    Variants are collected in a dict keyed by ``get_variant_id``. Each time
    the running count reaches a multiple of 1000, the collected ids are sent
    to ``adapter.search_variants`` in one call, every returned document whose
    ``_id`` is in the batch is passed to ``annotate_variant`` together with
    its variant, and the batch is yielded. Whatever remains after the last
    full batch is handled the same way.

    Args:
        adapter(loqusdb.plugin.adapter)
        vcf_obj(cyvcf2.VCF)

    Yields:
        variant(cyvcf2.Variant): Annotated variant
    """
    batch_size = 1000

    def _annotate_and_emit(pending):
        """Annotate every pending variant that has a database hit, then emit all."""
        for db_hit in adapter.search_variants(list(pending.keys())):
            hit_id = db_hit["_id"]
            if hit_id in pending:
                annotate_variant(pending[hit_id], db_hit)
        yield from pending.values()

    pending = {}
    for seen, variant in enumerate(vcf_obj, 1):
        pending[get_variant_id(variant)] = variant
        # Keep collecting until the batch is full
        if seen % batch_size != 0:
            continue
        yield from _annotate_and_emit(pending)
        pending = {}

    # Handle the remainder (the search is issued even if nothing is left)
    yield from _annotate_and_emit(pending)
Esempio n. 2
0
def test_delete_variant(real_mongo_adapter, het_variant, case_obj):
    """Deleting a variant for one of two cases keeps the document for the other."""
    ## GIVEN a variant document that is shared by two cases
    variant_collection = real_mongo_adapter.db.variant
    kept_case = case_obj["case_id"]
    dropped_case = "2"
    seed_document = {
        "_id": get_variant_id(het_variant),
        "families": [kept_case, dropped_case],
        "observations": 2,
    }
    variant_collection.insert_one(seed_document)

    stored_before = variant_collection.find_one()
    assert stored_before["observations"] == 2

    ## WHEN the variant is deleted on behalf of the second case
    delete_variants(
        adapter=real_mongo_adapter,
        vcf_obj=[het_variant],
        case_obj=case_obj,
        case_id=dropped_case,
    )

    stored_after = variant_collection.find_one()

    ## THEN only the first case is left in 'families'
    assert stored_after["families"] == [kept_case]
    ## THEN the observation count has dropped by one
    assert stored_after["observations"] == 1
Esempio n. 3
0
def test_delete_variant(real_mongo_adapter, het_variant, case_obj):
    """A variant observed in two cases survives the removal of one of them."""
    ## GIVEN a database holding a single variant with two observations
    collection = real_mongo_adapter.db.variant
    first_family = case_obj['case_id']
    second_family = '2'

    collection.insert_one(
        {
            '_id': get_variant_id(het_variant),
            'families': [first_family, second_family],
            'observations': 2,
        }
    )
    assert collection.find_one()['observations'] == 2

    ## WHEN the variant is removed for the second family only
    deletion_arguments = {
        'adapter': real_mongo_adapter,
        'vcf_obj': [het_variant],
        'case_obj': case_obj,
        'case_id': second_family,
    }
    delete_variants(**deletion_arguments)

    ## THEN the first family remains and the count is decreased
    remaining = collection.find_one()
    assert remaining['families'] == [first_family]
    assert remaining['observations'] == 1
def build_profile_variant(variant):
    """Build a ProfileVariant from a VCF record.

    A leading "chr", "CHR" or "Chr" is stripped from the chromosome name and
    only the first ALT allele is used.

    Args:
        variant (cyvcf2.Variant)

    Returns:
        variant (models.ProfileVariant)
    """
    contig = variant.CHROM
    has_chr_prefix = contig.startswith(("chr", "CHR", "Chr"))
    if has_chr_prefix:
        # Drop the three-character prefix
        contig = contig[3:]

    position = int(variant.POS)

    # Collected in the order the fields are read from the record
    fields = {
        "variant_id": get_variant_id(variant),
        "chrom": contig,
        "pos": position,
        "ref": variant.REF,
        "alt": variant.ALT[0],
        "maf": get_maf(variant),
        "id_column": variant.ID,
    }
    return ProfileVariant(**fields)
Esempio n. 5
0
def insert_sv_variants(adapter: MongoAdapter, case_obj: Case) -> None:
    """Insert the structural variants of a case, one document per VCF record.

    Each record read from ``case_obj.vcf_sv_path`` is turned into a ``Variant``
    (zygosity counters fixed at 0, ``is_sv`` set) and handed straight to
    ``adapter.add_structural_variant``; nothing is accumulated in memory.
    """
    sv_records = VCF(case_obj.vcf_sv_path, threads=settings.cyvcf_threads)

    for record in sv_records:
        sv_id = get_variant_id(variant=record)
        ref_allele, alt_allele = record.REF, record.ALT[0]
        coords = get_coords(record)

        adapter.add_structural_variant(
            variant=Variant(
                variant_id=sv_id,
                chrom=coords["chrom"],
                pos=coords["pos"],
                end=coords["end"],
                ref=ref_allele,
                alt=alt_allele,
                end_chrom=coords["end_chrom"],
                sv_type=coords["sv_type"],
                sv_len=coords["sv_length"],
                case_id=case_obj.case_id,
                homozygote=0,
                hemizygote=0,
                is_sv=True,
                id_column=record.ID,
            ),
            max_window=settings.load_sv_window,
        )
Esempio n. 6
0
def get_profiles(adapter: MongoAdapter, vcf_file: str) -> Dict[str, list]:
    """Build a genotype profile for every sample in a VCF file.

    For each document yielded by ``adapter.profile_variants()`` the VCF is
    queried for the region ``chrom:pos-(pos + 1)``. If a record whose variant
    id equals the document's ``_id`` is found, each sample gets a two-allele
    string derived from its genotype: ``alt+alt`` for "hom_alt", ``ref+alt``
    for "het" and ``ref+ref`` for anything else. If no matching record is
    found, every sample gets ``ref+ref`` for that profile variant.

    Args:
        adapter: Adapter providing ``profile_variants()``.
        vcf_file: Path to a VCF file containing one or more samples.

    Returns:
        A dictionary ``{SAMPLE_ID: [gt_1, gt_2, ...]}`` with one genotype
        string per profile variant, in the order the adapter yields them.
    """

    vcf = VCF(vcf_file, threads=settings.cyvcf_threads)
    individuals = vcf.samples
    profiles = {individual: [] for individual in individuals}

    for profile_variant in adapter.profile_variants():

        ref = profile_variant["ref"]
        alt = profile_variant["alt"]

        pos = profile_variant["pos"]
        end = pos + 1
        chrom = profile_variant["chrom"]

        region = f"{chrom}:{pos}-{end}"

        # Find variants in region
        for variant in vcf(region):

            # Only a record whose id (i.e. chrom_pos_ref_alt) matches counts
            if get_variant_id(variant) != profile_variant["_id"]:
                continue

            # Read the genotype array once instead of once per individual
            gt_types = variant.gt_types
            for i, individual in enumerate(individuals):

                genotype = GENOTYPE_MAP[gt_types[i]]
                if genotype == "hom_alt":
                    gt_str = f"{alt}{alt}"
                elif genotype == "het":
                    gt_str = f"{ref}{alt}"
                else:
                    gt_str = f"{ref}{ref}"

                # Append genotype to the profile of the individual
                profiles[individual].append(gt_str)

            # Stop searching the region once the variant is found
            break
        else:
            # No call was found for the variant: give all samples a hom ref genotype
            for individual in individuals:
                profiles[individual].append(f"{ref}{ref}")

    return profiles
Esempio n. 7
0
def annotate_snv(adpter, variant):
    """Annotate an SNV/INDEL variant

    Looks the variant up in the database by its variant id and passes the
    variant together with the database document to ``annotate_variant``.

    Args:
        adpter(loqusdb.plugin.adapter): Database adapter. The (misspelled)
            parameter name is kept so that the signature stays unchanged.
        variant(cyvcf2.Variant)

    Returns:
        The result of ``annotate_variant(variant, variant_obj)``.
    """
    variant_id = get_variant_id(variant)
    # Use the parameter itself; the previous body referenced an undefined
    # name `adapter`.
    variant_obj = adpter.get_variant(variant={"_id": variant_id})

    # Call the annotation helper directly. The previous body called a local
    # name before it was assigned, which always raised UnboundLocalError.
    return annotate_variant(variant, variant_obj)
Esempio n. 8
0
def insert_snv_variants(adapter: MongoAdapter, case_obj: Case) -> None:
    """Collect SNV documents for a case and insert them in one bulk call.

    For every record in ``case_obj.vcf_path`` the individuals of the case are
    inspected. An individual is skipped when its genotype quality is below
    ``settings.load_gq_threshold`` or when its genotype is neither "het" nor
    "hom_alt". For each remaining individual one ``Variant`` is appended,
    carrying the homozygote/hemizygote flags accumulated so far for that
    record. All collected documents are passed to ``adapter.add_variants``.
    """
    carrier_genotypes = ("het", "hom_alt")
    sex_chromosomes = ("X", "Y")
    documents = []

    for record in VCF(case_obj.vcf_path, threads=settings.cyvcf_threads):
        record_id = get_variant_id(variant=record)
        ref_allele = record.REF
        alt_allele = record.ALT[0]

        coords = get_coords(record)
        chrom = coords["chrom"]
        pos = coords["pos"]

        # Flags are per record and stay set once any individual triggers them
        homozygote_seen = 0
        hemizygote_seen = 0

        for individual in case_obj.individuals:
            sample_index = individual["ind_index"]
            if int(record.gt_quals[sample_index]) < settings.load_gq_threshold:
                continue

            call = GENOTYPE_MAP[record.gt_types[sample_index]]
            if call not in carrier_genotypes:
                continue

            if call == "hom_alt":
                homozygote_seen = 1

            on_sex_chromosome = chrom in sex_chromosomes
            if (
                on_sex_chromosome
                and individual["sex"] == 1
                and not check_par(chrom, pos, genome_build=settings.genome_build)
            ):
                hemizygote_seen = 1

            documents.append(
                Variant(
                    variant_id=record_id,
                    chrom=chrom,
                    pos=pos,
                    end=coords["end"],
                    ref=ref_allele,
                    alt=alt_allele,
                    end_chrom=coords["end_chrom"],
                    sv_type=coords["sv_type"],
                    sv_len=coords["sv_length"],
                    case_id=case_obj.case_id,
                    homozygote=homozygote_seen,
                    hemizygote=hemizygote_seen,
                    is_sv=False,
                    id_column=record.ID,
                )
            )

    adapter.add_variants(variants=documents)
Esempio n. 9
0
def test_delete_variants(real_mongo_adapter, het_variant, case_obj):
    """Deleting the only observation of a variant removes its document."""
    ## GIVEN a database whose single variant belongs to one case
    variant_collection = real_mongo_adapter.db.variant
    only_case = case_obj["case_id"]
    seed_document = {
        "_id": get_variant_id(het_variant),
        "families": [only_case],
        "observations": 1,
    }
    variant_collection.insert_one(seed_document)

    stored_before = variant_collection.find_one()
    assert stored_before["families"] == [only_case]

    ## WHEN the variant is deleted for that case
    delete_variants(
        adapter=real_mongo_adapter,
        vcf_obj=[het_variant],
        case_obj=case_obj,
    )

    ## THEN no variant document is left in the collection
    stored_after = variant_collection.find_one()
    assert stored_after is None
Esempio n. 10
0
def test_delete_variants(real_mongo_adapter, het_variant, case_obj):
    """Deleting the only observation of a variant removes its document."""
    ## GIVEN a database with one variant
    db = real_mongo_adapter.db
    case_id = case_obj['case_id']

    db.variant.insert_one({
        '_id': get_variant_id(het_variant),
        'families': [case_id],
        'observations': 1,
    })

    mongo_variant = db.variant.find_one()
    assert mongo_variant['families'] == [case_id]

    ## WHEN deleting the variant
    delete_variants(adapter=real_mongo_adapter,
                    vcf_obj=[het_variant],
                    case_obj=case_obj)

    mongo_variant = db.variant.find_one()

    ## THEN assert that the variant was not found
    # Identity check: None is a singleton and `==` can be overridden
    assert mongo_variant is None
Esempio n. 11
0
def get_profiles(adapter, vcf_file):
    """Build a genotype profile for each sample in a vcf, based on the
    profile variants stored in the database.

    Every profile variant contributes one two-allele string per sample:
    alt+alt for "hom_alt", ref+alt for "het" and ref+ref otherwise. When the
    vcf has no record matching the profile variant, all samples get ref+ref.

    Args:
        adapter(MongoAdapter): Adapter to mongodb
        vcf_file(str): Path to vcf file

    Returns:
        profiles (dict): For each sample in the vcf, the list of genotype
                         strings, one per profile variant.

    """
    vcf = get_file_handle(vcf_file)
    samples = vcf.samples
    profiles = {sample: [] for sample in samples}

    for db_variant in adapter.profile_variants():
        ref = db_variant["ref"]
        alt = db_variant["alt"]

        start = db_variant["pos"]
        stop = start + 1
        contig = db_variant["chrom"]

        # Genotype strings for this profile variant, keyed by genotype name
        hom_ref = f"{ref}{ref}"
        allele_strings = {"hom_alt": f"{alt}{alt}", "het": f"{ref}{alt}"}

        for record in vcf(f"{contig}:{start}-{stop}"):
            # Skip records whose id (chrom_pos_ref_alt) is a different variant
            if get_variant_id(record) != db_variant["_id"]:
                continue

            for index, sample in enumerate(samples):
                call = GENOTYPE_MAP[record.gt_types[index]]
                profiles[sample].append(allele_strings.get(call, hom_ref))

            # The variant was found, no need to look further in the region
            break
        else:
            # No matching call in the region: everyone is hom ref
            for sample in samples:
                profiles[sample].append(hom_ref)

    return profiles
Esempio n. 12
0
def check_vcf(vcf_path, expected_type="snv"):
    """Check if there are any problems with the vcf file

    Walks through every record and verifies that the file does not mix SNVs
    and SVs, and, for SNV files, that positions are non-decreasing within a
    chromosome and that no variant id occurs twice at the same position.
    Finally the detected variant type must equal ``expected_type``.

    Args:
        vcf_path(str)
        expected_type(str): 'sv' or 'snv'

    Returns:
        vcf_info(dict): dict like
        {
            'nr_variants':<INT>,
            'variant_type': <STR> in ['snv', 'sv'],
            'individuals': <LIST> the value of ``vcf.samples``
        }

    Raises:
        VcfError: If the file mixes variant types, contains a duplicated SNV,
            is not sorted, or its variant type differs from ``expected_type``
            (this includes a file without any variants, where no type is
            detected).
    """
    LOG.info("Check if vcf is on correct format...")

    vcf = VCF(vcf_path)
    individuals = vcf.samples
    variant_type = None

    previous_pos = None
    previous_chrom = None

    # Ids of the variants seen so far on the current position
    position_variants = set()

    nr_variants = 0
    for nr_variants, variant in enumerate(vcf, 1):

        # Check the type of variant; the first record decides the file type
        current_type = "sv" if variant.var_type == "sv" else "snv"
        if not variant_type:
            variant_type = current_type

        # Vcf can not include both snvs and svs
        if variant_type != current_type:
            raise VcfError("Vcf includes a mix of snvs and svs")

        current_chrom = variant.CHROM
        current_pos = variant.POS

        if variant_type == "snv":
            # For SNVs we can create a proper variant id with chrom_pos_ref_alt
            variant_id = get_variant_id(variant)
        else:
            # SV:s get a simple id
            variant_id = "{0}_{1}".format(current_chrom, current_pos)

        # First record, or first record on a new chromosome: start tracking afresh
        if not previous_chrom or current_chrom != previous_chrom:
            previous_chrom = current_chrom
            previous_pos = current_pos
            position_variants = {variant_id}
            continue

        # Uniqueness and sorting are only checked for SNVs
        if variant_type != "snv":
            continue

        if current_pos == previous_pos:
            # Check if variant is unique
            if variant_id in position_variants:
                raise VcfError("Variant {0} occurs several times in vcf".format(variant_id))
            position_variants.add(variant_id)
        elif current_pos < previous_pos:
            # Check if vcf is sorted
            raise VcfError("Vcf is not sorted in a correct way")
        else:
            previous_pos = current_pos
            # Reset position_variants since we are on a new position
            position_variants = {variant_id}

    if variant_type != expected_type:
        raise VcfError(
            "VCF file does not only include {0}s, please check vcf {1}".format(
                expected_type.upper(), vcf_path
            )
        )

    LOG.info("Vcf file %s looks fine", vcf_path)
    LOG.info("Nr of variants in vcf: %s", nr_variants)
    LOG.info("Type of variants in vcf: %s", variant_type)

    return {
        "nr_variants": nr_variants,
        "variant_type": variant_type,
        "individuals": individuals,
    }