Ejemplo n.º 1
0
def test_check_vcf_wrong_type(sv_vcf_path):
    """Checking the sv vcf fixture as 'snv' must raise a VcfError."""
    ## GIVEN the path to a sv vcf file and a variant type that does not match it
    declared_type = 'snv'

    ## WHEN the vcf is checked with the wrong variant type
    ## THEN a VcfError is expected
    with pytest.raises(VcfError):
        check_vcf(sv_vcf_path, declared_type)
Ejemplo n.º 2
0
def test_check_sv_vcf(sv_vcf_path):
    """check_vcf reports the expected variant count and type for a sv vcf."""
    ## GIVEN a sv vcf file whose non-header lines are counted independently
    with open(sv_vcf_path, "r") as handle:
        expected_count = sum(1 for row in handle if not row.startswith("#"))

    ## WHEN the vcf is checked as a sv file
    result = check_vcf(sv_vcf_path, "sv")

    ## THEN the reported number of variants matches the independent count
    assert result["nr_variants"] == expected_count
    ## THEN the reported variant type is "sv"
    assert result["variant_type"] == "sv"
Ejemplo n.º 3
0
def test_check_vcf_correct(vcf_path):
    """check_vcf reports the expected variant count and type for a vcf."""
    ## GIVEN a vcf file whose non-header lines are collected independently
    with open(vcf_path, 'r') as vcf_handle:
        variant_lines = [entry for entry in vcf_handle if not entry.startswith('#')]
    expected_nr = len(variant_lines)

    ## WHEN the vcf is checked without stating a variant type
    checked = check_vcf(vcf_path)

    ## THEN the reported number of variants matches the independent count
    assert checked['nr_variants'] == expected_nr
    ## THEN the reported variant type is 'snv'
    assert checked['variant_type'] == 'snv'
Ejemplo n.º 4
0
def test_get_profiles(real_mongo_adapter, profile_vcf_path, zipped_vcf_path):
    """Profiles from a vcf cover all its individuals and share one length."""
    # Profile variants have to be loaded through the adapter first
    load_profile_variants(real_mongo_adapter, profile_vcf_path)

    checked = check_vcf(zipped_vcf_path)

    # Build the profiles from the vcf
    profiles = get_profiles(real_mongo_adapter, zipped_vcf_path)

    # Every individual reported by check_vcf must be present, in the same order
    assert list(profiles) == checked["individuals"]

    # Every profile must be exactly as long as the first one
    sizes = [len(profiles[name]) for name in profiles]
    assert all(size == sizes[0] for size in sizes)
Ejemplo n.º 5
0
def test_check_vcf(vcf_path):
    """check_vcf agrees with a direct read of the vcf on count, samples and type."""
    ## GIVEN the path to a vcf, with samples and variant count read directly
    reader = VCF(vcf_path)
    expected_individuals = reader.samples
    expected_count = sum(1 for _ in reader)

    ## WHEN checking the vcf
    checked = check_vcf(vcf_path)

    ## THEN the number of variants is correct
    assert checked['nr_variants'] == expected_count

    ## THEN the individuals are returned
    assert checked['individuals'] == expected_individuals

    ## THEN the variant type is 'snv'
    assert checked['variant_type'] == 'snv'
Ejemplo n.º 6
0
def load_profile_variants(adapter, variant_file):
    """Load the variants used for profiling.

    The file is first inspected with ``check_vcf``; it is only accepted when
    the reported variant type is "snv". Each variant in the file is then
    converted with ``build_profile_variant`` and the whole batch is handed to
    ``adapter.add_profile_variants``.

    Args:
        adapter (loqusdb.plugins.Adapter): initialized plugin
        variant_file(str): Path to variant file

    Raises:
        VcfError: If the variant type reported for ``variant_file`` is not "snv"
    """
    vcf_info = check_vcf(variant_file)

    # Only the variant type matters here; the variant count is not needed.
    if vcf_info["variant_type"] != "snv":
        LOG.critical("Variants used for profiling must be SNVs only")
        raise VcfError

    vcf = get_vcf(variant_file)

    profile_variants = [build_profile_variant(variant) for variant in vcf]
    adapter.add_profile_variants(profile_variants)
Ejemplo n.º 7
0
def test_check_vcf_unsorted(unsorted_vcf_path):
    """check_vcf must reject a vcf whose variants are unsorted."""
    ## GIVEN the path to a vcf file with unsorted variants
    ## WHEN the vcf is checked
    ## THEN a VcfError is expected
    with pytest.raises(VcfError):
        check_vcf(unsorted_vcf_path)
Ejemplo n.º 8
0
def test_check_vcf_double_variant(double_vcf_path):
    """check_vcf must reject a vcf that contains a duplicated variant."""
    ## GIVEN the path to a variant file in which one variant occurs twice
    ## WHEN the vcf is checked
    ## THEN a VcfError is expected
    with pytest.raises(VcfError):
        check_vcf(double_vcf_path)
Ejemplo n.º 9
0
def load_database(
    adapter,
    variant_file=None,
    sv_file=None,
    family_file=None,
    family_type="ped",
    skip_case_id=False,
    gq_treshold=None,
    case_id=None,
    max_window=3000,
    profile_file=None,
    hard_threshold=0.95,
    soft_threshold=0.9,
    genome_build=None,
):
    """Load the database with a case and its variants.

    The given vcf files are checked, a case object is built and loaded, and
    the variants of each file are then loaded. If loading variants fails the
    case is deleted again and the error is re-raised.

    Args:
          adapter: Connection to database
          variant_file(str): Path to variant file
          sv_file(str): Path to sv variant file
          family_file(str): Path to family file
          family_type(str): Format of family file
          skip_case_id(bool): If no case information should be added to variants
          gq_treshold(int): If only quality variants should be considered
          case_id(str): If different case id than the one in family file should be used
          max_window(int): Specify the max size for sv windows
          profile_file(str): Path to the file that profiles are read from. When
              given, the profiles are collected with ``get_profiles`` and
              passed to ``profile_match`` before the case is built
          hard_threshold(float): Rejects load if hamming distance above this is found
          soft_threshold(float): Stores similar samples if hamming distance above this is found
          genome_build: Passed on unchanged to ``load_variants``

    Returns:
          nr_inserted(int)

    Raises:
          SyntaxError: If ``gq_treshold`` is set and a vcf does not contain "GQ"
    """
    vcf_files = []

    nr_variants = None
    vcf_individuals = None
    if variant_file:
        vcf_info = check_vcf(variant_file)
        nr_variants = vcf_info["nr_variants"]
        vcf_files.append(variant_file)
        # Get the individuals that are present in vcf file
        vcf_individuals = vcf_info["individuals"]

    nr_sv_variants = None
    sv_individuals = None
    if sv_file:
        vcf_info = check_vcf(sv_file, "sv")
        nr_sv_variants = vcf_info["nr_variants"]
        vcf_files.append(sv_file)
        sv_individuals = vcf_info["individuals"]

    profiles = None
    matches = None
    if profile_file:
        profiles = get_profiles(adapter, profile_file)
        # Check if any profile already exists
        matches = profile_match(adapter,
                                profiles,
                                hard_threshold=hard_threshold,
                                soft_threshold=soft_threshold)

    # If a gq threshold is used the variants need to have GQ
    for _vcf_file in vcf_files:
        # Get a cyvcf2.VCF object
        vcf = get_vcf(_vcf_file)

        if gq_treshold and not vcf.contains("GQ"):
            LOG.warning("Set gq-treshold to 0 or add info to vcf %s", _vcf_file)
            raise SyntaxError("GQ is not defined in vcf header")

    # Get a ped_parser.Family object from family file
    family = None
    family_id = None
    if family_file:
        LOG.info("Loading family from %s", family_file)
        with open(family_file, "r") as family_lines:
            family = get_case(family_lines=family_lines,
                              family_type=family_type)
            family_id = family.family_id

    # There has to be a case_id or a family at this stage.
    # An explicitly given case_id takes precedence over the family id.
    case_id = case_id or family_id
    # Convert information to a loqusdb Case object
    case_obj = build_case(
        case=family,
        case_id=case_id,
        vcf_path=variant_file,
        vcf_individuals=vcf_individuals,
        nr_variants=nr_variants,
        vcf_sv_path=sv_file,
        sv_individuals=sv_individuals,
        nr_sv_variants=nr_sv_variants,
        profiles=profiles,
        matches=matches,
        profile_path=profile_file,
    )
    # Build and load a new case, or update an existing one
    load_case(
        adapter=adapter,
        case_obj=case_obj,
    )

    nr_inserted = 0
    # If case was successfully added we can store the variants.
    # Each case key is paired with the variant type stored under it.
    for file_type, variant_type in (("vcf_path", "snv"), ("vcf_sv_path", "sv")):
        if case_obj.get(file_type) is None:
            continue

        vcf_obj = get_vcf(case_obj[file_type])
        try:
            nr_inserted += load_variants(
                adapter=adapter,
                vcf_obj=vcf_obj,
                case_obj=case_obj,
                skip_case_id=skip_case_id,
                gq_treshold=gq_treshold,
                max_window=max_window,
                variant_type=variant_type,
                genome_build=genome_build,
            )
        except Exception as err:
            # If something went wrong do a rollback, then let the error propagate
            LOG.warning(err)
            delete(
                adapter=adapter,
                case_obj=case_obj,
            )
            raise
    return nr_inserted
Ejemplo n.º 10
0
def update_database(
    adapter,
    variant_file=None,
    sv_file=None,
    family_file=None,
    family_type="ped",
    skip_case_id=False,
    gq_treshold=None,
    case_id=None,
    max_window=3000,
):
    """Update a case in the database.

    The given vcf files are checked and a case object is built. The case must
    already be known to ``adapter``; it is then loaded with ``update=True``
    and the variants of each file are loaded. If loading variants fails,
    ``delete`` is called with the pre-update case and the error is re-raised.

    Args:
          adapter: Connection to database
          variant_file(str): Path to variant file
          sv_file(str): Path to sv variant file
          family_file(str): Path to family file
          family_type(str): Format of family file
          skip_case_id(bool): If no case information should be added to variants
          gq_treshold(int): If only quality variants should be considered
          case_id(str): If different case id than the one in family file should be used
          max_window(int): Specify the max size for sv windows

    Returns:
          nr_inserted(int)

    Raises:
          SyntaxError: If ``gq_treshold`` is set and a vcf does not contain "GQ"
          CaseError: If ``adapter.case`` finds no existing case for the built case
    """
    vcf_files = []
    nr_variants = None
    vcf_individuals = None
    if variant_file:
        vcf_info = check_vcf(variant_file)
        nr_variants = vcf_info["nr_variants"]
        vcf_files.append(variant_file)
        # Get the individuals that are present in vcf file
        vcf_individuals = vcf_info["individuals"]

    nr_sv_variants = None
    sv_individuals = None
    if sv_file:
        vcf_info = check_vcf(sv_file, "sv")
        nr_sv_variants = vcf_info["nr_variants"]
        vcf_files.append(sv_file)
        sv_individuals = vcf_info["individuals"]

    # If a gq threshold is used the variants need to have GQ
    for _vcf_file in vcf_files:
        # Get a cyvcf2.VCF object
        vcf = get_vcf(_vcf_file)

        if gq_treshold and not vcf.contains("GQ"):
            LOG.warning("Set gq-treshold to 0 or add info to vcf %s", _vcf_file)
            raise SyntaxError("GQ is not defined in vcf header")

    # Get a ped_parser.Family object from family file
    family = None
    family_id = None
    if family_file:
        with open(family_file, "r") as family_lines:
            family = get_case(family_lines=family_lines,
                              family_type=family_type)
            family_id = family.family_id

    # There has to be a case_id or a family at this stage.
    # An explicitly given case_id takes precedence over the family id.
    case_id = case_id or family_id

    # Convert information to a loqusdb Case object
    case_obj = build_case(
        case=family,
        case_id=case_id,
        vcf_path=variant_file,
        vcf_individuals=vcf_individuals,
        nr_variants=nr_variants,
        vcf_sv_path=sv_file,
        sv_individuals=sv_individuals,
        nr_sv_variants=nr_sv_variants,
    )

    # Only cases that are already in the database can be updated
    existing_case = adapter.case(case_obj)
    if not existing_case:
        raise CaseError("Case {} does not exist in database".format(
            case_obj["case_id"]))

    # Update the existing case in database
    case_obj = load_case(
        adapter=adapter,
        case_obj=case_obj,
        update=True,
    )

    nr_inserted = 0
    # If case was successfully updated we can store the variants.
    # Each case key is paired with the variant type stored under it.
    for file_type, variant_type in (("vcf_path", "snv"), ("vcf_sv_path", "sv")):
        if case_obj.get(file_type) is None:
            continue

        vcf_obj = get_vcf(case_obj[file_type])
        try:
            nr_inserted += load_variants(
                adapter=adapter,
                vcf_obj=vcf_obj,
                case_obj=case_obj,
                skip_case_id=skip_case_id,
                gq_treshold=gq_treshold,
                max_window=max_window,
                variant_type=variant_type,
            )
        except Exception as err:
            # If something went wrong do a rollback, handing over the
            # pre-update case, then let the error propagate
            LOG.warning(err)
            delete(
                adapter=adapter,
                case_obj=case_obj,
                update=True,
                existing_case=existing_case,
            )
            raise
    return nr_inserted