def test_parse_rank_results_header(variant_clinical_file):
    """Check that the rank results header parses to a non-empty list.

    The parsed header must be a list, must not be empty and must contain
    the "Consequence" entry.
    """
    ## GIVEN a VCF object opened from the clinical variant file
    clinical_vcf = VCF(variant_clinical_file)

    ## WHEN the rank results header is parsed from that VCF
    header_fields = parse_rank_results_header(clinical_vcf)

    ## THEN a populated list that includes "Consequence" is returned
    assert isinstance(header_fields, list)
    assert header_fields
    assert "Consequence" in header_fields
def test_parse_rank_results_header(variant_clinical_file):
    """Parse the rank results header from the clinical VCF and verify it."""
    ## GIVEN a vcf object built from the clinical variant file
    ## WHEN its rank results header is parsed
    parsed_header = parse_rank_results_header(VCF(variant_clinical_file))

    ## THEN the result is a list ...
    assert isinstance(parsed_header, list)
    ## ... that is not empty ...
    assert parsed_header
    ## ... and that lists the 'Consequence' category
    assert "Consequence" in parsed_header
Beispiel #3
0
def rank_results_header(request, variant_clinical_file):
    """Return the parsed rank results header of the clinical variant file.

    Args:
        request: Accepted but not used in the body.
        variant_clinical_file: Passed to ``VCF`` to open the variant file.

    Returns:
        Whatever ``parse_rank_results_header`` yields for the opened VCF.
    """
    # NOTE(review): the log text mentions a VCF parser, but the value
    # returned is the parsed rank results header.
    LOG.info("Return a VCF parser with one variant")
    return parse_rank_results_header(VCF(variant_clinical_file))
    def load_variants(
        self,
        case_obj,
        variant_type="clinical",
        category="snv",
        rank_threshold=None,
        chrom=None,
        start=None,
        end=None,
        gene_obj=None,
        build="37",
    ):
        """Load variants for a case into scout.

        Load the variants for a specific analysis type and category into scout.
        If no region is specified, load all variants above rank score threshold.
        If region or gene is specified, load all variants from that region
        disregarding variant rank (if not specified).

        Args:
            case_obj(dict): A case from the scout database
            variant_type(str): 'clinical' or 'research'. Default: 'clinical'
            category(str): 'snv', 'sv', 'str', 'cancer' or 'cancer_sv'
                ('str' is only looked up for clinical). Default: 'snv'
            rank_threshold(float): Only load variants above this score.
                A falsy value is replaced by 0 for whole-file loads and by
                -1000 for region loads.
            chrom(str): Load variants from a certain chromosome
            start(int): Specify the start position
            end(int): Specify the end position
            gene_obj(dict): A gene object from the database. When given, its
                chromosome and its coordinates padded by 5000 replace
                chrom, start and end.
            build(str): Forwarded unchanged to ``_load_variants``. Default: '37'

        Returns:
            nr_inserted(int): 0 when the variant file holds no variants,
                otherwise the value returned by ``_load_variants``.

        Raises:
            SyntaxError: If the case has no file for the requested
                variant_type/category, or if a chromosome is given without
                both start and end.
        """
        # We need the institute object
        institute_id = self.institute(institute_id=case_obj["owner"])["_id"]
        nr_inserted = 0

        # Key in case_obj["vcf_files"] for every supported combination
        vcf_file_keys = {
            ("clinical", "snv"): "vcf_snv",
            ("clinical", "sv"): "vcf_sv",
            ("clinical", "str"): "vcf_str",
            # Currently the cancer categories imply a paired tumor normal
            ("clinical", "cancer"): "vcf_cancer",
            ("clinical", "cancer_sv"): "vcf_cancer_sv",
            ("research", "snv"): "vcf_snv_research",
            ("research", "sv"): "vcf_sv_research",
            ("research", "cancer"): "vcf_cancer_research",
            ("research", "cancer_sv"): "vcf_cancer_sv_research",
        }
        variant_file = None
        file_key = vcf_file_keys.get((variant_type, category))
        if file_key:
            if file_key == "vcf_str":
                LOG.debug("Attempt to load STR VCF.")
            variant_file = case_obj["vcf_files"].get(file_key)

        if not variant_file:
            raise SyntaxError("Vcf file does not seem to exist")

        # Check if there are any variants in file by reading the first record
        try:
            next(VCF(variant_file))
        except StopIteration:
            LOG.warning("Variant file %s does not include any variants",
                        variant_file)
            return nr_inserted
        # We need to reload the file since the check consumed a record
        vcf_obj = VCF(variant_file)

        # Parse the necessary headers from vcf file
        rank_results_header = parse_rank_results_header(vcf_obj)
        vep_header = parse_vep_header(vcf_obj)
        if vep_header:
            LOG.info("Found VEP header %s", "|".join(vep_header))

        # This is a dictionary to tell where ind are in vcf
        individual_positions = {
            ind: i for i, ind in enumerate(vcf_obj.samples)
        }

        # Dictionary for cancer analysis: phenotype 2 is labelled "case",
        # everything else "control"
        sample_info = {}
        if category in ("cancer", "cancer_sv"):
            for ind in case_obj["individuals"]:
                if ind["phenotype"] == 2:
                    sample_info[ind["individual_id"]] = "case"
                else:
                    sample_info[ind["individual_id"]] = "control"

        # Check if a region should be uploaded
        region = ""
        if gene_obj:
            chrom = gene_obj["chromosome"]
            # Add same padding as VEP. Clamp at 1 rather than 0: a start of 0
            # is falsy and would be rejected by the check below, which made
            # genes within 5000 bases of the chromosome start impossible to load.
            start = max(gene_obj["start"] - 5000, 1)
            end = gene_obj["end"] + 5000
        if chrom:
            # We want to load all variants in the region regardless of rank score
            rank_threshold = rank_threshold or -1000
            if not (start and end):
                raise SyntaxError("Specify chrom start and end")
            region = "{0}:{1}-{2}".format(chrom, start, end)
        else:
            rank_threshold = rank_threshold or 0

        # An empty region string is passed when no chromosome was requested
        variants = vcf_obj(region)

        try:
            nr_inserted = self._load_variants(
                variants=variants,
                variant_type=variant_type,
                case_obj=case_obj,
                individual_positions=individual_positions,
                rank_threshold=rank_threshold,
                institute_id=institute_id,
                build=build,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                category=category,
                sample_info=sample_info,
            )
        except Exception:
            LOG.exception("unexpected error")
            LOG.warning("Deleting inserted variants")
            # Clean up for this case and variant type, then re-raise unchanged
            self.delete_variants(case_obj["_id"], variant_type)
            raise

        self.update_variant_rank(case_obj, variant_type, category=category)

        return nr_inserted
    def load_variants(self, case_obj, variant_type='clinical', category='snv',
                      rank_threshold=None, chrom=None, start=None, end=None,
                      gene_obj=None, build='37'):
        """Load variants for a case into scout.

        Load the variants for a specific analysis type and category into scout.
        If no region is specified, load all variants above rank score threshold.
        If region or gene is specified, load all variants from that region
        disregarding variant rank (if not specified).

        Args:
            case_obj(dict): A case from the scout database
            variant_type(str): 'clinical' or 'research'. Default: 'clinical'
            category(str): 'snv', 'sv', 'str' or 'cancer' ('str' is only
                looked up for clinical). Default: 'snv'
            rank_threshold(float): Only load variants above this score.
                A falsy value is replaced by 0 for whole-file loads and by
                -1000 for region loads.
            chrom(str): Load variants from a certain chromosome
            start(int): Specify the start position
            end(int): Specify the end position
            gene_obj(dict): A gene object from the database. When given, its
                chromosome and its coordinates padded by 5000 replace
                chrom, start and end.
            build(str): Forwarded unchanged to ``_load_variants``. Default: '37'

        Returns:
            nr_inserted(int): 0 when the variant file holds no variants,
                otherwise the value returned by ``_load_variants``.

        Raises:
            SyntaxError: If the case has no file for the requested
                variant_type/category, or if a chromosome is given without
                both start and end.
        """
        # We need the institute object
        institute_id = self.institute(institute_id=case_obj['owner'])['_id']
        nr_inserted = 0

        # Key in case_obj['vcf_files'] for every supported combination
        vcf_file_keys = {
            ('clinical', 'snv'): 'vcf_snv',
            ('clinical', 'sv'): 'vcf_sv',
            ('clinical', 'str'): 'vcf_str',
            # Currently this implies a paired tumor normal
            ('clinical', 'cancer'): 'vcf_cancer',
            ('research', 'snv'): 'vcf_snv_research',
            ('research', 'sv'): 'vcf_sv_research',
            ('research', 'cancer'): 'vcf_cancer_research',
        }
        variant_file = None
        file_key = vcf_file_keys.get((variant_type, category))
        if file_key:
            if file_key == 'vcf_str':
                LOG.debug('Attempt to load STR VCF.')
            variant_file = case_obj['vcf_files'].get(file_key)

        if not variant_file:
            raise SyntaxError("Vcf file does not seem to exist")

        # Check if there are any variants in file by reading the first record
        try:
            next(VCF(variant_file))
        except StopIteration:
            LOG.warning("Variant file %s does not include any variants", variant_file)
            return nr_inserted
        # We need to reload the file since the check consumed a record
        vcf_obj = VCF(variant_file)

        # Parse the necessary headers from vcf file
        rank_results_header = parse_rank_results_header(vcf_obj)
        vep_header = parse_vep_header(vcf_obj)

        # This is a dictionary to tell where ind are in vcf
        individual_positions = {
            ind: i for i, ind in enumerate(vcf_obj.samples)
        }

        # Dictionary for cancer analysis: phenotype 2 is labelled 'case',
        # everything else 'control'
        sample_info = {}
        if category == 'cancer':
            for ind in case_obj['individuals']:
                if ind['phenotype'] == 2:
                    sample_info[ind['individual_id']] = 'case'
                else:
                    sample_info[ind['individual_id']] = 'control'

        # Check if a region should be uploaded
        region = ""
        if gene_obj:
            chrom = gene_obj['chromosome']
            # Add same padding as VEP. Clamp at 1 rather than 0: a start of 0
            # is falsy and would be rejected by the check below, which made
            # genes within 5000 bases of the chromosome start impossible to load.
            start = max(gene_obj['start'] - 5000, 1)
            end = gene_obj['end'] + 5000
        if chrom:
            # We want to load all variants in the region regardless of rank score
            rank_threshold = rank_threshold or -1000
            if not (start and end):
                raise SyntaxError("Specify chrom start and end")
            region = "{0}:{1}-{2}".format(chrom, start, end)
        else:
            rank_threshold = rank_threshold or 0

        # An empty region string is passed when no chromosome was requested
        variants = vcf_obj(region)

        try:
            nr_inserted = self._load_variants(
                variants=variants,
                variant_type=variant_type,
                case_obj=case_obj,
                individual_positions=individual_positions,
                rank_threshold=rank_threshold,
                institute_id=institute_id,
                build=build,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                category=category,
                sample_info=sample_info,
            )
        except Exception:
            LOG.exception('unexpected error')
            LOG.warning("Deleting inserted variants")
            # Clean up for this case and variant type, then re-raise unchanged
            self.delete_variants(case_obj['_id'], variant_type)
            raise

        self.update_variant_rank(case_obj, variant_type, category=category)

        return nr_inserted
Beispiel #6
0
def load_variants(adapter, variant_file, case_obj, variant_type='clinical',
                  category='snv', rank_threshold=5, chrom=None, start=None,
                  end=None):
    """Load the variants of a VCF file into the database.

    Every record of the file is scored. A record is considered for loading
    when coordinates are given or when its rank score is above
    ``rank_threshold``; with coordinates it must also pass
    ``check_coordinates``. Selected records are parsed, built and loaded.
    An ``IntegrityError`` from loading a single variant is skipped.

        Args:
            adapter(MongoAdapter)
            variant_file(str): Path to variant file
            case_obj(Case)
            variant_type(str)
            category(str): 'snv' or 'sv' (not used in the body)
            rank_threshold(int)
            chrom(str): When set, chrom/start/end are used as coordinates
            start(int)
            end(int)

        Raises:
            IntegrityError: If the institute owning the case does not exist.
            Exception: Any error raised while loading is re-raised; when no
                coordinates were given the variants of the case and variant
                type are deleted first.
    """

    institute_obj = adapter.institute(institute_id=case_obj['owner'])

    if not institute_obj:
        raise IntegrityError("Institute {0} does not exist in"
                             " database.".format(case_obj['owner']))

    gene_to_panels = adapter.gene_to_panels()

    hgncid_to_gene = adapter.hgncid_to_gene()

    vcf_obj = VCF(variant_file)

    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # This is a dictionary to tell where ind are in vcf
    individual_positions = {ind: i for i, ind in enumerate(vcf_obj.samples)}

    logger.info("Start inserting variants into database")
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # Start at -1 so that an empty file is reported as holding 0 variants
    nr_variants = -1
    nr_inserted = 0

    coordinates = None
    if chrom:
        coordinates = {
            'chrom': chrom,
            'start': start,
            'end': end
        }

    try:
        for nr_variants, variant in enumerate(vcf_obj):
            # Report parsing progress for every 5000th record, whether or
            # not that record is selected for loading
            if nr_variants != 0 and nr_variants % 5000 == 0:
                logger.info("%s variants parsed", nr_variants)
                logger.info("Time to parse variants: %s ",
                            datetime.now() - start_five_thousand)
                start_five_thousand = datetime.now()

            rank_score = parse_rank_score(
                variant.INFO.get('RankScore'),
                case_obj['display_name']
            )

            if not (coordinates or (rank_score > rank_threshold)):
                continue

            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions
            )
            # If there are coordinates the variant must lie within them
            if coordinates and not check_coordinates(parsed_variant, coordinates):
                continue

            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_obj['_id'],
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
            )
            try:
                load_variant(adapter, variant_obj)
            except IntegrityError:
                # Deliberately skipped (presumably the variant is already loaded)
                continue

            nr_inserted += 1
            # Report once per 1000 successful inserts
            if nr_inserted % 1000 == 0:
                logger.info("%s variants inserted", nr_inserted)

    except Exception:
        # Region loads are left in place; a failed full load is rolled back
        if not coordinates:
            logger.warning("Deleting inserted variants")
            delete_variants(adapter, case_obj, variant_type)
        raise

    logger.info("All variants inserted.")
    logger.info("Number of variants in file: {0}".format(nr_variants + 1))
    logger.info("Number of variants inserted: {0}".format(nr_inserted))
    logger.info("Time to insert variants:{0}".format(datetime.now() - start_insertion))
def test_compounds_region(real_populated_database, case_obj,
                          variant_clinical_file):
    """Compound entries gain 'genes' and 'not_loaded' after a case update.

    All variants of the clinical file are inserted directly into the variant
    collection. Their compounds must then lack the 'genes' and 'not_loaded'
    keys. After ``update_case_compounds`` has run, every compound of a
    variant whose hgnc ids are all known to the adapter must carry both keys.
    """
    adapter = real_populated_database

    ## GIVEN a database without any variants
    assert adapter.variant_collection.find_one() is None

    institute_id = adapter.institute_collection.find_one()["_id"]

    ## WHEN loading variants into the database without updating compound information
    vcf_obj = VCF(variant_clinical_file)
    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # Map each sample name to its column position in the VCF
    sample_index = {sample: idx for idx, sample in enumerate(vcf_obj.samples)}

    variants = [
        build_variant(
            variant=parse_variant(
                variant=record,
                case=case_obj,
                variant_type="clinical",
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=sample_index,
                category="snv",
            ),
            institute_id=institute_id,
        )
        for record in vcf_obj
    ]

    # Load all variants
    adapter.variant_collection.insert_many(variants)

    print("Nr variants: {0}".format(len(variants)))

    ## THEN assert that the variants does not have updated compound information
    nr_compounds = 0
    for var in adapter.variant_collection.find():
        for comp in var.get("compounds") or []:
            assert "genes" not in comp
            assert "not_loaded" not in comp
            nr_compounds += 1

    assert nr_compounds > 0

    ## WHEN updating all compounds for a case
    adapter.update_case_compounds(case_obj)
    known_hgnc_ids = {gene["hgnc_id"] for gene in adapter.all_genes()}

    nr_compounds = 0
    ## THEN assert that all compounds (within the gene definition) are updated
    for var in adapter.variant_collection.find():
        # Skip variants that refer to a gene the adapter does not know
        if any(hgnc_id not in known_hgnc_ids for hgnc_id in var["hgnc_ids"]):
            continue
        for comp in var.get("compounds") or []:
            nr_compounds += 1
            assert "genes" in comp
            assert "not_loaded" in comp
    assert nr_compounds > 0
Beispiel #8
0
    def load_variants(self,
                      case_obj,
                      variant_type='clinical',
                      category='snv',
                      rank_threshold=None,
                      chrom=None,
                      start=None,
                      end=None,
                      gene_obj=None,
                      build='37'):
        """Load variants for a case into scout.

        Load the variants for a specific analysis type and category into scout.
        If no region is specified, load all variants above rank score threshold.
        If region or gene is specified, load all variants from that region
        disregarding variant rank (if not specified).

        Args:
            case_obj(dict): A case from the scout database
            variant_type(str): 'clinical' or 'research'. Default: 'clinical'
            category(str): 'snv', 'sv' or 'cancer'. Default: 'snv'
            rank_threshold(float): Only load variants above this score.
                A falsy value is replaced by 0 for whole-file loads and by
                -1000 for region loads.
            chrom(str): Load variants from a certain chromosome
            start(int): Specify the start position
            end(int): Specify the end position
            gene_obj(dict): A gene object from the database. When given, its
                chromosome and its coordinates padded by 5000 replace
                chrom, start and end.
            build(str): Forwarded unchanged to ``_load_variants``. Default: '37'

        Returns:
            nr_inserted(int): The value returned by ``_load_variants``.

        Raises:
            SyntaxError: If the case has no file for the requested
                variant_type/category, or if a chromosome is given without
                both start and end.
        """
        # We need the institute object
        institute_id = self.institute(institute_id=case_obj['owner'])['_id']

        # Key in case_obj['vcf_files'] for every supported combination
        vcf_file_keys = {
            ('clinical', 'snv'): 'vcf_snv',
            ('clinical', 'sv'): 'vcf_sv',
            # Currently this implies a paired tumor normal
            ('clinical', 'cancer'): 'vcf_cancer',
            ('research', 'snv'): 'vcf_snv_research',
            ('research', 'sv'): 'vcf_sv_research',
            ('research', 'cancer'): 'vcf_cancer_research',
        }
        variant_file = None
        file_key = vcf_file_keys.get((variant_type, category))
        if file_key:
            variant_file = case_obj['vcf_files'].get(file_key)

        if not variant_file:
            raise SyntaxError("Vcf file does not seem to exist")

        vcf_obj = VCF(variant_file)

        # Parse the necessary headers from vcf file
        rank_results_header = parse_rank_results_header(vcf_obj)
        vep_header = parse_vep_header(vcf_obj)

        # This is a dictionary to tell where ind are in vcf
        individual_positions = {
            ind: i for i, ind in enumerate(vcf_obj.samples)
        }

        # Dictionary for cancer analysis: phenotype 2 is labelled 'case',
        # everything else 'control'
        sample_info = {}
        if category == 'cancer':
            for ind in case_obj['individuals']:
                if ind['phenotype'] == 2:
                    sample_info[ind['individual_id']] = 'case'
                else:
                    sample_info[ind['individual_id']] = 'control'

        # Check if a region should be uploaded
        region = ""
        if gene_obj:
            chrom = gene_obj['chromosome']
            # Add same padding as VEP. Clamp at 1 rather than 0: a start of 0
            # is falsy and would be rejected by the check below, which made
            # genes within 5000 bases of the chromosome start impossible to load.
            start = max(gene_obj['start'] - 5000, 1)
            end = gene_obj['end'] + 5000
        if chrom:
            # We want to load all variants in the region regardless of rank score
            rank_threshold = rank_threshold or -1000
            if not (start and end):
                raise SyntaxError("Specify chrom start and end")
            region = "{0}:{1}-{2}".format(chrom, start, end)
        else:
            rank_threshold = rank_threshold or 0

        # An empty region string is passed when no chromosome was requested
        variants = vcf_obj(region)

        try:
            nr_inserted = self._load_variants(
                variants=variants,
                variant_type=variant_type,
                case_obj=case_obj,
                individual_positions=individual_positions,
                rank_threshold=rank_threshold,
                institute_id=institute_id,
                build=build,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                category=category,
                sample_info=sample_info)
        except Exception:
            LOG.exception('unexpected error')
            LOG.warning("Deleting inserted variants")
            # Clean up for this case and variant type, then re-raise unchanged
            self.delete_variants(case_obj['_id'], variant_type)
            raise

        self.update_variant_rank(case_obj, variant_type, category=category)

        return nr_inserted
Beispiel #9
0
    def load_variants(self, case_obj, variant_type='clinical', category='snv',
                      rank_threshold=None, chrom=None, start=None, end=None,
                      gene_obj=None):
        """Load variants for a case into scout.

        Load all variants for a specific analysis type and category into scout.
        If no region is specified, load all variants above rank score threshold.
        If region or gene is specified, load all variants from that region
        disregarding variant rank (if not specified).

        Args:
            case_obj(dict): A case from the scout database
            variant_type(str): 'clinical' or 'research'. Default: 'clinical'
            category(str): 'snv' or 'sv'. Default: 'snv'
            rank_threshold(float): Only load variants above this score.
                A falsy value is replaced by 5 for whole-file loads and by
                -100 for region loads.
            chrom(str): Load variants from a certain chromosome
            start(int): Specify the start position
            end(int): Specify the end position
            gene_obj(dict): A gene object from the database. When given, its
                chromosome, start and end replace chrom, start and end.

        Returns:
            nr_inserted(int): Number of variants that were loaded without an
                IntegrityError.

        Raises:
            IntegrityError: If the institute owning the case does not exist.
            SyntaxError: If the case has no file for the requested
                variant_type/category, or if a chromosome is given without
                both start and end.
        """
        institute_obj = self.institute(institute_id=case_obj['owner'])

        if not institute_obj:
            raise IntegrityError("Institute {0} does not exist in"
                                 " database.".format(case_obj['owner']))
        gene_to_panels = self.gene_to_panels()

        hgncid_to_gene = self.hgncid_to_gene()

        # Key in case_obj['vcf_files'] for every supported combination
        vcf_file_keys = {
            ('clinical', 'snv'): 'vcf_snv',
            ('clinical', 'sv'): 'vcf_sv',
            ('research', 'snv'): 'vcf_snv_research',
            ('research', 'sv'): 'vcf_sv_research',
        }
        variant_file = None
        file_key = vcf_file_keys.get((variant_type, category))
        if file_key:
            variant_file = case_obj['vcf_files'].get(file_key)

        if not variant_file:
            raise SyntaxError("Vcf file does not seem to exist")

        vcf_obj = VCF(variant_file)

        # Parse the necessary headers from vcf file
        rank_results_header = parse_rank_results_header(vcf_obj)
        vep_header = parse_vep_header(vcf_obj)

        # This is a dictionary to tell where ind are in vcf
        individual_positions = {
            ind: i for i, ind in enumerate(vcf_obj.samples)
        }

        # Check if a region should be uploaded
        region = ""
        if gene_obj:
            chrom = gene_obj['chromosome']
            start = gene_obj['start']
            end = gene_obj['end']
        if chrom:
            rank_threshold = rank_threshold or -100
            if not (start and end):
                raise SyntaxError("Specify chrom start and end")
            region = "{0}:{1}-{2}".format(chrom, start, end)
        else:
            rank_threshold = rank_threshold or 5

        logger.info("Start inserting variants into database")
        start_five_thousand = datetime.now()
        # Number of variants that meet the criteria and get inserted
        nr_inserted = 0

        try:
            # nr_variants is the index of the record being parsed
            for nr_variants, variant in enumerate(vcf_obj(region)):
                # Report parsing progress for every 5000th record, whether
                # or not that record passes the rank score threshold
                if nr_variants != 0 and nr_variants % 5000 == 0:
                    logger.info("%s variants parsed", nr_variants)
                    logger.info("Time to parse variants: %s ",
                                datetime.now() - start_five_thousand)
                    start_five_thousand = datetime.now()

                rank_score = parse_rank_score(
                    variant.INFO.get('RankScore'),
                    case_obj['display_name']
                )

                if not (rank_score > rank_threshold):
                    continue

                # Parse the vcf variant
                parsed_variant = parse_variant(
                    variant=variant,
                    case=case_obj,
                    variant_type=variant_type,
                    rank_results_header=rank_results_header,
                    vep_header=vep_header,
                    individual_positions=individual_positions
                )
                # Build the variant object
                variant_obj = build_variant(
                    variant=parsed_variant,
                    institute_id=institute_obj['_id'],
                    gene_to_panels=gene_to_panels,
                    hgncid_to_gene=hgncid_to_gene,
                )
                try:
                    self.load_variant(variant_obj)
                except IntegrityError:
                    # Deliberately skipped (presumably the variant is
                    # already loaded)
                    continue

                nr_inserted += 1
                # Report once per 1000 successful inserts
                if nr_inserted % 1000 == 0:
                    logger.info("%s variants inserted", nr_inserted)

        except Exception:
            logger.warning("Deleting inserted variants")
            # Clean up for this case and variant type, then re-raise unchanged
            self.delete_variants(case_obj['_id'], variant_type)
            raise

        return nr_inserted
Beispiel #10
0
def load_variants(adapter,
                  variant_file,
                  case_obj,
                  variant_type='clinical',
                  category='snv',
                  rank_threshold=6,
                  chrom=None,
                  start=None,
                  end=None):
    """Load the variants of a VCF file into the database.

    A record is loaded when one of the following holds:
      * coordinates are given and ``check_coordinates`` accepts the record;
      * no coordinates are given and the record is on chromosome 'MT';
      * no coordinates are given and its rank score is at least
        ``rank_threshold``.
    Selected records are parsed, built and loaded. An ``IntegrityError``
    from loading a single variant is skipped.

        Args:
            adapter(MongoAdapter)
            variant_file(str): Path to variant file
            case_obj(Case)
            variant_type(str)
            category(str): 'snv' or 'sv' (not used in the body)
            rank_threshold(int)
            chrom(str): When set, chrom/start/end are used as coordinates
            start(int)
            end(int)

        Raises:
            IntegrityError: If the institute owning the case does not exist.
            Exception: Any error raised while loading is re-raised; when no
                coordinates were given the variants of the case and variant
                type are deleted first.
    """

    institute_obj = adapter.institute(institute_id=case_obj['owner'])

    if not institute_obj:
        raise IntegrityError("Institute {0} does not exist in"
                             " database.".format(case_obj['owner']))

    gene_to_panels = adapter.gene_to_panels()

    hgncid_to_gene = adapter.hgncid_to_gene()

    vcf_obj = VCF(variant_file)

    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # This is a dictionary to tell where ind are in vcf
    individual_positions = {ind: i for i, ind in enumerate(vcf_obj.samples)}

    LOG.info("Start inserting variants into database")
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # To get it right if the file is empty
    nr_variants = -1
    nr_inserted = 0

    coordinates = None
    if chrom:
        coordinates = {'chrom': chrom, 'start': start, 'end': end}

    try:
        for nr_variants, variant in enumerate(vcf_obj):

            # Get the necessary coordinates
            # Parse away any chr CHR prefix
            # NOTE(review): assumes CHR_PATTERN matches every CHROM value;
            # a failed match would raise AttributeError here - confirm.
            chrom_match = CHR_PATTERN.match(variant.CHROM)
            # Kept in its own name so the ``chrom`` argument is not overwritten
            variant_chrom = chrom_match.group(2)
            position = variant.POS

            add_variant = False

            # If coordinates are specified we want to upload all variants that
            # reside within the specified region
            if coordinates:
                if check_coordinates(variant_chrom, position, coordinates):
                    add_variant = True
            # If there are no coordinates we always want to load MT variants
            elif variant_chrom == 'MT':
                add_variant = True
            # Otherwise we need to check if the rank score requirement is fulfilled
            else:
                rank_score = parse_rank_score(variant.INFO.get('RankScore'),
                                              case_obj['display_name'])
                if rank_score >= rank_threshold:
                    add_variant = True

            # Log the number of variants parsed
            if nr_variants != 0 and nr_variants % 5000 == 0:
                LOG.info("%s variants parsed", nr_variants)
                LOG.info("Time to parse variants: %s ",
                         datetime.now() - start_five_thousand)
                start_five_thousand = datetime.now()

            if not add_variant:
                continue

            ####### Here we know that the variant should be loaded #########
            # We follow the scout paradigm of parse -> build -> load

            # Parse the variant
            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions)

            # Build the variant object
            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_obj['_id'],
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
            )

            # Load the variant object
            # We could get integrity error here since if we want to load all variants of a region
            # there will likely already be variants from that region loaded
            try:
                load_variant(adapter, variant_obj)
            except IntegrityError:
                continue

            nr_inserted += 1
            # Log number of inserted variants, once per 1000 successful inserts
            if nr_inserted % 1000 == 0:
                LOG.info("%s variants inserted", nr_inserted)

    except Exception:
        # Region loads are left in place; a failed full load is rolled back
        if not coordinates:
            LOG.warning("Deleting inserted variants")
            delete_variants(adapter, case_obj, variant_type)
        raise

    LOG.info("All variants inserted.")
    LOG.info("Number of variants in file: {0}".format(nr_variants + 1))
    LOG.info("Number of variants inserted: {0}".format(nr_inserted))
    LOG.info("Time to insert variants:{0}".format(datetime.now() -
                                                  start_insertion))
def test_compounds_region(real_populated_database, case_obj, variant_clinical_file):
    """Check that updating the compounds of a case annotates every compound.

    When loading variants normally, only the ones with a rank score above a
    threshold are loaded, which implies that some compounds will have the
    status 'not_loaded'=True. When all variants for a region are loaded,
    all compounds should have the status 'not_loaded'=False.

    Here every variant in the VCF is parsed, built and inserted directly into
    the variant collection. The test asserts that the compounds carry neither
    a 'genes' nor a 'not_loaded' key before ``update_case_compounds`` is
    called, and that both keys are present afterwards for variants whose
    genes all exist in the database.

    Args:
        real_populated_database: adapter fixture; the test reads its
            institute and variant collections
        case_obj: case fixture passed to the parser and the compound update
        variant_clinical_file: VCF file fixture opened with ``VCF``
    """
    adapter = real_populated_database
    variant_type = 'clinical'
    category = 'snv'

    ## GIVEN a database without any variants
    # find_one() is used instead of Cursor.count(), which was removed in PyMongo 4
    assert adapter.variant_collection.find_one() is None

    institute_obj = adapter.institute_collection.find_one()
    institute_id = institute_obj['_id']

    ## WHEN loading variants into the database without updating compound information
    vcf_obj = VCF(variant_clinical_file)
    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # Map each sample name to its column position in the VCF
    individual_positions = {ind: i for i, ind in enumerate(vcf_obj.samples)}

    # Follow the parse -> build paradigm for every variant in the file
    variants = []
    for variant in vcf_obj:
        parsed_variant = parse_variant(
            variant=variant,
            case=case_obj,
            variant_type=variant_type,
            rank_results_header=rank_results_header,
            vep_header=vep_header,
            individual_positions=individual_positions,
            category=category,
        )
        variant_obj = build_variant(
            variant=parsed_variant,
            institute_id=institute_id,
        )
        variants.append(variant_obj)

    # Load all variants
    adapter.variant_collection.insert_many(variants)

    ## THEN assert that the variants do not have updated compound information
    nr_compounds = 0
    for var in adapter.variant_collection.find():
        # Variants without compounds (missing or empty) are skipped
        for comp in var.get('compounds') or []:
            assert 'genes' not in comp
            assert 'not_loaded' not in comp
            nr_compounds += 1

    assert nr_compounds > 0

    ## WHEN updating all compounds for a case
    adapter.update_case_compounds(case_obj)
    known_hgnc_ids = {gene['hgnc_id'] for gene in adapter.all_genes()}

    nr_compounds = 0
    ## THEN assert that all compounds (within the gene definition) are updated
    for var in adapter.variant_collection.find():
        # Only variants where every gene exists in the database are checked
        if any(hgnc_id not in known_hgnc_ids for hgnc_id in var['hgnc_ids']):
            continue
        for comp in var.get('compounds') or []:
            nr_compounds += 1
            assert 'genes' in comp
            assert 'not_loaded' in comp

    assert nr_compounds > 0