def test_parse_rank_score():
    """The score listed for the requested family is returned as a number."""
    ## GIVEN a rank score entry that belongs to family '123'
    case_id = '123'
    raw_entry = "123:10"
    expected_score = 10.0

    ## WHEN parsing the entry for that family
    result = parse_rank_score(raw_entry, case_id)

    ## THEN the parsed score matches the expected one
    assert expected_score == result
Example #2
0
def test_parse_rank_score_no_score():
    """An empty rank score string yields no rank score at all."""
    ## GIVEN a empty rank score string
    rank_scores_info = ""
    family_id = "123"
    ## WHEN parsing the rank score
    parsed_rank_score = parse_rank_score(rank_scores_info, family_id)
    ## THEN assert that None is returned
    # Identity check: ``== None`` would also pass for any object whose
    # __eq__ claims equality with None.
    assert parsed_rank_score is None
def test_parse_rank_score():
    """Parsing "123:10" for family '123' gives the score 10.0."""
    # Input entry and the family whose score is requested
    entry, family = "123:10", '123'
    wanted = 10.0

    parsed = parse_rank_score(entry, family)

    assert wanted == parsed
Example #4
0
def test_parse_rank_score():
    """A genmod style rank score string is parsed into its numeric score."""
    ## GIVEN a genmod formatted rank score string, the family it refers to
    ## and the score we expect to get back
    family = "123"
    genmod_entry = "123:10"
    expected = 10.0
    ## WHEN the string is parsed for that family
    observed = parse_rank_score(genmod_entry, family)
    ## THEN the expected rank score comes back
    assert expected == observed
def test_parse_rank_score_no_score():
    """An empty rank score string yields no rank score at all."""
    ## GIVEN an empty rank score string
    rank_scores_info = ""
    family_id = '123'

    ## WHEN parsing the rank score
    parsed_rank_score = parse_rank_score(rank_scores_info, family_id)

    ## THEN None is returned (checked by identity, not by equality)
    assert parsed_rank_score is None


# def test_parse_rank_scores(variants, parsed_case):
#     """docstring for test_parse_rank_score"""
#     case_id = parsed_case['display_name']
#     for variant in variants:
#         rank_scores_dict = variant['rank_scores']
#
#         rank_score = rank_scores_dict[case_id]
#
#         assert float(rank_score) == parse_rank_score(variant, case_id)
def test_parse_rank_score_no_score():
    """Parsing an empty rank score entry must give back None."""
    ## GIVEN an empty rank score string
    rank_scores_info = ""
    family_id = '123'

    ## WHEN parsing the rank score
    parsed_rank_score = parse_rank_score(rank_scores_info, family_id)

    ## THEN None is returned; ``is`` is the correct comparison for None
    assert parsed_rank_score is None


# def test_parse_rank_scores(variants, parsed_case):
#     """docstring for test_parse_rank_score"""
#     case_id = parsed_case['display_name']
#     for variant in variants:
#         rank_scores_dict = variant['rank_scores']
#
#         rank_score = rank_scores_dict[case_id]
#
#         assert float(rank_score) == parse_rank_score(variant, case_id)
Example #7
0
    def _load_variants(
        self,
        variants,
        variant_type,
        case_obj,
        individual_positions,
        rank_threshold,
        institute_id,
        build=None,
        rank_results_header=None,
        vep_header=None,
        category="snv",
        sample_info=None,
    ):
        """Perform the loading of variants

        This is the function that loops over the variants, parse them and build the variant
        objects so they are ready to be inserted into the database.

        A variant is loaded when it has no rank score for the case, when its
        rank score is above ``rank_threshold``, when its chromosome name
        contains "MT", or when ``is_pathogenic`` flags it. Loaded variants are
        collected in bulks; a bulk is written when the coding region changes
        or when an intergenic bulk has grown past 10000 variants. Bulks that
        belong to a coding region get their compounds updated before writing.

        Args:
            variants(iterable(cyvcf2.Variant))
            variant_type(str): ['clinical', 'research']
            case_obj(dict)
            individual_positions(dict): How individuals are positioned in vcf
            rank_threshold(int): Only load variants with a rank score > this
            institute_id(str)
            build(str): Genome build, "37" is used when not given
            rank_results_header(list): Rank score categories
            vep_header(list)
            category(str): ['snv','sv','cancer','str']
            sample_info(dict): A dictionary with info about samples.
                               Strictly for cancer to tell which is tumor

        Returns:
            nr_inserted(int): Number of variants that passed the filter and
                              were placed in a bulk
        """
        build = build or "37"
        genes = list(self.all_genes(build=build))
        gene_to_panels = self.gene_to_panels(case_obj)
        hgncid_to_gene = self.hgncid_to_gene(genes=genes, build=build)
        genomic_intervals = self.get_coding_intervals(genes=genes)

        LOG.info("Start inserting {0} {1} variants into database".format(
            variant_type, category))
        start_insertion = datetime.now()
        start_five_thousand = datetime.now()
        # Number of variants read from the input. Counted from 1 so that the
        # value is the true count after the loop (0 for empty input).
        nr_variants = 0
        # Number of variants that meet the criteria and get inserted
        nr_inserted = 0

        nr_bulks = 0

        # We want to load batches of variants to reduce the number of network round trips
        bulk = {}
        current_region = None

        for nr_variants, variant in enumerate(variants, 1):
            # All MT variants are loaded
            mt_variant = "MT" in variant.CHROM
            rank_score = parse_rank_score(variant.INFO.get("RankScore"),
                                          case_obj["_id"])
            pathogenic = is_pathogenic(variant)

            # Check if the variant should be loaded at all
            # if rank score is None means there are no rank scores annotated, all variants will be loaded
            # Otherwise we load all variants above a rank score threshold
            # Except for MT and pathogenic variants which are always loaded
            if ((rank_score is None) or (rank_score > rank_threshold)
                    or mt_variant or pathogenic):
                nr_inserted += 1
                # Parse the vcf variant
                parsed_variant = parse_variant(
                    variant=variant,
                    case=case_obj,
                    variant_type=variant_type,
                    rank_results_header=rank_results_header,
                    vep_header=vep_header,
                    individual_positions=individual_positions,
                    category=category,
                )

                # Build the variant object
                variant_obj = build_variant(
                    variant=parsed_variant,
                    institute_id=institute_id,
                    gene_to_panels=gene_to_panels,
                    hgncid_to_gene=hgncid_to_gene,
                    sample_info=sample_info,
                )

                # Check if the variant is in a genomic region
                var_chrom = variant_obj["chromosome"]
                var_start = variant_obj["position"]
                # We need to make sure that the interval has a length > 0
                var_end = variant_obj["end"] + 1
                var_id = variant_obj["_id"]
                # If the bulk should be loaded or not
                load = True
                new_region = None

                intervals = genomic_intervals.get(var_chrom, IntervalTree())
                genomic_regions = intervals.overlap(var_start, var_end)

                # If the variant is in a coding region
                if genomic_regions:
                    # We know there is data here so get the interval id
                    new_region = genomic_regions.pop().data
                    # If the variant is in the same region as previous
                    # we add it to the same bulk
                    if new_region == current_region:
                        load = False

                # This is the case where the variant is intergenic
                else:
                    # If the previous variant was also intergenic we add the variant to the bulk
                    if not current_region:
                        load = False
                    # We need to have a max size of the bulk
                    if len(bulk) > 10000:
                        load = True
                # Write the bulk collected so far before starting a new one
                if load:
                    # If the variant bulk contains coding variants we want to update the compounds
                    if current_region:
                        self.update_compounds(bulk)
                    try:
                        # Load the variants
                        self.load_variant_bulk(list(bulk.values()))
                        nr_bulks += 1
                    except IntegrityError as error:
                        # Best effort: the bulk is dropped and loading goes on,
                        # but leave a trace instead of failing silently
                        LOG.debug("Bulk not loaded due to IntegrityError: %s",
                                  error)
                    bulk = {}

                current_region = new_region
                bulk[var_id] = variant_obj

                # nr_inserted was incremented above, so it is never 0 here
                if nr_inserted % 1000 == 0:
                    LOG.info("%s variants inserted", nr_inserted)

            # Progress is reported for every 5000th variant read, whether or
            # not that particular variant passed the filter
            if nr_variants % 5000 == 0:
                LOG.info("%s variants parsed", nr_variants)
                LOG.info(
                    "Time to parse variants: %s",
                    (datetime.now() - start_five_thousand),
                )
                start_five_thousand = datetime.now()

        # If the variants are in a coding region we update the compounds
        if current_region:
            self.update_compounds(bulk)

        # Load the final variant bulk
        self.load_variant_bulk(list(bulk.values()))
        nr_bulks += 1
        LOG.info("All variants inserted, time to insert variants: {0}".format(
            datetime.now() - start_insertion))

        LOG.info("Nr variants parsed: %s", nr_variants)
        LOG.info("Nr variants inserted: %s", nr_inserted)
        LOG.debug("Nr bulks inserted: %s", nr_bulks)

        return nr_inserted
Example #8
0
    def _load_variants(self, variants, variant_type, case_obj, individual_positions, rank_threshold,
                       institute_id, build=None, rank_results_header=None, vep_header=None,
                       category='snv', sample_info = None):
        """Perform the loading of variants

        This is the function that loops over the variants, parse them and build the variant
        objects so they are ready to be inserted into the database.

        A variant is loaded when it has no rank score for the case, when its
        rank score is above ``rank_threshold`` or when its chromosome name
        contains 'MT'. Loaded variants are collected in bulks; a bulk is
        written when the coding region changes or when an intergenic bulk has
        grown past 10000 variants. Bulks that belong to a coding region get
        their compounds updated before writing.

        Args:
            variants(iterable): Variants to consider; ``CHROM`` and ``INFO``
                                are read from each of them
            variant_type(str): Passed on to ``parse_variant``
            case_obj(dict): The case, its '_id' selects the rank score
            individual_positions(dict): Passed on to ``parse_variant``
            rank_threshold(int): Only load variants with a rank score > this
            institute_id(str): Passed on to ``build_variant``
            build(str): Genome build, '37' is used when not given
            rank_results_header(list): Passed on to ``parse_variant``
            vep_header(list): Passed on to ``parse_variant``
            category(str): Passed on to ``parse_variant``
            sample_info(dict): Passed on to ``build_variant``

        Returns:
            nr_inserted(int): Number of variants that passed the filter and
                              were placed in a bulk
        """
        build = build or '37'
        genes = list(self.all_genes(build=build))
        gene_to_panels = self.gene_to_panels(case_obj)
        hgncid_to_gene = self.hgncid_to_gene(genes=genes)
        genomic_intervals = self.get_coding_intervals(genes=genes)

        LOG.info("Start inserting {0} {1} variants into database".format(variant_type, category))
        start_insertion = datetime.now()
        start_five_thousand = datetime.now()
        # Number of variants read from the input. Counted from 1 so that the
        # value is the true count after the loop (0 for empty input).
        nr_variants = 0
        # Number of variants that meet the criteria and get inserted
        nr_inserted = 0

        nr_bulks = 0

        # We want to load batches of variants to reduce the number of network round trips
        bulk = {}
        current_region = None

        for nr_variants, variant in enumerate(variants, 1):
            # All MT variants are loaded
            mt_variant = 'MT' in variant.CHROM
            rank_score = parse_rank_score(variant.INFO.get('RankScore'), case_obj['_id'])

            # Check if the variant should be loaded at all
            # if rank score is None means there are no rank scores annotated, all variants will be loaded
            # Otherwise we load all variants above a rank score threshold
            # Except for MT variants where we load all variants
            if (rank_score is None) or (rank_score > rank_threshold) or mt_variant:
                nr_inserted += 1
                # Parse the vcf variant
                parsed_variant = parse_variant(
                    variant=variant,
                    case=case_obj,
                    variant_type=variant_type,
                    rank_results_header=rank_results_header,
                    vep_header=vep_header,
                    individual_positions=individual_positions,
                    category=category,
                )

                # Build the variant object
                variant_obj = build_variant(
                    variant=parsed_variant,
                    institute_id=institute_id,
                    gene_to_panels=gene_to_panels,
                    hgncid_to_gene=hgncid_to_gene,
                    sample_info=sample_info
                )

                # Check if the variant is in a genomic region
                var_chrom = variant_obj['chromosome']
                var_start = variant_obj['position']
                # We need to make sure that the interval has a length > 0
                var_end = variant_obj['end'] + 1
                var_id = variant_obj['_id']
                # If the bulk should be loaded or not
                load = True
                new_region = None

                # NOTE(review): IntervalTree.search appears to have been
                # replaced by overlap() in intervaltree 3.x - confirm which
                # version this code is pinned to before upgrading.
                genomic_regions = genomic_intervals.get(var_chrom, IntervalTree()).search(var_start, var_end)

                # If the variant is in a coding region
                if genomic_regions:
                    # We know there is data here so get the interval id
                    new_region = genomic_regions.pop().data
                    # If the variant is in the same region as previous
                    # we add it to the same bulk
                    if new_region == current_region:
                        load = False

                # This is the case where the variant is intergenic
                else:
                    # If the previous variant was also intergenic we add the variant to the bulk
                    if not current_region:
                        load = False
                    # We need to have a max size of the bulk
                    if len(bulk) > 10000:
                        load = True
                # Write the bulk collected so far before starting a new one
                if load:
                    # If the variant bulk contains coding variants we want to update the compounds
                    if current_region:
                        self.update_compounds(bulk)
                    try:
                        # Load the variants
                        self.load_variant_bulk(list(bulk.values()))
                        nr_bulks += 1
                    except IntegrityError as error:
                        # Best effort: the bulk is dropped and loading goes on,
                        # but leave a trace instead of failing silently
                        LOG.debug("Bulk not loaded due to IntegrityError: %s", error)
                    bulk = {}

                current_region = new_region
                bulk[var_id] = variant_obj

                # nr_inserted was incremented above, so it is never 0 here
                if nr_inserted % 1000 == 0:
                    LOG.info("%s variants inserted", nr_inserted)

            # Progress is reported for every 5000th variant read, whether or
            # not that particular variant passed the filter
            if nr_variants % 5000 == 0:
                LOG.info("%s variants parsed", nr_variants)
                LOG.info("Time to parse variants: %s",
                            (datetime.now() - start_five_thousand))
                start_five_thousand = datetime.now()

        # If the variants are in a coding region we update the compounds
        if current_region:
            self.update_compounds(bulk)

        # Load the final variant bulk
        self.load_variant_bulk(list(bulk.values()))
        nr_bulks += 1
        LOG.info("All variants inserted, time to insert variants: {0}".format(
            datetime.now() - start_insertion))

        LOG.info("Nr variants parsed: %s", nr_variants)
        LOG.info("Nr variants inserted: %s", nr_inserted)
        LOG.debug("Nr bulks inserted: %s", nr_bulks)

        return nr_inserted
Example #9
0
def load_variants(adapter, variant_file, case_obj, variant_type='clinical',
                  category='snv', rank_threshold=5, chrom=None, start=None,
                  end=None):
    """Load the variants of a vcf file into the database

    If ``chrom`` is given, every variant is parsed and handed to
    ``check_coordinates`` together with the requested region; the variants it
    accepts are loaded regardless of rank score. Without ``chrom`` a variant
    is loaded when its rank score is above ``rank_threshold`` or when it has
    no rank score for the case.

    If loading fails and no region was requested, the variants of the case
    and ``variant_type`` are deleted again before the error is re-raised.

        Args:
            adapter(MongoAdapter)
            variant_file(str): Path to variant file
            case_obj(dict): 'owner' and 'display_name' are read from it
            variant_type(str)
            category(str): 'snv' or 'sv'. Not used by this function
            rank_threshold(int): Only load variants with a rank score > this
            chrom(str)
            start(int)
            end(int)

        Raises:
            IntegrityError: If the institute owning the case does not exist
    """

    institute_obj = adapter.institute(institute_id=case_obj['owner'])

    if not institute_obj:
        raise IntegrityError("Institute {0} does not exist in"
                             " database.".format(case_obj['owner']))

    gene_to_panels = adapter.gene_to_panels()

    hgncid_to_gene = adapter.hgncid_to_gene()

    vcf_obj = VCF(variant_file)

    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # This is a dictionary to tell where ind are in vcf
    individual_positions = {ind: i for i, ind in enumerate(vcf_obj.samples)}

    logger.info("Start inserting variants into database")
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # Start at -1 so that an empty file is reported as 0 variants below
    nr_variants = -1
    nr_inserted = 0

    coordinates = None
    if chrom:
        coordinates = {
            'chrom': chrom,
            'start': start,
            'end': end
        }

    try:
        for nr_variants, variant in enumerate(vcf_obj):
            # nr_variants is a 0 based index, so at this point exactly
            # nr_variants variants have been handled. Reported for every
            # 5000th variant, whether or not it is loaded.
            if nr_variants != 0 and nr_variants % 5000 == 0:
                logger.info("%s variants parsed", nr_variants)
                logger.info("Time to parse variants: {} ".format(
                            datetime.now() - start_five_thousand))
                start_five_thousand = datetime.now()

            rank_score = parse_rank_score(
                variant.INFO.get('RankScore'),
                case_obj['display_name']
            )
            # None means there is no rank score for this case. Comparing None
            # with a number raises TypeError on Python 3, so such variants are
            # loaded instead of compared.
            above_threshold = rank_score is None or rank_score > rank_threshold

            if not (coordinates or above_threshold):
                continue

            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions
            )

            # When a region is requested only variants inside it are loaded
            if coordinates and not check_coordinates(parsed_variant, coordinates):
                continue

            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_obj['_id'],
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
            )
            try:
                load_variant(adapter, variant_obj)
            except IntegrityError:
                # Best effort: a variant that can not be inserted is skipped
                # (presumably it is already in the database - verify)
                continue

            nr_inserted += 1
            # Only checked right after a successful insert, so each multiple
            # of 1000 is reported once
            if nr_inserted % 1000 == 0:
                logger.info("%s variants inserted", nr_inserted)

    except Exception:
        if not coordinates:
            logger.warning("Deleting inserted variants")
            delete_variants(adapter, case_obj, variant_type)
        raise

    logger.info("All variants inserted.")
    logger.info("Number of variants in file: {0}".format(nr_variants + 1))
    logger.info("Number of variants inserted: {0}".format(nr_inserted))
    logger.info("Time to insert variants:{0}".format(datetime.now() - start_insertion))
Example #10
0
    def load_variants(self, case_obj, variant_type='clinical', category='snv',
                      rank_threshold=None, chrom=None, start=None, end=None,
                      gene_obj=None):
        """Load variants for a case into scout.

        Load all variants for a specific analysis type and category into scout.
        If no region is specified, load all variants above rank score threshold
        If region or gene is specified, load all variants from that region
        disregarding variant rank(if not specified)

        Variants that have no rank score for the case are loaded as well.
        If anything fails while loading, the variants of the case and
        ``variant_type`` are deleted before the error is re-raised.

        Args:
            case_obj(dict): A case from the scout database
            variant_type(str): 'clinical' or 'research'. Default: 'clinical'
            category(str): 'snv' or 'sv'. Default: 'snv'
            rank_threshold(float): Only load variants above this score.
                A falsy value (None or 0) is replaced by 5, or by -100 when
                a region or gene is given
            chrom(str): Load variants from a certain chromosome
            start(int): Specify the start position
            end(int): Specify the end position
            gene_obj(dict): A gene object from the database. Its
                'chromosome', 'start' and 'end' override chrom, start and end

        Returns:
            nr_inserted(int)

        Raises:
            IntegrityError: If the institute owning the case does not exist
            SyntaxError: If no vcf file is registered for the requested
                variant type and category, or if a chromosome is given
                without both start and end
        """
        institute_obj = self.institute(institute_id=case_obj['owner'])

        if not institute_obj:
            raise IntegrityError("Institute {0} does not exist in"
                                 " database.".format(case_obj['owner']))
        gene_to_panels = self.gene_to_panels()

        hgncid_to_gene = self.hgncid_to_gene()

        # Key in case_obj['vcf_files'] for each supported combination
        vcf_keys = {
            ('clinical', 'snv'): 'vcf_snv',
            ('clinical', 'sv'): 'vcf_sv',
            ('research', 'snv'): 'vcf_snv_research',
            ('research', 'sv'): 'vcf_sv_research',
        }
        vcf_key = vcf_keys.get((variant_type, category))
        variant_file = case_obj['vcf_files'].get(vcf_key) if vcf_key else None

        if not variant_file:
            raise SyntaxError("Vcf file does not seem to exist")

        vcf_obj = VCF(variant_file)

        # Parse the necessary headers from vcf file
        rank_results_header = parse_rank_results_header(vcf_obj)
        vep_header = parse_vep_header(vcf_obj)

        # This is a dictionary to tell where ind are in vcf
        individual_positions = {ind: i for i, ind in enumerate(vcf_obj.samples)}

        # Check if a region should be uploaded
        region = ""
        if gene_obj:
            chrom = gene_obj['chromosome']
            start = gene_obj['start']
            end = gene_obj['end']
        if chrom:
            # NOTE: ``or`` also replaces an explicit threshold of 0
            rank_threshold = rank_threshold or -100
            if not (start and end):
                raise SyntaxError("Specify chrom start and end")
            region = "{0}:{1}-{2}".format(chrom, start, end)
        else:
            rank_threshold = rank_threshold or 5

        logger.info("Start inserting variants into database")
        start_five_thousand = datetime.now()
        # These are the number of variants that meet the criteria and gets
        # inserted
        nr_inserted = 0

        try:
            for nr_variants, variant in enumerate(vcf_obj(region)):
                # nr_variants is a 0 based index, so at this point exactly
                # nr_variants variants have been handled. Reported for every
                # 5000th variant, whether or not it is loaded.
                if nr_variants != 0 and nr_variants % 5000 == 0:
                    logger.info("%s variants parsed", nr_variants)
                    logger.info("Time to parse variants: {} ".format(
                                datetime.now() - start_five_thousand))
                    start_five_thousand = datetime.now()

                rank_score = parse_rank_score(
                    variant.INFO.get('RankScore'),
                    case_obj['display_name']
                )

                # None means there is no rank score for this case. Comparing
                # None with a number raises TypeError on Python 3, so such
                # variants are loaded instead of compared.
                if not (rank_score is None or rank_score > rank_threshold):
                    continue

                # Parse the vcf variant
                parsed_variant = parse_variant(
                    variant=variant,
                    case=case_obj,
                    variant_type=variant_type,
                    rank_results_header=rank_results_header,
                    vep_header=vep_header,
                    individual_positions=individual_positions
                )
                # Build the variant object
                variant_obj = build_variant(
                    variant=parsed_variant,
                    institute_id=institute_obj['_id'],
                    gene_to_panels=gene_to_panels,
                    hgncid_to_gene=hgncid_to_gene,
                )
                try:
                    self.load_variant(variant_obj)
                except IntegrityError:
                    # Best effort: a variant that can not be inserted is
                    # skipped (presumably it is already loaded - verify)
                    continue

                nr_inserted += 1
                # Only checked right after a successful insert, so each
                # multiple of 1000 is reported once
                if nr_inserted % 1000 == 0:
                    logger.info("%s variants inserted", nr_inserted)

        except Exception:
            logger.warning("Deleting inserted variants")
            self.delete_variants(case_obj['_id'], variant_type)
            raise

        return nr_inserted
Example #11
0
def load_variants(adapter,
                  variant_file,
                  case_obj,
                  variant_type='clinical',
                  category='snv',
                  rank_threshold=6,
                  chrom=None,
                  start=None,
                  end=None):
    """Load the variants of a vcf file into the database

    Which variants are loaded depends on the arguments:

    - with ``chrom`` given, the variants that ``check_coordinates`` accepts
      for the requested region, regardless of rank score
    - otherwise all variants on chromosome 'MT', and the remaining variants
      when their rank score is at least ``rank_threshold`` or when they have
      no rank score for the case

    If loading fails and no region was requested, the variants of the case
    and ``variant_type`` are deleted again before the error is re-raised.

        Args:
            adapter(MongoAdapter)
            variant_file(str): Path to variant file
            case_obj(dict): 'owner' and 'display_name' are read from it
            variant_type(str)
            category(str): 'snv' or 'sv'. Not used by this function
            rank_threshold(int): Load variants with a rank score >= this
            chrom(str)
            start(int)
            end(int)

        Raises:
            IntegrityError: If the institute owning the case does not exist
    """

    institute_obj = adapter.institute(institute_id=case_obj['owner'])

    if not institute_obj:
        raise IntegrityError("Institute {0} does not exist in"
                             " database.".format(case_obj['owner']))

    gene_to_panels = adapter.gene_to_panels()

    hgncid_to_gene = adapter.hgncid_to_gene()

    vcf_obj = VCF(variant_file)

    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # This is a dictionary to tell where ind are in vcf
    individual_positions = {ind: i for i, ind in enumerate(vcf_obj.samples)}

    LOG.info("Start inserting variants into database")
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # To get it right if the file is empty
    nr_variants = -1
    nr_inserted = 0

    coordinates = None
    if chrom:
        coordinates = {'chrom': chrom, 'start': start, 'end': end}

    try:
        for nr_variants, variant in enumerate(vcf_obj):

            # Get the necessary coordinates
            # Parse away any chr CHR prefix. A separate name is used so the
            # ``chrom`` argument is not overwritten inside the loop.
            # NOTE(review): assumes CHR_PATTERN matches every CHROM value;
            # a failed match would raise AttributeError here - confirm
            chrom_match = CHR_PATTERN.match(variant.CHROM)
            variant_chrom = chrom_match.group(2)
            position = variant.POS

            # If coordinates are specified we want to upload all variants that
            # resides within the specified region
            if coordinates:
                add_variant = bool(
                    check_coordinates(variant_chrom, position, coordinates))
            # If there are no coordinates we always want to load MT variants
            elif variant_chrom == 'MT':
                add_variant = True
            # Otherwise we need to check if rank score requirements are fulfilled
            else:
                rank_score = parse_rank_score(variant.INFO.get('RankScore'),
                                              case_obj['display_name'])
                # None means there is no rank score for this case. Comparing
                # None with a number raises TypeError on Python 3, so such
                # variants are loaded instead of compared.
                add_variant = (rank_score is None
                               or rank_score >= rank_threshold)

            # Log the number of variants parsed
            if (nr_variants != 0 and nr_variants % 5000 == 0):
                LOG.info("%s variants parsed", nr_variants)
                LOG.info(
                    "Time to parse variants: {} ".format(datetime.now() -
                                                         start_five_thousand))
                start_five_thousand = datetime.now()

            if not add_variant:
                continue

            ####### Here we know that the variant should be loaded #########
            # We follow the scout paradigm of parse -> build -> load

            # Parse the variant
            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions)

            # Build the variant object
            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_obj['_id'],
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
            )

            # Load the variant object
            # We could get integrity error here since if we want to load all variants of a region
            # there will likely already be variants from that region loaded
            try:
                load_variant(adapter, variant_obj)
            except IntegrityError:
                continue

            nr_inserted += 1
            # Log number of inserted variants. Only checked right after a
            # successful insert, so each multiple of 1000 is reported once
            if nr_inserted % 1000 == 0:
                LOG.info("%s variants inserted", nr_inserted)

    except Exception:
        if not coordinates:
            LOG.warning("Deleting inserted variants")
            delete_variants(adapter, case_obj, variant_type)
        raise

    LOG.info("All variants inserted.")
    LOG.info("Number of variants in file: {0}".format(nr_variants + 1))
    LOG.info("Number of variants inserted: {0}".format(nr_inserted))
    LOG.info("Time to insert variants:{0}".format(datetime.now() -
                                                  start_insertion))